LibreOffice plugin to pipe whole Writer documents through Google Translate, that ought to keep most of the page formatting.

⌈⌋ ⎇ branch:  PageTranslate


Artifact [2131a56f64]

Artifact 2131a56f6409b09118207e0196de42bb074cdc58:

  • File off/contentxmltrans.py — part of check-in [d4530cf627] at 2020-05-03 14:53:53 on branch trunk — Reenable cursor/selection mode (Tradutor trigger). Prepare binding to Draw documents. (user: mario size: 3778)

#!/usr/bin/python
# encoding: utf-8
# api: cli
# type: main
# category: transform
# title: PageTranslate
# description: rewrite content.xml from Draw document
# version: 0.6
# state: experimental
# author: mario
# license: GNU LGPL 2.1
# config: -
# 
# Rewrite text nodes from Draw content.xml document
#


# OpenOffice UNO bridge
# (nothing is imported for it in this CLI script; the bare `None` below is
#  a no-op expression statement standing in for the import)
None
# sys modules
import string
import json
import sys
from traceback import format_exc
# http preparations
import urllib
from urllib2 import urlopen, Request
from six.moves.urllib.parse import urlencode, quote, quote_plus
# Extra keyword arguments handed to urlopen(); left empty on Windows.
ssl_args = {}
if sys.platform != 'win32':
    import ssl
    # NOTE(review): hostname and certificate checks are switched off below,
    # so any HTTPS response is accepted unverified.
    myssl = ssl.create_default_context()
    # check_hostname is cleared first, then verification is disabled
    myssl.check_hostname = False
    myssl.verify_mode = ssl.CERT_NONE
    ssl_args.update(context=myssl)

# Headers sent with every translation request.
http_headers = {
    "User-Agent":
        "Mozilla/5.0 (X11; Linux; LibreOffice/6.3; TradutorLibreText/1.3+PageTranslate/0.5)",
}
# log file
import logging
import os
import tempfile
# Keep the historic /tmp location wherever it exists; a literal "/tmp/..."
# path is usually missing on Windows, where basicConfig() would then fail
# at import time, so fall back to the system temp directory there.
logging.basicConfig(
    filename=os.path.join(
        '/tmp' if os.path.isdir('/tmp') else tempfile.gettempdir(),
        'pagetranslate-libreoffice.log'
    ),
    level=logging.DEBUG
)
# regex
import re
# Opening <text:span ...> tag (group 1) followed by plain text (group 2:
# word characters, whitespace, commas, periods); the closing </text... tag
# is only looked ahead at, so it is left in place by re.sub().
rx_xmltrans = re.compile(r'(<text:span[^>]+>)([\w\s,.]+)(?=</text)', re.S|re.UNICODE)
# Translated text inside the first class="t0" <div> of the result page.
rx_gtrans = re.compile(r'class="t0">(.+?)</div>', re.S)
# Chunker for long texts, tried in this order: up to 1895 chars ending in a
# period, else up to 1900 chars ending in whitespace, else the remainder.
# (The quantifier brace used to be misplaced as `{1,1895\.}`, which turned
# the first alternative into a literal that never matched.)
# NOTE: findall() also yields one empty string at the very end of the input.
rx_splitpara = re.compile(r"(.{1,1895}\.|.{1,1900}\s|.*$)", re.S)
# Text consisting solely of whitespace, digits and punctuation (incl. "§").
rx_empty = re.compile(r"^[\s\d,.:;§():-]+$")
# At least two consecutive word characters.
rx_letters = re.compile(r"\w\w+", re.UNICODE)





# cli tool
class pagetranslate:
    """Translate the text spans of an extracted ``content.xml``.

    Instantiating the class does all the work: it reads ``content.xml``
    from the current directory, passes the text of every ``rx_xmltrans``
    match through the Google Translate mobile page and writes the result
    to ``content.en.xml``.
    """

    def __init__(self, ctx):
        """Rewrite ``content.xml`` into ``content.en.xml``.

        ctx -- not used by this constructor.

        Errors are not propagated; the traceback is printed instead.
        """
        # function-scope import so the file-level import block stays untouched
        import io
        try:
            # io.open decodes/encodes UTF-8 on Python 2 and 3 alike;
            # newline="" keeps the document's line endings as they are
            with io.open("content.xml", "r", encoding="utf-8", newline="") as src:
                xml = src.read()
            xml = re.sub(rx_xmltrans, self.repl, xml)
            with io.open("content.en.xml", "w", encoding="utf-8", newline="") as dst:
                dst.write(xml)
        except Exception:
            # format_exc() takes a depth limit, not the exception object
            print(format_exc())

    # re.sub callback
    def repl(self, parts):
        """Return the matched opening tag followed by its translated text.

        parts -- match object of rx_xmltrans; group(1) is the opening
                 <text:span> tag, group(2) the text to translate.
        """
        parts = [parts.group(1), self.translate(parts.group(2))]
        # progress/debug output of each replaced pair
        print(repr(parts))
        return "".join(parts)

    # decode the handful of HTML entities handled for the result page
    def _unescape(self, text):
        """Replace ``&#39; &lt; &gt; &quot; &amp;`` by their characters.

        ``&amp;`` is decoded last; decoding it first would turn an escaped
        entity such as ``&amp;lt;`` into ``<`` instead of the text ``&lt;``.
        """
        #@todo: https://stackoverflow.com/questions/2087370/decode-html-entities-in-python-string
        for entity, char in (
            ("&#39;", "'"), ("&lt;", "<"), ("&gt;", ">"), ("&quot;", '"'), ("&amp;", "&"),
        ):
            text = text.replace(entity, char)
        return text

    # request text translation from google
    def askgoogle(self, text, dst_lang="en", src_lang='auto'):
        """Fetch the translation of ``text`` from translate.google.com/m.

        text     -- text to translate
        dst_lang -- target language code (``hl`` URL parameter)
        src_lang -- source language code (``sl`` URL parameter)

        Returns the extracted translation, or ``text`` unchanged (after
        logging the response) when rx_gtrans finds nothing in the page.
        Errors raised by urlopen() are not caught here.
        """
        # fetch translation page
        url = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            dst_lang, src_lang, quote_plus(text.encode("utf-8"))
        )
        response = urlopen(Request(url, headers=http_headers), **ssl_args)
        try:
            html = response.read().decode('utf-8')
        finally:
            # release the connection even if reading/decoding fails
            response.close()
        # extract content from text <div>
        m = rx_gtrans.search(html)
        if m:
            return self._unescape(m.group(1))
        logging.warning("NO TRANSLATION RESULT EXTRACTED: " + html)
        logging.debug("ORIG TEXT: " + repr(text))
        return text

    # iterate over text segments (1900 char limit)
    def translate(self, text, lang="auto"):
        """Translate ``text``, skipping trivial input and chunking long input.

        text -- text to translate
        lang -- language code forwarded to askgoogle() as ``src_lang``
                (it was previously accepted but ignored)

        Text shorter than two characters, text matching rx_empty and text
        without an rx_letters match is returned untouched. Text of 1900+
        characters is split with rx_splitpara, translated piecewise and
        joined with single spaces.
        """
        #logging.debug("translate %d chars" % len(text))
        if len(text) < 2:
            logging.debug("skipping/len<2")
            return text
        elif rx_empty.match(text):
            logging.debug("skipping/empty")
            return text
        elif not rx_letters.search(text):
            logging.debug("skipping/noletters")
            return text
        elif len(text) >= 1900:
            logging.debug("spliterate/1900+")
            # findall() ends with an empty match; don't send empty requests
            return " ".join(
                self.askgoogle(segment, src_lang=lang)
                for segment in rx_splitpara.findall(text) if segment
            )
        else:
            return self.askgoogle(text, src_lang=lang)

    # translate w/ preserving paragraph breaks (meant for table cell content)
    def linebreakwise(self, text, lang="auto"):
        """Translate each blank-line separated paragraph of ``text`` on its own.

        The paragraphs are split on and re-joined with a double newline;
        ``lang`` is passed on to translate().
        """
        return "\n\n".join(self.translate(para, lang) for para in text.split("\n\n"))
        # alternatively, use a temp placeholder '/#/'


# Script entry point: constructing the class immediately runs the whole
# content.xml -> content.en.xml translation (its ctx argument is unused,
# hence None). Nothing is registered with LibreOffice here.
pagetranslate(None)