LibreOffice plugin that pipes whole Writer documents through Google Translate and is meant to keep most of the page formatting.

⌈⌋ branch:  PageTranslate


Artifact Content

Artifact 92428ac95a8e92fb2e7e113e0f4bcdcde2e0f4d2:

  • File pagetranslate.py — part of check-in [748623aaf6] at 2020-05-23 14:05:33 on branch trunk — Introduce pt_opts handler for settings window (user: mario size: 13832)

#!/usr/bin/python
# encoding: utf-8
# api: uno
# type: callback
# category: language
# title: PageTranslate
# description: Action button to get whole Writer document translated
# version: 1.0.65
# state: beta
# author: mario
# url: https://fossil.include-once.org/pagetranslate/
# depends: python:requests (>= 2.5)
# pack: *.py, META-INF/*, pkg-desc, *.x*, icons/*
# license: GNU LGPL 2.1
# forked-from: TradutorLibreText (Claudemir de Almeida Rosa)
# config: -
# 
# LibreOffice plugin for translating documents that's supposed to retain formatting.
# Per default does not require a text selection to operate, but works on the whole
# page.
# The original mode (TradutorLibreText) is still supported and used whenever a text
# portion is selected. It also uses the default target language (English) then.
# Unless a different mode/language from the Tools>PageTranslate menu is requested.
#
# Beware that Writer freezes during the dozens of translation calls to Google.
# Long documents in particular might take ages, because each paragraph/line or
# text longer than 1900 chars causes another roundtrip.
#
# Basic support for Draw/Impress documents is now provided. (No text selection
# mode there however).
#
# Always creates a log file: /tmp/pagetranslate-libreoffice.log
#


# OpenOffice UNO bridge
import uno, unohelper
from com.sun.star.task import XJobExecutor
from com.sun.star.awt.MessageBoxButtons import BUTTONS_OK, BUTTONS_OK_CANCEL, BUTTONS_YES_NO, BUTTONS_YES_NO_CANCEL, BUTTONS_RETRY_CANCEL, BUTTONS_ABORT_IGNORE_RETRY
from com.sun.star.awt.MessageBoxButtons import DEFAULT_BUTTON_OK, DEFAULT_BUTTON_CANCEL, DEFAULT_BUTTON_RETRY, DEFAULT_BUTTON_YES, DEFAULT_BUTTON_NO, DEFAULT_BUTTON_IGNORE
from com.sun.star.awt.MessageBoxType import MESSAGEBOX, INFOBOX, WARNINGBOX, ERRORBOX, QUERYBOX
from com.sun.star.beans import PropertyValue
from com.sun.star.awt import XActionListener, XContainerWindowEventHandler
from com.sun.star.lang import Locale, XServiceInfo, XInitialization
# sys modules
import string
import json
import sys
from traceback import format_exc
from tempfile import gettempdir
import re
# log file
# Everything logged through `log` goes to <tempdir>/pagetranslate-libreoffice.log
# at DEBUG level and above.
import logging as log
log.basicConfig(filename='%s/pagetranslate-libreoffice.log'%gettempdir(), level=log.DEBUG)
# http preparations
# Request headers shared by both HTTP code paths below (requests session
# and the urllib fallback).
http_headers = {
    "User-Agent": "Mozilla/5.0 (X11; Linux; LibreOffice/6.3), TradutorLibreText/1.3+PageTranslate/1.1",
    "Accept-Language": "*; q=1.0",
    "Accept-Encoding": "utf-8"
}
import urllib
from urllib.parse import urlencode, quote, quote_plus
# `requests` is optional: when it can be imported, one shared Session `http`
# carries the headers above. On ANY exception in this block (not only a
# missing module) `requests` is set to None and urllib's urlopen/Request are
# used instead; code further down tests `if requests:` to pick the path.
try:
    import requests
    log.info(requests.__file__)
    http = requests.Session()
    http.headers.update(http_headers)
except Exception as e:
    log.error("pythonpath/requests/ not found")
    requests = None
    from urllib.request import urlopen, Request
    # extra keyword arguments passed to urlopen() in the fallback path
    ssl_args = dict()
    if sys.platform != 'win32':
        import ssl
        # NOTE(review): hostname checking and certificate verification are
        # switched off for the fallback path on non-Windows platforms, so
        # HTTPS responses are not authenticated there - confirm this is intended.
        myssl = ssl.create_default_context();
        myssl.check_hostname = False
        myssl.verify_mode = ssl.CERT_NONE
        ssl_args["context"] = myssl



# translation backend/service
class via_googletranslate:
    """
    Translation backend that fetches the mobile Google Translate page and
    extracts the result text from its HTML.

    `params` is the settings dict of the plugin. This class reads
    `params["lang"]` (default target language) and `params["crlf"]`
    ("quick" = placeholder mode, anything else = translate each paragraph
    on its own).
    """

    # result text within the returned page
    rx_gtrans = re.compile(r'class="t0">(.+?)</div>', re.S)
    # chunks for long texts; alternatives are tried in this order:
    #  1. up to 1895 chars followed by a period (sentence boundary)
    #  2. up to 1900 chars followed by whitespace (word boundary)
    #  3. a hard cut after at most 1900 chars (also picks up the remainder)
    # Every character ends up in exactly one chunk, and no chunk is empty.
    rx_splitpara = re.compile(r"(.{1,1895}\.|.{1,1900}\s|.{1,1900})", re.S)
    # text consisting only of whitespace, digits and punctuation
    rx_empty = re.compile(r"^[\s\d,.:;§():-]+$")
    # at least two consecutive word characters
    rx_letters = re.compile(r"\w\w+", re.UNICODE)
    # placeholder as it comes back from the service (possibly with spaces injected)
    rx_breakln = re.compile(r"\s?/\s?#\s?§\s?/\s?")
    # temporary stand-in for "\n\n"; must be matched by rx_breakln
    breakln_placeholder = "/#§/"

    def __init__(self, params=None):
        # Keep the caller's dict itself (no copy), so later changes to the
        # plugin settings are visible here. Only the fallback is per-instance.
        self.params = {} if params is None else params  # config+argparse

    # request text translation from google
    def askgoogle(self, text, dst_lang="en", src_lang='auto'):
        """
        Send one text snippet to the translation page.

        Returns the translated text with HTML entities decoded, or the
        unmodified input `text` if no result could be found in the response.
        Network errors from the HTTP call are not caught here.
        """
        from html import unescape
        # fetch translation page
        # NOTE(review): plain http:// URL - consider https.
        url = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            dst_lang, src_lang, urllib.parse.quote_plus(text)
        )
        if requests:
            page = http.get(url).content.decode("utf-8")
        else:
            page = urlopen(
                Request(url, headers=http_headers), **ssl_args
            ).read().decode('utf-8')
        # extract content from text <div>
        m = self.rx_gtrans.search(page)
        if not m:
            log.warning("NO TRANSLATION RESULT EXTRACTED: " + page)
            log.debug("ORIG TEXT: " + repr(text))
            return text
        # single-pass decoding; chained .replace() calls would turn
        # `&amp;lt;` into `<` instead of `&lt;`
        return unescape(m.group(1))

    # iterate over text segments (1900 char limit)
    def translate(self, text, lang="auto"):
        """
        Translate `text` into `lang` ("auto" = the configured params["lang"],
        or "en" if none is configured).

        Texts that are too short, or contain no letters, are returned as is
        without a request. Texts of 1900+ chars are split with rx_splitpara
        and the translated chunks are joined with a space.
        """
        if lang == "auto":
            lang = self.params.get("lang", "en")
        #log.debug("translate %d chars" % len(text))
        if len(text) < 2:
            log.debug("skipping/len<2")
            return text
        if self.rx_empty.match(text):
            log.debug("skipping/empty")
            return text
        if not self.rx_letters.search(text):
            log.debug("skipping/noletters")
            return text
        if len(text) >= 1900:
            log.debug("spliterate/1900+")
            # whitespace-only chunks would just waste a roundtrip
            segments = [s for s in self.rx_splitpara.findall(text) if s.strip()]
            return " ".join(self.askgoogle(segment, lang) for segment in segments)
        return self.askgoogle(text, lang)

    # translate w/ preserving paragraph breaks (meant for table cell content)
    def linebreakwise(self, text, lang="auto"):
        """
        Translate `text` while keeping its "\\n\\n" paragraph breaks.

        With params["crlf"] == "quick" the breaks are swapped for a temporary
        placeholder, so the whole text needs one request. Otherwise each
        paragraph is translated separately.
        """
        if self.params.get("crlf") != "quick":
            # split on linebreaks and translate each individually
            return "\n\n".join(self.translate(para, lang) for para in text.split("\n\n"))
        # use temporary placeholder `/#§/`, and turn it back into breaks afterwards
        text = self.translate(text.replace("\n\n", self.breakln_placeholder), lang)
        return self.rx_breakln.sub("\n\n", text)

class via_deepl_web(via_googletranslate):
    """Empty subclass: adds nothing yet, so it behaves exactly like via_googletranslate."""

class via_deepl_api(via_deepl_web):
    """Empty subclass: adds nothing yet, so it behaves exactly like via_deepl_web."""


# Office plugin
class pagetranslate(unohelper.Base, XJobExecutor):
    """
    Job component behind the toolbar button / menu entries.

    `trigger()` parses its argument string into `self.params`, then translates
    either all shapes of a Draw/Impress document, the current text selection,
    or every paragraph and table cell of the text document.
    """

    # defaults + config + command args
    params = dict(
        mode = "page",      # "trigger"/"page", or "tradutor"
        lang = "en",        # target language, or "paragraph", or "locale"
        crlf = "iterate",   # split paragraph sentences? or "quick" for temporary placeholder
        log = "debug",      # logging level
        google = 1,         # backend to use
        deepl_web = 0,
        deepl_api = 0,
        deepl_key = "",
    )
    t = None #via_googletranslate(self.params)
    language = None  # LocaleData service, created on first use

    # gets instantiated as XJobExecutor by LibreOffice
    def __init__(self, ctx):
        """Keep the component context and look up the current document."""
        log.info("init")
        self.ctx = ctx
        # Per-instance copy: argparse() writes into self.params, and must not
        # alter the class-level defaults shared by all instances.
        self.params = dict(self.params)
        desktop = self.ctx.ServiceManager.createInstanceWithContext( "com.sun.star.frame.Desktop", self.ctx )
        self.document = desktop.getCurrentComponent()
        #self.dispatcher = self.ctx.ServiceManager.createInstanceWithContext("com.sun.star.frame.DispatchHelper", self.ctx)


    # invoked from toolbar button
    def trigger(self, args):
        """
        Run a translation according to the `args` string (see argparse).

        Any exception during the translation is logged with its traceback and
        shown in a message box, rather than propagated.
        """
        log.debug(".trigger(args=%s) invoked" % repr(args))
        self.argparse(args)
        self.t = via_googletranslate(self.params)
        try:
            log.debug(dir(self.document))
            # Draw/Impress?
            if self.document.supportsService("com.sun.star.drawing.DrawingDocument") or self.document.supportsService("com.sun.star.presentation.PresentationDocument"):
                log.info(self.document)
                self.drawtranslate(self.document.getDrawPages())
                return
            # check for text selection, and switch to TradutorLibreText method then
            selection = self.document.getCurrentController().getSelection().getByIndex(0)
            if selection.getString():
                return self.rewrite_selection(selection)
            # else iterate over text snippets
            tree = self.document.getText().createEnumeration()
            self.traverse(tree)
        except Exception:
            trace = format_exc()
            log.error(trace)
            try:
                self.MessageBox(trace)
            except Exception:
                # the dialog itself may fail (e.g. no usable document window)
                log.error(format_exc())
        log.info("----")


    # break up UNO service: url query string `.pagetranslate?page&lang=en`
    def argparse(self, args):
        """
        Merge the `args` string into self.params.

        A leading word (up to the first `&` or the end) becomes
        params["mode"]; if there is none, the current mode is kept. Each
        `key=value` pair overrides the setting of that name. A "locale"
        language is replaced by the result of getOoLocale().
        """
        # leading ?action&
        mode = re.match(r"(\w*)(?=&|$)", args)
        if mode and mode.group(1):
            self.params["mode"] = mode.group(1)
        # key=value pairs
        for key, value in re.findall(r"(\w+)=([\w-]+)", args):
            self.params[key] = value
        # replace default locale
        if self.params.get("lang","-") == "locale":
            self.params["lang"] = self.getOoLocale()
        # log
        #log.basicConfig(level=log.__dict__[params["log"].upper()])
        log.info(repr(self.params))


    #-- iterate over TextContent/TextTable nodes
    def traverse(self, tree):
        """
        Translate every element of the enumeration `tree` in place.

        Table cells go through linebreakwise(), other text content through
        translate(). Elements of neither kind are logged and left untouched.
        """
        log.info("TextDocument.Enumeration…")
        while tree.hasMoreElements():
            para = tree.nextElement()
            log.info(para)
            # table/cells
            if para.supportsService("com.sun.star.text.TextTable"):
                for cellname in para.getCellNames():
                    log.debug(cellname)
                    text = para.getCellByName(cellname).getText()
                    #self.traverse(text.createEnumeration())
                    text.setString(self.t.linebreakwise(text.getString())) # or .translate #linebreakwise
            # normal flow text
            elif para.supportsService("com.sun.star.text.TextContent"):
                para.setString(self.t.translate(para.getString()))
                # the paragraph itself can be enumerated for text portions,
                # but for now it's really slow enough
            else:
                log.warning("Unsupported document element.")

    #-- iterate over DrawPages and TextShapes
    def drawtranslate(self, pages):
        """Translate the text of each TextShape on each page of `pages` in place."""
        for pi in range(pages.getCount()):
            page = pages.getByIndex(pi)
            for si in range(page.getCount()):
                shape = page.getByIndex(si)
                if shape.supportsService("com.sun.star.drawing.TextShape"):
                    log.info(shape)
                    shape.Text.setString(self.t.translate(shape.Text.getString()))


    #-- TradutorLibreText (selection rewrite)
    def rewrite_selection(self, xTextRange):
        """
        Replace the selected text range with its translation.

        For params["lang"] == "paragraph" the language is taken from the
        range's CharLocale; for params["mode"] == "tradutor" from
        getParaLang(). Translation errors are logged and shown in a message
        box.
        """
        log.info("rewrite text selection")

        # Get selected text
        selected = xTextRange.getString()
        if self.params["lang"] == "paragraph":
            self.params["lang"] = xTextRange.CharLocale.Language
        elif self.params["mode"] == "tradutor":
            self.params["lang"] = self.getParaLang(xTextRange).Language

        try:
            # pass the language explicitly, rather than relying on the
            # backend holding a reference to this very params dict
            trans = self.t.linebreakwise(selected, self.params["lang"])
            # literal backslash sequences in the result become real line breaks
            trans = trans.replace('\\n',"\n").replace('\\r',"\n")
            xTextRange.setString(trans)

        except Exception as e:
            log.error(format_exc())
            try:
                self.MessageBox(str(e))
            except Exception as e:
                log.info(e)

    # LocaleData service, shared by getOoLocale and getParaLang
    def _locale_data(self):
        """Create the LocaleData service once and keep it in self.language."""
        if self.language is None:
            self.language = self.ctx.ServiceManager.createInstanceWithContext("com.sun.star.i18n.LocaleData", self.ctx)
        return self.language

    # Query system locale
    def getOoLocale(self):
        """Return the `ooLocale` value from the /org.openoffice.Setup/L10N configuration node."""
        self._locale_data()
        self.lang = self.ctx.ServiceManager.createInstanceWithContext("com.sun.star.configuration.ConfigurationProvider", self.ctx)
        arg = PropertyValue()
        arg.Name = "nodepath"
        arg.Value = "/org.openoffice.Setup/L10N"
        properties = (arg,)
        code = self.lang.createInstanceWithArguments("com.sun.star.configuration.ConfigurationAccess", properties).getByName("ooLocale")
        log.info("ooLocale="+repr(code))
        return code

    # Langinfo=(com.sun.star.i18n.LanguageCountryInfo){ Language = (string)"de", LanguageDefaultName = (string)"German", Country = (string)"DE", CountryDefaultName = (string)"Germany", Variant = (string)"" }
    def getParaLang(self, xTextRange):
        """
        Return the LanguageCountryInfo for the CharLocale of `xTextRange`.

        Works on its own; getOoLocale() does not need to be called first.
        """
        Langinfo = self._locale_data().getLanguageCountryInfo(xTextRange.CharLocale)
        log.info("Langinfo="+repr(Langinfo))
        return Langinfo

    # user notifications
    def MessageBox(self,MsgText, MsgTitle="", MsgType=MESSAGEBOX, MsgButtons=BUTTONS_OK):
        """
        Show a modal message box on top of the document window.

        Returns the result of the dialog's execute() call.
        """
        ParentWin = self.document.getCurrentController().Frame.ContainerWindow
        ctx = uno.getComponentContext()
        sm = ctx.ServiceManager
        sv = sm.createInstanceWithContext("com.sun.star.awt.Toolkit", ctx)
        myBox = sv.createMessageBox(ParentWin, MsgType, MsgButtons, MsgTitle, MsgText)
        return myBox.execute()



# @src https://forum.openoffice.org/en/forum/viewtopic.php?f=20&t=96509 → the __init__(,args) parameter was invalid
class pt_opts(unohelper.Base, XContainerWindowEventHandler, XServiceInfo):
    """
    Event handler for the settings window.

    For now it only logs the events it receives and reports them as handled.
    """

    # Identifier used as implementation name and as the only service name.
    # Previously tried values:
    #   "org.libreoffice.comp.pyuno.pagetranslate_options"
    #   "org.openoffice.comp.pyuno.pagetranslate_options"
    #   "service:org.openoffice.comp.pyuno.pagetranslate_options"
    sid = "vnd.include-once.OptionsPageTranslate"

    def __init__(self, ctx, *kargs):
        """Remember the component context; extra positional arguments are ignored."""
        log.info("pt_opts.init")
        self.ctx = ctx

    # XContainerWindowEventHandler
    def getSupportedMethodNames(self):
        """Names of the handler methods this class announces."""
        return ("external_event",)

    # XServiceInfo
    def supportsService(self, name):
        """True only if `name` equals the sid."""
        return name == self.sid

    def getImplementationName(self):
        """The sid doubles as implementation name."""
        return self.sid

    def getSupportedServiceNames(self):
        """One-element tuple holding the sid."""
        return (self.sid,)

    def getServiceNames(self):
        """Same tuple as getSupportedServiceNames()."""
        return (self.sid,)

    def callHandlerMethod(self, window, ev, name):
        """Log all three arguments, then report the event as handled."""
        for detail in (window, ev, name):
            log.info(detail)
        return True
   


# register with LibreOffice
# NOTE(review): presumably the pyuno loader looks up the module-level name
# `g_ImplementationHelper` to find the components of this file - confirm
# before renaming or restructuring.
g_ImplementationHelper = unohelper.ImplementationHelper()
# the translation job itself, announced as a com.sun.star.task.Job service
g_ImplementationHelper.addImplementation(
    pagetranslate,
    "org.openoffice.comp.pyuno.pagetranslate",
    ("com.sun.star.task.Job",),
)
# the settings window handler, registered under its own sid with an empty
# tuple of service names
g_ImplementationHelper.addImplementation( pt_opts, pt_opts.sid, () )