#!/usr/bin/python
# encoding: utf-8
# api: cli
# type: main
# category: transform
# title: PageTranslate
# description: rewrite content.xml from Draw document
# version: 0.6
# state: experimental
# author: mario
# license: GNU LGPL 2.1
# config: -
#
# Rewrite text nodes from Draw content.xml document
#
# OpenOffice UNO bridge: nothing is imported here; the bare None below is a
# no-op expression statement.
# NOTE(review): presumably a stand-in for the UNO imports of the LibreOffice
# extension variant of this script -- confirm.
None
# sys modules
import string
import json
import sys
from traceback import format_exc
# http preparations
import urllib
from urllib2 import urlopen, Request
from six.moves.urllib.parse import urlencode, quote, quote_plus
# Request headers sent along with every translation request.
http_headers = dict([
    ("User-Agent", "Mozilla/5.0 (X11; Linux; LibreOffice/6.3; TradutorLibreText/1.3+PageTranslate/0.5)"),
])
# Extra keyword arguments for urlopen(); stays empty on Windows.
ssl_args = {}
if not sys.platform == 'win32':
    import ssl
    # Start from the default context, then switch off host name checking and
    # certificate verification. Host name checking has to be disabled first,
    # otherwise assigning CERT_NONE is rejected.
    myssl = ssl.create_default_context()
    myssl.check_hostname = False
    myssl.verify_mode = ssl.CERT_NONE
    ssl_args.update(context=myssl)
# log file: configure the root logger to write everything from DEBUG level
# upwards into a fixed file.
# NOTE(review): the hard-coded /tmp path presumably fails on systems without
# a /tmp directory (e.g. Windows) -- confirm before running there.
import logging
logging.basicConfig(filename='/tmp/pagetranslate-libreoffice.log', level=logging.DEBUG)
# regex
import re
# opening <text:span ...> tag (group 1) followed by plain text made of word
# characters, whitespace, commas and periods (group 2), up to the closing tag
rx_xmltrans = re.compile(r'(<text:span[^>]+>)([\w\s,.]+)(?=</text)', re.S|re.UNICODE)
# translated text inside the result <div class="t0"> of the response page
rx_gtrans = re.compile(r'class="t0">(.+?)</div>', re.S)
# split long text into request-sized segments: preferably end a segment at a
# period, else at whitespace, else take the non-empty rest.  The period has to
# sit outside the {m,n} quantifier braces, otherwise the braces are matched
# literally and the first alternative can never apply.
rx_splitpara = re.compile(r"(.{1,1895}\.|.{1,1900}\s|.+$)", re.S)
# text consisting only of whitespace, digits and punctuation (incl. the section
# sign U+00A7); written as an escape so it cannot be damaged by re-encoding
rx_empty = re.compile(u"^[\\s\\d,.:;\u00a7():-]+$")
# at least two consecutive word characters
rx_letters = re.compile(r"\w\w+", re.UNICODE)
# cli tool
class pagetranslate:
    """
    Command-line translator for the text of a Draw document's content.xml.

    Instantiating the class runs the whole conversion: it reads
    "content.xml" from the current directory, sends the text of every
    span matched by rx_xmltrans through the Google Translate mobile page
    and writes the result to "content.en.xml".
    """

    def __init__(self, ctx):
        """
        Run the conversion.

        ctx is accepted but not used.  Any Exception raised while reading,
        translating or writing is caught and its traceback printed.
        """
        try:
            # binary mode plus explicit decode/encode works the same on
            # Python 2 and 3; `with` makes sure both files get closed
            with open("content.xml", "rb") as src:
                xml = src.read().decode("utf-8")
            xml = rx_xmltrans.sub(self.repl, xml)
            with open("content.en.xml", "wb") as dst:
                dst.write(xml.encode("utf-8"))
        except Exception:
            # format_exc() picks up the active exception on its own; its
            # positional parameter is a frame limit, not the exception
            print(format_exc())

    # re.sub callback
    def repl(self, parts):
        """
        Substitution callback for rx_xmltrans.

        Keeps the opening tag (group 1) unchanged and replaces the text
        (group 2) with its translation; returns both joined together.
        """
        parts = [parts.group(1), self.translate(parts.group(2))]
        # progress/debug output on stdout
        print(repr(parts))
        return "".join(parts)

    # decode HTML entities in scraped text
    @staticmethod
    def _unescape(text):
        """Return text with named and numeric HTML character references decoded."""
        try:
            from html import unescape  # Python 3.4+
        except ImportError:
            from HTMLParser import HTMLParser  # Python 2
            unescape = HTMLParser().unescape
        return unescape(text)

    # request text translation from google
    def askgoogle(self, text, dst_lang="en", src_lang='auto'):
        """
        Fetch the translation of text from the Google Translate mobile page.

        dst_lang is sent as the `hl` and src_lang as the `sl` query
        parameter.  Returns the extracted translation with HTML entities
        decoded; if no result can be extracted from the response, a
        warning is logged and the input text is returned unchanged.
        Errors raised by urlopen() are not handled here.
        """
        # fetch translation page
        url = "http://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            dst_lang, src_lang, quote_plus(text.encode("utf-8"))
        )
        html = urlopen(
            Request(url, headers=http_headers), **ssl_args
        ).read().decode('utf-8')
        # extract content from text <div>
        m = rx_gtrans.search(html)
        if m:
            text = self._unescape(m.group(1))
        else:
            logging.warning("NO TRANSLATION RESULT EXTRACTED: " + html)
            logging.debug("ORIG TEXT: " + repr(text))
        return text

    # iterate over text segments (1900 char limit)
    def translate(self, text, lang="auto"):
        """
        Translate text, skipping input that needs no translation.

        Text shorter than two characters, text matching rx_empty and text
        without a match for rx_letters is returned as is.  Text of 1900
        characters or more is split with rx_splitpara, translated segment
        by segment and re-joined with single spaces.

        NOTE(review): lang is accepted but not passed on to askgoogle();
        whether it was meant as source or target language is unclear.
        """
        #logging.debug("translate %d chars" % len(text))
        if len(text) < 2:
            logging.debug("skipping/len<2")
            return text
        if rx_empty.match(text):
            logging.debug("skipping/empty")
            return text
        if not rx_letters.search(text):
            logging.debug("skipping/noletters")
            return text
        if len(text) >= 1900:
            logging.debug("spliterate/1900+")
            # drop empty segments so no request is sent for nothing
            return " ".join(
                self.askgoogle(segment)
                for segment in rx_splitpara.findall(text)
                if segment
            )
        return self.askgoogle(text)

    # translate w/ preserving paragraph breaks (meant for table cell content)
    def linebreakwise(self, text, lang="auto"):
        """
        Translate each block separated by a blank line on its own and
        re-join the results with blank lines.  lang is accepted but unused.
        """
        return "\n\n".join(self.translate(para) for para in text.split("\n\n"))
        # alternatively, use a temp placeholder '/#/'
# run the conversion as soon as the file is executed or imported; the ctx
# argument is not used by the class, hence None
pagetranslate(None)