# encoding: utf-8
# api: pagetranslate
# type: classes
# category: language
# title: via_* translation backends
# description: Implements the alternative services (google, deepl, ...)
# version: 1.3
# state: beta
# depends: python:requests (>= 2.5)
# config: -
#
# Different online service backends and http interfaces are now coalesced here.
#
# modules
import html
import json
import logging
import os
import re
import shlex
import subprocess
import time
import urllib
from urllib.parse import urlencode, quote, quote_plus

from httprequests import http
# Module-level logger shared by all backends in this file.
# NOTE(review): this used to be a `None` placeholder, presumably replaced by
# the main module - confirm. A real logger as default keeps `log.debug(...)`
# from failing when nothing has been assigned; it can still be overwritten.
log = logging.getLogger(__name__)
# regex (raw strings, so backslash escapes reach `re` unchanged)
# · translated text inside the result <div> of the Google mobile page
rx_gtrans = re.compile(r'class="t0">(.+?)</div>', re.S)
# · text segments of at most ~1900 chars: preferably ending on a period,
#   else on whitespace, else whatever remains (may yield an empty last match)
rx_splitpara = re.compile(r"(.{1,1895}\.|.{1,1900}\s|.*$)", re.S)
# · snippets made up solely of whitespace, digits and punctuation
rx_empty = re.compile(r"^[\s\d,.:;§():-]+$")
# · at least two consecutive word characters
rx_letters = re.compile(r"\w\w+", re.UNICODE)
# · the `/#§/` paragraph placeholder, tolerating inserted spaces
rx_breakln = re.compile(r"\s?/\s?#\s?§\s?/\s?")
# Google Translate (default backend)
#
# · calls mobile page http://translate.google.com/m?hl=en&sl=auto&q=TRANSLATE
# · iterates over each 1900 characters
#
class google:
    """Google Translate backend, scraping the mobile translate page.

    Other backend classes in this module derive from it and reuse its
    constructor and/or linebreakwise().
    """

    def __init__(self, params=None):
        """Keep the settings mapping.

        params: config+argparse dict; this class reads "lang" (default
                target language) and "quick" from it.
        """
        # a fresh dict per instance rather than one shared mutable default
        self.params = {} if params is None else params

    # request text translation from google
    def askgoogle(self, text, dst_lang="en", src_lang='auto'):
        """Translate one text segment through the mobile page.

        Returns the extracted translation with HTML entities decoded, or
        the unchanged input text when no result could be extracted.
        """
        # fetch translation page
        url = "https://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            dst_lang, src_lang, quote_plus(text)
        )
        page = http.get(url).content.decode("utf-8")
        # extract content from text <div>
        found = rx_gtrans.search(page)
        if not found:
            log.warning("NO TRANSLATION RESULT EXTRACTED: " + page)
            log.debug("ORIG TEXT: " + repr(text))
            return text
        # decodes all named and numeric entities (&#39; &amp; &lt; &quot; ...)
        return html.unescape(found.group(1))

    # iterate over text segments (1900 char limit)
    def translate(self, text, lang="auto"):
        """Translate text, splitting it into segments if it is too long.

        lang: target language; "auto" means params["lang"].
        Snippets without translatable content are returned unchanged.
        """
        if lang == "auto":
            lang = self.params["lang"]
        #log.debug("translate %d chars" % len(text))
        if len(text) < 2:
            log.debug("skipping/len<2")
            return text
        if rx_empty.match(text):
            log.debug("skipping/empty")
            return text
        if not rx_letters.search(text):
            log.debug("skipping/noletters")
            return text
        if len(text) >= 1900:
            log.debug("spliterate/1900+")
            # the split pattern can also produce an empty/blank last match;
            # such segments are skipped, they would only cost a request
            return " ".join(
                self.askgoogle(segment, lang)
                for segment in rx_splitpara.findall(text)
                if segment.strip()
            )
        return self.askgoogle(text, lang)

    # translate w/ preserving paragraph breaks (meant for table cell content)
    def linebreakwise(self, text, lang="auto"):
        """Translate text while keeping its blank-line paragraph breaks.

        With params["quick"] set, each paragraph is translated on its own;
        otherwise the breaks are replaced by a placeholder for one combined
        translation and restored afterwards.
        """
        if self.params.get("quick"):
            # split on linebreaks and translate each individually
            return "\n\n".join(
                self._translate_part(part, lang) for part in text.split("\n\n")
            )
        # use temporary placeholder `/#§/`
        marked = self._translate_part(text.replace("\n\n", "/#§/"), lang)
        return rx_breakln.sub("\n\n", marked)

    def _translate_part(self, text, lang):
        """Call translate(), forwarding lang only when given explicitly."""
        # subclasses in this module define translate(self, text) without a
        # lang parameter; the default path must keep working for them
        if lang == "auto":
            return self.translate(text)
        return self.translate(text, lang)
# DeepL online translator uses some kind of json-rpc
#
# · haven't quite extracted all necessary bits (origin of id unclear)
# · will yield HTTP 429 Too many requests,
# so probably not useful for multi-paragraph translation anyway
#
#
class deepl_web(google):
    """DeepL web translator backend, talking to the site's JSON-RPC endpoint."""
    # < https://www2.deepl.com/jsonrpc
    # cookies: LMTBID: GUID...
    # referer: https://www.deepl.com/translator
    # body:
    # > result.translations[0].beams[0].postprocessed_sentence

    def __init__(self, params):
        """Store settings and derive the upper-cased target language.

        params: config dict; "lang" is required.
        """
        self.params = params
        self.id = 702005000  # seed for the JSON-RPC request id
        self.lang = params["lang"].upper()
        http.get("https://www.deepl.com/translator") # should fetch us the cookie / No, it doesn't

    def rpc(self, text, lang=None):
        """Build the JSON-RPC request body (a JSON string) for one text.

        lang: optional target language overriding the configured one.
        """
        target = lang or self.lang
        return json.dumps({
            "jsonrpc" : "2.0",
            "method" : "LMT_handle_jobs",
            "id" : self.id,
            "params" : {
                "lang" : {
                    "target_lang" : target,
                    "user_preferred_langs" : [
                        target,
                        "EN"
                    ],
                    "source_lang_user_selected" : "auto"
                },
                "timestamp" : int(time.time()*1000),
                "priority" : -1,
                "commonJobParams" : {},
                "jobs" : [
                    {
                        "raw_en_context_after" : [],
                        "raw_en_context_before" : [],
                        "kind" : "default",
                        "preferred_num_beams" : 4,
                        "raw_en_sentence" : text,
                        "quality" : "fast"
                    }
                ]
            }
        })

    def translate(self, text, lang="auto"):
        """Translate text; returns the input unchanged on any failure.

        lang: target language; "auto" uses the one configured in params.
              Accepted so the inherited linebreakwise() can pass it along.
        """
        # skip empty paragraph/table snippets
        if len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text):
            return text
        target = None if lang == "auto" else lang.upper()
        # request
        response = http.post(
            "https://www2.deepl.com/jsonrpc",
            data=self.rpc(text, target),
            headers={"Referer": "https://www.deepl.com/translator", "Content-Type": "text/plain"}
        )
        if response.status_code != 200:
            log.error(repr(response.content))
            return text
        # decode
        try:
            reply = response.json()
        except ValueError:
            # body was not JSON
            log.error(repr(response.content))
            return text
        if not isinstance(reply, dict):
            return text
        if reply.get("id"):
            # continue the id sequence from whatever the server echoed back
            self.id = reply["id"] + 1
        try:
            return reply["result"]["translations"][0]["beams"][0]["postprocessed_sentence"]
        except (KeyError, IndexError, TypeError):
            # no result, or not in the expected shape
            return text
# DeepL API costs money
#
# Not sure if anyone will use this really. Unless the _web version allows testing,
# nobody's gonna shell out money for a subscription - even if it surpassed GoogleT.
# Likely makes sense for commercial users however. And the API is quite simple, so
# that's why it's here.
#
# ENTIRELY UNTESTED
#
class deepl_api(deepl_web):
    """DeepL REST API backend; needs the account key in params["api_key"]."""

    def __init__(self, params):
        """Store settings only; the parent constructors are not invoked."""
        self.params = params

    def translate(self, text, preserve=0, lang="auto"):
        """Translate text through the /v2/translate endpoint.

        preserve: value sent as `preserve_formatting` (0 or 1).
        lang:     target language; "auto" uses params["lang"].
        Returns the input unchanged when the request fails or yields nothing.
        """
        # skip empty paragraph/table snippets
        if len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text):
            return text
        target = self.params["lang"] if lang == "auto" else lang
        # https://www.deepl.com/docs-api/translating-text/request/
        # POST with a form body: keeps long texts and the key out of the URL
        response = http.post(
            "https://api.deepl.com/v2/translate",
            headers={"Authorization": "DeepL-Auth-Key " + self.params["api_key"]},
            data={
                "text": text,
                "target_lang": target.upper(),
                "split_sentences": "1",
                "preserve_formatting": str(preserve)
                #"tag_handling": "xml"
            }
        )
        if response.status_code != 200:
            log.error(response.text)
            return text
        try:
            translations = response.json().get("translations")
        except (ValueError, AttributeError):
            # body was not JSON, or not a JSON object
            log.error(response.text)
            return text
        if translations:
            return translations[0]["text"]
        return text

    def linebreakwise(self, text, lang="auto"):
        """Translate with formatting preservation requested from the API."""
        return self.translate(text, preserve=1, lang=lang)
# Translate-python
# requires `pip install translate`
#
# · provides "microsoft" backend (requires OAuth secret in api_key)
#
# · or "mymemory" (with email in `api_key` instead)
#
# https://translate-python.readthedocs.io/en/latest/
#
class translate_python(google):
    """Backend wrapping the `translate` package (translate-python).

    Uses the "mymemory" provider when params["mymemory"] is set (with the
    e-mail address taken from params["api_key"]), else the "microsoft"
    provider (with the secret taken from params["api_key"]).
    """

    def __init__(self, params=None):
        """Create the provider-specific Translator.

        Raises ImportError when the `translate` package is missing.
        """
        # a fresh dict per instance rather than one shared mutable default
        self.params = {} if params is None else params  # config+argparse
        params = self.params
        #self.error = pagetranslate.MessageBox
        try:
            from translate import Translator
        except ImportError as err:
            raise ImportError("Run `pip install translate` to use this module.") from err
        # NOTE(review): no from_lang is passed, so the library's own default
        # source language applies - confirm that is intended.
        if params.get("mymemory"):
            self._translator = Translator(
                provider="mymemory", to_lang=params["lang"], email=params["api_key"]
            )
        else:
            self._translator = Translator(
                provider="microsoft", to_lang=params["lang"], secret_access_key=params["api_key"]
            )

    def translate(self, text, lang="auto"):
        """Translate text via the Translator's translate() method.

        lang is accepted for signature parity with the other backends but
        not used: the target language is fixed when the Translator is built.
        """
        # skip empty paragraph/table snippets
        if len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text):
            return text
        # the Translator object itself is not the callable - its method is
        return self._translator.translate(text)

    # .linebreakwise has no equivalent here; not sure if it is necessary,
    # or if formatting/linebreaks are preserved anyway
    # (or: we might just use the default google. implementation)
    def linebreakwise(self, text, lang="auto"):
        """Same as translate(); no separate paragraph handling."""
        return self.translate(text, lang)
# Because, why not?
# Invokes a commandline tool for translating texts.
#
# → with e.g. `translate-cli -t {}` in "api_key"
#
class cli(google):
    """Backend that pipes each text through an external command.

    The command template is read from params["api_key"]; every `{}` in it
    is replaced by the text to translate.
    """

    def __init__(self, params):
        """Store settings and the command template."""
        self.params = params
        self.cmd = params["api_key"]

    # pipe text through external program
    def translate(self, text, lang="auto"):
        """Run the command for text and return what it printed.

        lang is accepted so the inherited linebreakwise() can pass it, but
        it is not used. The input is returned unchanged when the command
        cannot be started or exits with a non-zero status.
        """
        if rx_empty.match(text) or not rx_letters.search(text):
            return text
        # template is split first, so the text always stays a single argument
        cmd = [arg.format(text) for arg in shlex.split(self.cmd)]
        try:
            proc = subprocess.run(cmd, stdout=subprocess.PIPE)
        except OSError as err:
            log.error("cli: cannot run %r: %s" % (cmd, err))
            return text
        if proc.returncode != 0:
            # keep the original rather than whatever partial output exists
            log.error("cli: %r exited with status %d" % (cmd, proc.returncode))
            return text
        # drop the line end that command-line tools append to their output
        return proc.stdout.decode("utf-8").rstrip("\r\n")
# maps a pagetranslate.t.* object (in main module),
# according to config dict {"goog":1, "deepl":0}
def assign_service(params):
    """Instantiate the backend whose flag is set in *params*.

    Flags are checked in a fixed order and the first truthy one wins;
    without any of them the google backend is returned.
    """
    # (config flags, backend class) in order of precedence
    lookup = (
        (("deepl_web",), deepl_web),
        (("deepl_api",), deepl_api),
        (("translate_python", "microsoft", "mymemory"), translate_python),
        (("cli",), cli),
    )
    for flags, backend in lookup:
        if any(params.get(flag) for flag in flags):
            return backend(params)
    return google(params)