# encoding: utf-8
# api: pagetranslate
# type: classes
# category: language
# title: via_* translation backends
# description: hooks up the translation services (google, mymemory, deepl, ...)
# version: 1.7
# state: beta
# depends: python:requests (>= 2.5), python:langdetect, python:translate
# config: -
#
# Different online service backends and http interfaces are now coalesced here.
# Each class handles sentence/blockwise transfer to one of the online services,
# to get text snippets transformed.
#
# modules
import re, json, time, uuid, html
import os, subprocess, shlex
from random import randrange as rand
from httprequests import http, urlencode, quote, quote_plus
import logging as log
from traceback import format_exc
# regex
# extracts the translated snippet from the result <div> of Google's mobile page
rx_gtrans = re.compile('class="(?:t0|result-container)">(.+?)</div>', re.S)
# chunk text into <=1900 chars, preferring sentence (".") or whitespace boundaries
rx_split1900 = re.compile("(.{1,1895}\.|.{1,1900}\s|.*$)", re.S)
# same chunking, but <=500 chars (MyMemory request size limit)
rx_split500 = re.compile("(.{1,495}\.|.{1,500}\s|.*$)", re.S)
# snippet consists only of digits/punctuation/whitespace — nothing to translate
rx_empty = re.compile("^[\s\d,.:;§():-]+$")
# at least one two-letter word — i.e. something worth sending to a service
rx_letters = re.compile("\w\w+", re.UNICODE)
# temporary paragraph-break placeholder `/#§/` used by linebreakwise()
rx_breakln = re.compile("\s?/\s?#\s?§\s?/\s?", re.UNICODE)
# Google Translate (default backend)
#
# · calls mobile page http://translate.google.com/m?hl=en&sl=auto&q=TRANSLATE
# · iterates over the text in chunks of up to 1900 characters
#
class google:
def __init__(self, params={}):
self.params = params # config+argparse
self.max_len = 1900
self.rx_split = rx_split1900
# request text translation from google
def fetch(self, text, dst_lang="en", src_lang='auto'):
# fetch translation page
url = "https://translate.google.com/m?tl=%s&hl=%s&sl=%s&q=%s" % (
dst_lang, dst_lang, src_lang, quote_plus(text.encode("utf-8"))
)
result = http.get(url).content.decode("utf-8")
# extract content from text <div>
m = rx_gtrans.search(result)
if m:
text = m.group(1)
text = self.html_unescape(text)
else:
log.warning("NO TRANSLATION RESULT EXTRACTED: " + html)
log.debug("ORIG TEXT: " + repr(text))
return text
# decode HTML entities
def html_unescape(self, s):
try:
return html.unescape(s)
except:
return s.replace("'", "'").replace("&", "&").replace("<", "<").replace(">", ">").replace(""", '"')
# iterate over text segments (1900 char limit)
def translate(self, text, lang="auto"):
if lang == "auto":
lang = self.params["lang"]
#log.debug("translate %d chars" % len(text))
if len(text) < 2:
log.debug("skipping/len<2")
return text
elif rx_empty.match(text):
log.debug("skipping/empty")
return text
elif not rx_letters.search(text):
log.debug("skipping/noletters")
return text
elif len(text) >= self.max_len:
log.debug("spliterate/%s+" % self.max_len)
return " ".join(self.fetch(segment, lang) for segment in self.rx_split.findall(text))
else:
return self.fetch(text, lang)
# translate w/ preserving paragraph breaks (meant for table cell content)
def linebreakwise(self, text, lang="auto"):
if not self.params.get("quick"):
# split on linebreaks and translate each individually
text = "\n\n".join(self.translate(text) for text in text.split("\n\n"))
else:
# use temporary placeholder `/#§/`
text = self.translate(text.replace("\n\n", u"/#§/"))
text = re.sub(rx_breakln, "\n\n", text)
return text
# DeepL online translator
# · will easily yield HTTP 429 Too many requests,
# so probably not useful for multi-paragraph translation anyway (just text selections)
# · uses some kind of json-rpc
#
# data origins:
# · https://www.deepl.com/translator = nothing
# · jsonrpcId = random integer
# · sessionId = random client-side guid
# (https://www.deepl.com/js/translator_glossary_late.min.js?v=… → generated in `function u()`)
# · instanceId
# (https://www.deepl.com/PHP/backend/clientState.php?request_type=jsonrpc&il=EN → "uid":"(.+=)")
# · LMTBID cookie
# (https://s.deepl.com/web/stats?request_type=jsonrpc ← jsonrpc+session+instId+clientinfos)
#
# translation requests:
# < https://www2.deepl.com/jsonrpc
# cookies: LMTBID: GUID...
# referer: https://www.deepl.com/translator
#   response body:
# > result.translations[0].beams[0].postprocessed_sentence
#
class deepl_web(google):
    """DeepL web translator (scraping the jsonrpc interface).

    Emulates the browser client: a random jsonrpc id incremented per
    request, a client-side session guid, the instanceId fetched from
    clientState.php, and an LMTBID cookie via the stats endpoint.
    Easily runs into HTTP 429, so best suited to short selections.
    """
    def __init__(self, params):
        self.params = params
        self.lang = params["lang"].upper()    # DeepL expects upper-case language codes
        self.id_ = rand(202002000, 959009000) # e.g. 702005000, arbitrary, part of jsonrpc req-resp association
        self.sess = str(uuid.uuid4())         # e.g. 233beb7c-96bc-459c-ae20-157c0bebb2e4
        self.inst = ""                        # e.g. ef629644-3d1b-41a4-a2de-0626d23c99ee
        # fetch homepage (redundant; does NOT set the cookie)
        html = http.get("https://www.deepl.com/translator").text
        self.versions = dict(re.findall("([\w.]+)\?v=(\d+)", html))
        # instanceId from clientState…
        j = http.post(
            "https://www.deepl.com/PHP/backend/clientState.php?request_type=jsonrpc&il=EN",
            data = json.dumps({"jsonrpc":"2.0", "method":"getClientState", "params":{"v":"20180814"}, "id":self.id()})
        ).json()
        self.inst = j.get("clientVars", {}).get("uid", "")
        log.debug(j)  # FIX: was a stray debugging print()
        # acquire LMTBID cookie (not sure if needed)
        h = http.post(
            "https://s.deepl.com/web/stats?request_type=jsonrpc",
            data = json.dumps({
                "jsonrpc":"2.0", "method":"WebAppPushStatistics", "id":self.id(),
                "params":{
                    "value":{
                        "instanceId":self.inst,
                        "sessionId":self.sess,
                        "event":"web/pageview",
                        "url":"https://www.deepl.com/translator",
                        "userAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0",
                        "resolution":{"width":1920,"height":1080,"devicePixelRatio":1,"viewportWidth":1900,"viewportHeight":916},
                        "data":{"referrer":""}
                    }
                }
            })
        )
        log.debug(h.headers)  # FIX: was a stray debugging print()
    # next jsonrpc request id (paired with the eventual response)
    def id(self):
        self.id_ += 1
        return self.id_
    # build the LMT_handle_jobs jsonrpc payload for one text snippet
    def rpc(self, text):
        return json.dumps({
            "jsonrpc" : "2.0",
            "method" : "LMT_handle_jobs",
            "id" : self.id(),
            "params" : {
                "lang" : {
                    "target_lang" : self.lang,
                    "user_preferred_langs" : [
                        self.lang,
                        "EN"
                    ],
                    "source_lang_user_selected" : "auto"
                },
                "timestamp" : int(time.time()*1000),
                "priority" : -1,
                "commonJobParams" : {},
                "jobs" : [
                    {
                        "raw_en_context_after" : [],
                        "raw_en_context_before" : [],
                        "kind" : "default",
                        "preferred_num_beams" : 4,
                        "raw_en_sentence" : text,
                        "quality" : "fast"
                    }
                ]
            }
        })
    def translate(self, text):
        """Translate one snippet via www2.deepl.com/jsonrpc; returns input on failure."""
        # skip empty paragraph/table snippets
        if len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text):
            return text
        # small random delay, to look less bot-like / dodge rate limiting
        time.sleep(rand(1, 15) / 10.0)
        # request
        r = http.post(
            "https://www2.deepl.com/jsonrpc",
            data=self.rpc(text),
            headers={"Referer": "https://www.deepl.com/translator", "Content-Type": "text/plain"}
        )
        if r.status_code != 200:
            log.error(repr(r.content))
            return text
        # decode
        r = r.json()
        log.debug(r)  # FIX: was a stray debugging print()
        if r.get("result"):
            return r["result"]["translations"][0]["beams"][0]["postprocessed_sentence"]
        else:
            return text
# DeepL API
#
# So, there's a free API and the pro API now. This might make the _web scraping
# dancearound redundant. The free API is certainly more enticing for testing.
# In general, DeepL provides a more streamlined translation than GoogleTranslate.
# It's mostly in here because the API is quite simple.
#
# ENTIRELY UNTESTED
#
class deepl_api(deepl_web):
    """Official DeepL v2 API (pro endpoint).

    Far simpler than the web-scraping variant; flagged as entirely
    untested upstream.
    """
    def __init__(self, params):
        self.params = params
        self.api_url = "https://api.deepl.com/v2/translate"
    # one GET request per text chunk
    def translate(self, text, preserve=0):
        # skip empty paragraph/table snippets
        skippable = len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text)
        if skippable:
            return text
        # https://www.deepl.com/docs-api/translating-text/request/
        query = {
            "auth_key": self.params["api_key"],
            "text": text,
            "target_lang": self.params["lang"],
            "split_sentences": "1",
            "preserve_formatting": str(preserve)
            #"tag_handling": "xml"
        }
        r = http.get(self.api_url, params=query)
        if r.status_code != 200:
            log.error(r.text)
            return text
        found = r.json().get("translations")
        if found:
            return found[0]["text"]
        return text
    # formatting-preserving variant (table cells etc.)
    def linebreakwise(self, text):
        return self.translate(text, preserve=1)
# DeepL free API
#
# Registration is broken (error 10040 or whatever, "contact support" lel), even though
# it seems to create an account regardless; but API yields SSL or connection errors.
# Thus STILL UNTESTED.
#
class deepl_free_api(deepl_api):
    """DeepL free-plan API backend.

    BUGFIX: free-plan auth keys are only accepted on the
    api-free.deepl.com host; this class previously reused the pro
    endpoint, so every request failed — likely the source of the
    SSL/connection errors noted above.
    """
    def __init__(self, params):
        self.params = params
        # free-tier endpoint (keys ending in ":fx")
        self.api_url = "https://api-free.deepl.com/v2/translate"
# Translate-python
# requires `pip install translate`
#
# · provides "microsoft" backend (requires OAuth secret in api_key)
#
# · or "mymemory" (with email in `email` instead)
#
# https://translate-python.readthedocs.io/en/latest/
#
class translate_python(google):
    """Backend for the `translate` PyPI package (pip install translate).

    Provides the "microsoft" provider (OAuth secret in api_key) or
    "mymemory" (contact email in `email` instead).
    https://translate-python.readthedocs.io/en/latest/
    """
    def __init__(self, params=None):
        # config+argparse options (no shared mutable default dict)
        params = params if params is not None else {}
        self.params = params
        Translator = None
        try:
            from translate import Translator
        except Exception:
            # narrowed from a bare except: log the traceback, then bail with install hint
            log.error(format_exc())
            raise Exception("Run `pip install translate` to use this module.")
        # interestingly this backend function might just work as is.
        if re.search("mymemory", params.get("backend", ""), re.I):
            self.translate = Translator(
                provider="mymemory", to_lang=params["lang"], email=params.get("email", "")
            ).translate
        else:
            self.translate = Translator(
                provider="microsoft", to_lang=params["lang"], secret_access_key=params["api_key"]
            ).translate
        # though .linebreakwise has no equivalent, not sure if necessary,
        # or if formatting/linebreaks are preserved anyway
        # (or: we might just use the default google. implementation)
    # placeholder; replaced per-instance in __init__
    translate = None
# deep-translator
# requires `pip install deep-translator`
# · more backends than pytranslate,
# though PONS+Linguee are just dictionaries
# → https://github.com/nidhaloff/deep-translator
#
class deep_translator(google):
    """Backend for the `deep-translator` PyPI package.

    More backends than translate-python, though PONS and Linguee are
    word dictionaries only (hence the word-wise decorator).
    → https://github.com/nidhaloff/deep-translator
    """
    def __init__(self, params=None):
        # config+argparse options (no shared mutable default dict)
        params = params if params is not None else {}
        self.params = params
        backend = params.get("backend", "Pons")
        source = self.coarse_lang(params.get("from", "auto"))
        target = self.coarse_lang(params.get("lang", "en"))
        # import
        import functools
        import deep_translator
        # map to backends / uniform decorators
        if re.search("linguee", backend, re.I):
            self.translate = self.from_words(
                deep_translator.LingueeTranslator(source=source, target=target).translate
            )
        elif re.search("pons", backend, re.I):
            self.translate = self.from_words(
                deep_translator.PonsTranslator(source=source, target=target).translate
            )
        elif re.search("QCRI", backend, re.I):
            self.translate = functools.partial(
                deep_translator.QCRI(params["api_key"]).translate, source=source, target=target
            )
        elif re.search("yandex", backend, re.I):
            self.translate = functools.partial(
                deep_translator.YandexTranslator(params["api_key"]).translate, source=source, target=target
            )
        else:
            # ROBUSTNESS: previously fell through silently, leaving the
            # class-level translate=None and a cryptic TypeError on first use
            log.error("deep_translator: no matching backend for %r" % backend)
    # shorten language code-DE to just the two-letter moniker (zh-CN kept as-is)
    def coarse_lang(self, id):
        if id.find("-") > 0:
            id = re.sub("(?<!zh)-\w+", "", id)
        return id
    # decorator to translate dictionary-style backends word-by-word
    def from_words(self, fn):
        def translate(text):
            words = re.findall("(\w+)", text)
            # translate each unique word only once
            words = { w: fn(w) for w in list(set(words)) }
            text = re.sub("(\w+)", lambda m: words.get(m[0], m[0]), text)
            return text
        return translate
    # placeholder; replaced per-instance in __init__
    translate = None
# MyMemory, only allows max 500 bytes input per API request. Therefore reusing
# the Google backend, but with a different rx_split.
#
# We kinda need the source language here, as mymem provides no "auto" detection.
# Thus importing langdetect here, else fall back to "en". The alternative would
# be fiddling with OOs paragraph locales again, and turning it into a full on
# usability nightmare.
#
# doc:
# https://mymemory.translated.net/doc/spec.php
# errs:
# 'PLEASE SELECT TWO DISTINCT LANGUAGES'
# 'INVALID EMAIL PROVIDED'
# 'AUTO' IS AN INVALID SOURCE LANGUAGE . EXAMPLE: LANGPAIR=EN|IT USING 2 LETTER ISO OR RFC3066 LIKE ZH-CN. ALMOST ALL LANGUAGES SUPPORTED BUT SOME MAY HAVE NO CONTENT"
# 'SELECT' IS AN INVALID SOURCE LANGUAGE . EXAMPLE: LANGPAIR=EN|IT USING 2 LETTER ISO OR RFC3066 LIKE ZH-CN. ALMOST ALL LANGUAGES SUPPORTED BUT SOME MAY HAVE NO CONTENT"
#
class mymemory(google):
    """MyMemory backend (api.mymemory.translated.net).

    Only ~500 bytes per request are allowed, hence rx_split500 instead
    of the Google splitter. MyMemory has no "auto" source language, so
    langdetect is used when available, else the static "en" fallback.
    Doc: https://mymemory.translated.net/doc/spec.php
    """
    def __init__(self, params=None):
        # config+argparse options (no shared mutable default dict)
        self.params = params if params is not None else {}
        self.max_len = 500
        self.rx_split = rx_split500
        try:
            import langdetect
            # instance attribute shadows the static .lang() fallback below
            self.lang = langdetect.detect
        except Exception:
            # best-effort: missing langdetect only degrades source detection
            log.warning("`pip install langdetect` for best results\n"+format_exc())
    # API
    def fetch(self, text, lang="en", src_lang="en"):
        """Translate one chunk; raises on API-level error responses."""
        #@todo: this is absolutely the wrong place for language detection,
        # need more consistent parameter signatures all around
        if self.params.get("from") and self.params["from"] not in ("", "auto", "select"):
            src_lang = self.params["from"]
        else:
            src_lang = self.lang(text)
        if lang == src_lang:
            # MyMemory rejects identical language pairs
            log.info("Skipping "+src_lang+"|"+lang)
            return text
        # call
        url = "https://api.mymemory.translated.net/get?q=%s&langpair=%s|%s&of=json&mt=1" % (
            quote_plus(text.encode("utf-8")), src_lang, lang
        )
        if self.params.get("email"):
            # a registered email raises the daily request quota
            url = url + "&de=" + self.params["email"]
        # any exceptions are covered in main
        j = http.get(url).content.decode("utf-8")
        log.debug(j)
        j = json.loads(j)
        if j["responseStatus"] in ("200", 200):
            text = j["responseData"]["translatedText"]
            # or match[0]…
        else:
            raise Exception(j)
        return text
    # static fallback when langdetect is unavailable
    def lang(self, text):
        return "en"
# Because, why not?
# Invokes a commandline tool for translating texts.
# The "cmd" can be:
#
# `translate-cli -t {text}`
# Or
# `deep_translator -trans "google" -src "auto" -tg {lang} -txt {text}`
#
# Don't quote placeholders {}, {text} or {lang} in the command.
#
class cli(google):
    """Command-line tool backend.

    Pipes text through an external program, e.g.
      `translate-cli -t {text}`  or
      `deep_translator -trans "google" -src "auto" -tg {lang} -txt {text}`
    Placeholders {}, {text}, {lang}, {from} must not be quoted in cmd.
    """
    def __init__(self, params):
        self.params = params
        self.cmd = params.get("cmd", "translate-cli -o -f auto -t {lang} {text}")
    # pipe text through external program
    def translate(self, text):
        if rx_empty.match(text) or not rx_letters.search(text):
            return text
        # argument list with placeholders substituted (list form, no shell=True)
        cmd = [self.repl(arg, text, self.params) for arg in shlex.split(self.cmd)]
        try:
            proc = subprocess.run(cmd, stdout=subprocess.PIPE)
            return proc.stdout.decode("utf-8")
        except AttributeError as e:
            # Python < 3.5 fallback: no subprocess.run
            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE)
            proc.wait()
            return proc.stdout.read().decode("utf-8")
    # substitute placeholders: {}, {text} or $lang or %source%
    def repl(self, arg, text, params):
        repl = {
            "text|\}": text,
            "lang|target|to": params["lang"],
            # ROBUSTNESS: was params["from"] — raised KeyError when unset
            "from|source": params.get("from", "auto")
        }
        for k, v in repl.items():
            # BUGFIX: this pattern was one triple-quoted literal containing
            # the text `" + k + "`, so the key alternatives were never
            # interpolated and no placeholder ever matched; now concatenate
            # for real and group the alternation
            if re.match("^[\"']?[\{%$](" + k + ")[\}%$]?[\"']?$", arg):
                return v
        return arg
# maps a pagetranslate.t.* object (in main module),
# according to configured backend (now a string)
def assign_service(params):
    """Instantiate the translation backend matching params["backend"].

    The configured backend name is matched case-insensitively against
    the regex registry below; unknown names fall back to google.
    """
    which = params.get("backend", "Google")
    registry = {
        "^google": google,
        "^deepl [\s_] web": deepl_web,
        "^deepl [\s_] (api|pro)": deepl_api,
        "^deepl \s free": deepl_free_api,
        "^mymemory | translated\.net": mymemory,
        "^command | ^CLI | tool | program": cli,
        "^microsoft | translate[_-]py": translate_python,
        "^linguee | ^pons | QCRI | yandex | deep-tr": deep_translator,
    }
    cls = next(
        (impl for rx, impl in registry.items() if re.search(rx, which, re.I|re.X)),
        google
    )
    return cls(params)