PageTranslate: Artifact [9e75e1c752]

Artifact 9e75e1c75205344c008a1275b4a6cc55a9672a64:

File pythonpath/translationbackends.py — part of check-in [5a0ee83349] at 2020-06-10 05:08:32 on branch trunk — Add default flags `-o -t {lang}` for cli backend. (user: mario size: 12002)
# encoding: utf-8
# api: pagetranslate
# type: classes
# category: language
# title: via_* translation backends
# description: Implements the alternative services (google, deepl, ...)
# version: 1.6
# state: beta
# depends: python:requests (>= 2.5)
# config: -
#
# Different online service backends and http interfaces are now coalesced here.
# Each class handles sentence/blockwise transfer to one of the online services,
# to get text snippets transformed.
#


# modules
import re, json, time, uuid
import os, subprocess, shlex
from random import randrange as rand
from httprequests import http, urlencode, quote, quote_plus
import logging as log
from traceback import format_exc


# regex
# Raw strings throughout, so the regex escapes (\s, \d, \w, \.) reach `re`
# untouched instead of relying on Python passing unknown string escapes through.

# captures the translated text out of the result <div class="t0"> of the mobile page
rx_gtrans = re.compile(r'class="t0">(.+?)</div>', re.S)
# chunks long text; the alternatives are tried in this order:
#  1. up to 1895 chars followed by a literal period (cut at a sentence end)
#  2. up to 1900 chars followed by one whitespace char
#  3. whatever remains (this alternative also yields one empty match at the very end)
rx_splitpara = re.compile(r"(.{1,1895}\.|.{1,1900}\s|.*$)", re.S)
# text made up solely of whitespace, digits and the listed punctuation
rx_empty = re.compile(r"^[\s\d,.:;§():-]+$")
# at least two consecutive word characters
rx_letters = re.compile(r"\w\w+", re.UNICODE)
# the `/#§/` paragraph placeholder, tolerating one whitespace char around each symbol
rx_breakln = re.compile(r"\s?/\s?#\s?§\s?/\s?", re.UNICODE)


# Google Translate (default backend)
#
#  · calls mobile page http://translate.google.com/m?hl=en&sl=auto&q=TRANSLATE
#  · iterates over each 1900 characters
#
class google:
    """Default backend: fetches translations from the mobile Google Translate page.

    `params` is the combined config/argparse dict. `translate()` reads
    `params["lang"]` as the default target language, and `linebreakwise()`
    consults `params.get("quick")`.
    """

    def __init__(self, params=None):
        # fresh dict per instance; a `{}` default would be shared between instances
        self.params = {} if params is None else params  # config+argparse

    # request text translation from google
    def askgoogle(self, text, dst_lang="en", src_lang='auto'):
        """Translate one text snippet with a single HTTP request.

        Returns the translated text, or the unchanged input `text` if no
        result could be extracted from the response page.
        """
        # local import: keeps this class self-contained, and the page content
        # below is deliberately not named `html` so the module isn't shadowed
        from html import unescape

        # fetch translation page
        url = "https://translate.google.com/m?hl=%s&sl=%s&q=%s" % (
            dst_lang, src_lang, quote_plus(text.encode("utf-8"))
        )
        page = http.get(url).content.decode("utf-8")
        # extract content from text <div>
        m = rx_gtrans.search(page)
        if m:
            # decode all HTML entities in one pass (named and numeric), so that
            # e.g. `&amp;lt;` becomes `&lt;` rather than being decoded twice
            return unescape(m.group(1))
        log.warning("NO TRANSLATION RESULT EXTRACTED: " + page)
        log.debug("ORIG TEXT: " + repr(text))
        return text

    # iterate over text segments (1900 char limit)
    def translate(self, text, lang="auto"):
        """Translate `text` into `lang` ("auto" = configured `params["lang"]`).

        Snippets that are too short, consist only of digits/punctuation, or
        contain no two-letter word are returned untouched. Text of 1900+
        characters is split with `rx_splitpara`, translated per segment, and
        re-joined with single spaces.
        """
        if lang == "auto":
            lang = self.params["lang"]
        #log.debug("translate %d chars" % len(text))
        if len(text) < 2:
            log.debug("skipping/len<2")
            return text
        elif rx_empty.match(text):
            log.debug("skipping/empty")
            return text
        elif not rx_letters.search(text):
            log.debug("skipping/noletters")
            return text
        elif len(text) >= 1900:
            log.debug("spliterate/1900+")
            # findall() also reports an empty match at the end of the text;
            # skip it rather than spending a request on an empty query
            segments = [segment for segment in rx_splitpara.findall(text) if segment]
            return " ".join(self.askgoogle(segment, lang) for segment in segments)
        else:
            return self.askgoogle(text, lang)

    # translate w/ preserving paragraph breaks (meant for table cell content)
    def linebreakwise(self, text, lang="auto"):
        """Translate `text` while keeping its blank-line paragraph breaks.

        NOTE(review): `lang` is accepted but not forwarded to `translate()`,
        so the configured default language is always used. Subclasses in this
        file inherit this method while defining `translate(text)` without a
        `lang` parameter, hence it is left that way here.
        """
        if not self.params.get("quick"):
            # split on linebreaks and translate each individually
            return "\n\n".join(self.translate(para) for para in text.split("\n\n"))
        # use temporary placeholder `/#§/`, then turn it back into paragraph breaks
        translated = self.translate(text.replace("\n\n", u"/#§/"))
        return rx_breakln.sub("\n\n", translated)


# DeepL online translator
#  · will easily yield HTTP 429 Too many requests,
#    so probably not useful for multi-paragraph translation anyway (just text selections)
#  · uses some kind of json-rpc
#
# data origins:
#  · https://www.deepl.com/translator = nothing
#  · jsonrpcId = random integer
#  · sessionId = random client-side guid
#      (https://www.deepl.com/js/translator_glossary_late.min.js?v=… → generated in `function u()`)
#  · instanceId
#      (https://www.deepl.com/PHP/backend/clientState.php?request_type=jsonrpc&il=EN → "uid":"(.+=)")
#  · LMTBID cookie
#      (https://s.deepl.com/web/stats?request_type=jsonrpc ← jsonrpc+session+instId+clientinfos)
#
# translation requests:
#  < https://www2.deepl.com/jsonrpc
#    cookies: LMTBID: GUID...
#    referer: https://www.deepl.com/translator
# response body:
#  > result.translations[0].beams[0].postprocessed_sentence
#
class deepl_web(google):
    """DeepL backend that uses the JSON-RPC endpoints of the web translator.

    The constructor performs three HTTP requests (homepage, clientState,
    statistics push) to collect the ids described in the comment block above
    this class. `translate()` then posts one `LMT_handle_jobs` request per
    text snippet.
    """

    def __init__(self, params):
        """Store `params` and set up the session; reads `params["lang"]`."""
        self.params = params
        self.lang = params["lang"].upper()
        self.id_ = rand(202002000, 959009000) # e.g. 702005000, arbitrary, part of jsonrpc req-resp association
        self.sess = str(uuid.uuid4())    # e.g. 233beb7c-96bc-459c-ae20-157c0bebb2e4
        self.inst = ""   # e.g. ef629644-3d1b-41a4-a2de-0626d23c99ee

        # fetch homepage (redundant)
        html = http.get("https://www.deepl.com/translator").text  # should fetch us the cookie / No, it doesn't
        # collect `name?v=123` references found in the page as {name: version}
        self.versions = dict(re.findall(r"([\w.]+)\?v=(\d+)", html))

        # instanceId from clientState…
        j = http.post(
            "https://www.deepl.com/PHP/backend/clientState.php?request_type=jsonrpc&il=EN",
            data = json.dumps({"jsonrpc":"2.0", "method":"getClientState", "params":{"v":"20180814"}, "id":self.id()})
        ).json()
        self.inst = j.get("clientVars", {}).get("uid", "")

        # acquire LMTBID cookie (not sure if needed); the response itself is not used
        http.post(
            "https://s.deepl.com/web/stats?request_type=jsonrpc",
            data = json.dumps({
                "jsonrpc":"2.0", "method":"WebAppPushStatistics", "id":self.id(),
                "params":{
                    "value":{
                        "instanceId":self.inst,
                        "sessionId":self.sess,
                        "event":"web/pageview",
                        "url":"https://www.deepl.com/translator",
                        "userAgent":"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:72.0) Gecko/20100101 Firefox/72.0",
                        "resolution":{"width":1920,"height":1080,"devicePixelRatio":1,"viewportWidth":1900,"viewportHeight":916},
                        "data":{"referrer":""}
                    }
                }
            })
        )

    def id(self):
        """Return the next JSON-RPC request id (increments the counter by one)."""
        self.id_ += 1
        return self.id_

    def rpc(self, text):
        """Build the JSON body of an `LMT_handle_jobs` request for `text`."""
        return json.dumps({
           "jsonrpc" : "2.0",
           "method" : "LMT_handle_jobs",
           "id" : self.id(),
           "params" : {
              "lang" : {
                 "target_lang" : self.lang,
                 "user_preferred_langs" : [
                    self.lang,
                    "EN"
                 ],
                 "source_lang_user_selected" : "auto"
              },
              "timestamp" : int(time.time()*1000),
              "priority" : -1,
              "commonJobParams" : {},
              "jobs" : [
                 {
                    "raw_en_context_after" : [],
                    "raw_en_context_before" : [],
                    "kind" : "default",
                    "preferred_num_beams" : 4,
                    "raw_en_sentence" : text,
                    "quality" : "fast"
                 }
              ]
           }
        })

    def translate(self, text):
        """Translate one snippet; returns `text` unchanged on any failure."""
        # skip empty paragraph/table snippets
        if len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text):
            return text

        # delay? (random pause of 0.1 to 1.4 seconds before each request)
        time.sleep(rand(1, 15) / 10.0)

        # request
        r = http.post(
            "https://www2.deepl.com/jsonrpc",
            data=self.rpc(text),
            headers={"Referer": "https://www.deepl.com/translator", "Content-Type": "text/plain"}
        )
        if r.status_code != 200:
            log.error(repr(r.content))
            return text

        # decode; a reply lacking any level of the expected structure
        # falls back to the untranslated text instead of raising
        reply = r.json()
        try:
            return reply["result"]["translations"][0]["beams"][0]["postprocessed_sentence"]
        except (KeyError, IndexError, TypeError):
            log.warning("NO TRANSLATION IN RESPONSE: " + repr(reply))
            return text


# DeepL API costs money
#
# Not sure if anyone will use this really. Unless the _web version allows testing,
# nobody's gonna shell out money for a subscription - even if it surpassed GoogleT.
# Likely makes sense for commercial users however. And the API is quite simple, so
# that's why it's here.
#
# ENTIRELY UNTESTED
#    
class deepl_api(deepl_web):
    """DeepL backend for the paid REST API (`/v2/translate`).

    Reads `params["api_key"]` and `params["lang"]` at translation time.
    """

    def __init__(self, params):
        # only keeps the config dict; the parent constructor (which issues
        # HTTP requests to the web translator) is not invoked
        self.params = params

    def translate(self, text, preserve=0):
        """Translate `text`; `preserve` is sent as `preserve_formatting`.

        Returns the input unchanged for snippets without translatable content,
        on a non-200 response, or when the reply holds no translations.
        """
        # skip empty paragraph/table snippets
        untranslatable = len(text) < 2 or rx_empty.match(text) or not rx_letters.search(text)
        if untranslatable:
            return text

        # https://www.deepl.com/docs-api/translating-text/request/
        query = {
            "auth_key": self.params["api_key"],
            "text": text,
            "target_lang": self.params["lang"],
            "split_sentences": "1",
            "preserve_formatting": str(preserve)
            #"tag_handling": "xml"
        }
        r = http.get("https://api.deepl.com/v2/translate", params=query)

        if r.status_code != 200:
            log.error(r.text)
            return text

        found = r.json().get("translations")
        return found[0]["text"] if found else text

    def linebreakwise(self, text):
        """Translate in one request, asking the API to preserve formatting."""
        return self.translate(text, preserve=1)


# Translate-python
# requires `pip install translate`
#
#  · provides "microsoft" backend (requires OAuth secret in api_key)
#
#  · or "mymemory" (with email in `email` instead)
#
# https://translate-python.readthedocs.io/en/latest/
#
class translate_python(google):
    """Backend delegating to the `translate` package (translate-python).

    Uses the "mymemory" provider when `params.get("mymemory")` is set (with
    `params.get("email", "")`), otherwise the "microsoft" provider with
    `params["api_key"]`. The target language is `params["lang"]`.
    """

    # class-level placeholder; __init__ replaces it per instance with the
    # bound `Translator(...).translate` method
    translate = None
    #linebreakwise = None

    def __init__(self, params=None):
        """Import the `translate` package and bind the provider's translate().

        Raises Exception if the package cannot be imported.
        """
        # fresh dict per instance; a `{}` default would be shared between instances
        params = {} if params is None else params
        self.params = params  # config+argparse
        #self.error = pagetranslate.MessageBox

        try:
            from translate import Translator
        except Exception as err:
            # `except Exception` (not a bare except) lets KeyboardInterrupt and
            # SystemExit pass through instead of being turned into an install hint
            log.error(format_exc())
            raise Exception("Run `pip install translate` to use this module.") from err

        # interestingly this backend function might just work as is.
        if params.get("mymemory"):
            self.translate = Translator(
                provider="mymemory", to_lang=params["lang"], email=params.get("email", "")
            ).translate
        else:
            self.translate = Translator(
                provider="microsoft", to_lang=params["lang"], secret_access_key=params["api_key"]
            ).translate

        # though .linebreakwise has no equivalent, not sure if necessary,
        # or if formatting/linebreaks are preserved anyway
        # (or: we might just use the default google. implementation)
        #self.linebreakwise = self.translate


# Because, why not?
# Invokes a commandline tool for translating texts.
#
# → with e.g. `translate-cli -t {text}` in "cmd"
#
# Don't quote placeholders {}, {text} or {lang} in the command.
#
class cli(google):
    """Backend that pipes each snippet through an external command.

    The command line comes from `params.get("cmd")`, defaulting to
    `translate-cli -o -f auto -t {lang} {text}`. It is split with
    `shlex.split` first, and the placeholders `{}`, `{text}` and `{lang}` are
    substituted in each resulting argument afterwards, which is why they must
    not be quoted in the command.
    """

    # matches exactly the three supported placeholders: {}, {text}, {lang}
    _rx_placeholder = re.compile(r"\{(text|lang|)\}")

    def __init__(self, params):
        self.params = params
        self.cmd = params.get("cmd", "translate-cli -o -f auto -t {lang} {text}")

    # pipe text through external program
    def translate(self, text):
        """Run the configured command for `text` and return its stdout.

        Returns `text` unchanged when it has no translatable content or when
        the command exits with a non-zero status.
        """
        if rx_empty.match(text) or not rx_letters.search(text):
            return text
        # keyed by placeholder name without braces; "" stands for the bare `{}`
        values = {"": text, "text": text, "lang": self.params["lang"]}
        # single-pass substitution: other braces in the command stay as they
        # are, and placeholder-like sequences inside `text` are not expanded
        cmd = [
            self._rx_placeholder.sub(lambda m: values[m.group(1)], arg)
            for arg in shlex.split(self.cmd)
        ]
        proc = subprocess.run(cmd, stdout=subprocess.PIPE)
        if proc.returncode != 0:
            # keep the original text rather than whatever a failed run printed
            log.error("cli backend exited with status %d: %r" % (proc.returncode, cmd))
            return text
        return proc.stdout.decode("utf-8")



# maps a pagetranslate.t.* object (in main module),
# according to config dict {"goog":1, "deepl":0}
def assign_service(params):
    """Instantiate the backend selected by the truthy flags in `params`.

    Flags are checked in a fixed order: deepl_web, deepl_api, then any of
    translate_python/microsoft/mymemory, then cli. Without a match, the
    google backend is returned. The chosen class receives `params`.
    """
    if params.get("deepl_web"):
        return deepl_web(params)
    if params.get("deepl_api"):
        return deepl_api(params)
    # three config keys all lead to the translate-python wrapper
    if any(params.get(flag) for flag in ("translate_python", "microsoft", "mymemory")):
        return translate_python(params)
    if params.get("cli"):
        return cli(params)
    return google(params)