Check-in [6f314952b9]
Overview
Comment: | Add combined unhtml() utility function for raw page extractors. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | 6f314952b924f0c34a533a936976cf75 |
User & Date: | mario on 2015-05-02 20:03:36 |
Other Links: | manifest | tags |
Context
2015-05-02
| ||
23:44 | Fix xiph search URL and by_format mapping. check-in: 026af5c9fb user: mario tags: trunk | |
20:03 | Add combined unhtml() utility function for raw page extractors. check-in: 6f314952b9 user: mario tags: trunk | |
20:03 | Clean out unneeded xml module references. check-in: 4797dcce8e user: mario tags: trunk | |
Changes
Modified channels/__init__.py from [90231f2619] to [4aa8b6efaf].
︙ | ︙ | |||
40 41 42 43 44 45 46 | import copy import inspect # Only export plugin classes __all__ = [ "GenericChannel", "ChannelPlugin", "use_rx", | | | 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | import copy import inspect # Only export plugin classes __all__ = [ "GenericChannel", "ChannelPlugin", "use_rx", "entity_decode", "strip_tags", "nl", "unhtml", "to_int" ] # generic channel module --------------------------------------- class GenericChannel(object): |
︙ | ︙ | |||
720 721 722 723 724 725 726 | return int(i[0]) # Strip newlines rx_spc = re.compile("\s+") def nl(str): return rx_spc.sub(" ", str).strip() | | > > | 720 721 722 723 724 725 726 727 728 729 730 731 | return int(i[0]) # Strip newlines rx_spc = re.compile("\s+") def nl(str): return rx_spc.sub(" ", str).strip() # Combine html tag, escapes and whitespace cleanup def unhtml(str): return nl(entity_decode(strip_tags(str))) |
Modified channels/xiph.py from [0e535d4458] to [10274b4d66].
︙ | ︙ | |||
209 210 211 212 213 214 215 | .*? class="format"\s+title="([^"]+)" .*? /by_format/([^"]+) """, html, re.X|re.S) # Assemble for homepage, title, listeners, playing, tags, url, bits, fmt in ls: r.append(dict( | | | | | 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | .*? class="format"\s+title="([^"]+)" .*? /by_format/([^"]+) """, html, re.X|re.S) # Assemble for homepage, title, listeners, playing, tags, url, bits, fmt in ls: r.append(dict( genre = unhtml(tags), title = unhtml(title), homepage = ahttp.fix_url(homepage), playing = unhtml(playing), url = "http://dir.xiph.org{}".format(url), listformat = "xspf", listeners = int(listeners), bitrate = bitrate(bits), format = self.mime_fmt(guess_format(fmt)), )) return r |
︙ | ︙ | |||
515 516 517 518 519 520 521 522 | return 0 # Extract mime type from text rx_fmt = re.compile("ogg|mp3|mp4|theora|nsv|webm|opus|mpeg") def guess_format(str): return rx_fmt.findall(str.lower() + "mpeg")[0] | < < < < < | 515 516 517 518 519 520 521 522 | return 0 # Extract mime type from text rx_fmt = re.compile("ogg|mp3|mp4|theora|nsv|webm|opus|mpeg") def guess_format(str): return rx_fmt.findall(str.lower() + "mpeg")[0] |
Modified contrib/delicast.py from [b077b90d19] to [cf6c41823e].
︙ | ︙ | |||
64 65 66 67 68 69 70 | """, tr, re.X|re.S) print ls if len(ls): homepage, country, title = ls[0] r.append(dict( homepage = homepage, playing = country, | | | | 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | """, tr, re.X|re.S) print ls if len(ls): homepage, country, title = ls[0] r.append(dict( homepage = homepage, playing = country, title = unhtml(title), url = "urn:delicast", genre = cat, # genre = unhtml(tags), )) return r # Update `url` def row(self): r = ChannelPlugin.row(self) if r.get("url") == "urn:delicast": html = ahttp.get(r["homepage"]) ls = re.findall("^var url = \"(.+)\";", html, re.M) r["url"] = ls[0] return r |