Check-in [b6d88bcd1f]
Overview
Comment: | Document more interna of radio.net extraction |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
b6d88bcd1f9fc688519407e7ecf1cdc8 |
User & Date: | mario on 2017-02-16 16:56:59 |
Other Links: | manifest | tags |
Context
2017-02-20
| ||
19:54 | Add stub parameter -w (used by pydoc) check-in: ff61e15d6a user: mario tags: trunk | |
2017-02-16
| ||
16:56 | Document more interna of radio.net extraction check-in: b6d88bcd1f user: mario tags: trunk | |
2017-02-15
| ||
21:38 | document recent channel/feature plugins check-in: 7eb6bd6410 user: mario tags: trunk | |
Changes
Modified contrib/radionet.py from [29c966c64c] to [460ab10c07].
︙ | ︙ | |||
40 41 42 43 44 45 46 | action.extract_playlist.extr_urls["rnjs"] = dict( url = r" (?x) \"streamUrl\" \s*:\s* \"(\w+:\\?/\\?/[^\"]+)\" ", title = r" (?x) \"(?:description|seoTitle)\" \s*:\s* \"([^\"]+)\" ", unesc = "json", ) | | > > > > > > > > > > > | > | 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 | action.extract_playlist.extr_urls["rnjs"] = dict( url = r" (?x) \"streamUrl\" \s*:\s* \"(\w+:\\?/\\?/[^\"]+)\" ", title = r" (?x) \"(?:description|seoTitle)\" \s*:\s* \"([^\"]+)\" ", unesc = "json", ) # Radio.net # # · Uses HTML block-wise regex extraction. # → <a href="stationname.radio.net"> <img…> <strong>…</strong> <small>…</small> # # · There's an API key in each page listing, contained in a script block # as `apiKey: '…'?` # # · Which is needed for generating the station info JSON urls: # → https://api.radio.net/info/v2/search/station?apikey=…&pageindex=1&station=STNAME # # · To extract these JSON info targets, a custom extraction recipe is injected # into the action module. # → "streamUrl": and "description": are scanned for. # class radionet (ChannelPlugin): # control flags has_search = False audioformat = "audio/mpeg" listformat = "rnjs" |
︙ | ︙ | |||
75 76 77 78 79 80 81 | # category page, get key html = ahttp.get(self.genre_url.format(cat)) for p in range(2, 4): if html.find('"?p={}">'.format(p)) >= 0: html += ahttp.get(self.genre_url.format(cat) + "?p={}".format(p)) self.set_key(html) r = [] | | < < < < < < < < < < < < < < < < < < | | 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | # category page, get key html = ahttp.get(self.genre_url.format(cat)) for p in range(2, 4): if html.find('"?p={}">'.format(p)) >= 0: html += ahttp.get(self.genre_url.format(cat) + "?p={}".format(p)) self.set_key(html) r = [] # split station blocks for row in re.split("""<div class="stationinfo""", html)[1:]: # extract text fields d = re.findall(""" <a\s+href="(?:https?:)?(//([\w-]+)\.radio\.net/?)" .*? <img\s+src="([^<">]+)" .*? <strong[^>]*>(.*?)</strong> .*? <small[^>]*>\s*(.*?)\s*</small> .*? """, row, re.X|re.S) # refurbish extracted strings if d and len(d) and len(d[0]) == 5: |
︙ | ︙ |