Check-in [466f591a1a]
Overview
Comment: | radionet updated to extract from horrendous new html pages. Perhaps switching to extracting just the JSON blob later on. urn: resolution remains unchanged. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
466f591a1a46c8c897f7621b41277885 |
User & Date: | mario on 2020-05-12 15:57:26 |
Other Links: | manifest | tags |
Context
2020-05-12
| ||
16:17 | temporary fix for MyOggRadio being offline. check-in: 7efd6c6ea2 user: mario tags: trunk | |
15:57 | radionet updated to extract from horrendous new html pages. Perhaps switching to extracting just the JSON blob later on. urn: resolution remains unchanged. check-in: 466f591a1a user: mario tags: trunk | |
15:56 | Update radiolist plugin for new wordpress site; now uses genre categories instead of countries. check-in: c952ce442c user: mario tags: trunk | |
Changes
Modified contrib/radionet.py from [06bc97f343] to [6f48b334fd].
1 2 3 4 5 | # encoding: UTF-8 # api: streamtuner2 # title: radio.net # description: Europe's biggest radio platform # url: http://radio.net/ | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 | # encoding: UTF-8 # api: streamtuner2 # title: radio.net # description: Europe's biggest radio platform # url: http://radio.net/ # version: 1.0 # type: channel # category: radio # png: # iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAt0lEQVR42mNgYGD4r+Ar/F/BDwkD+SBxojBMs1mLPBArgGlFqEEENYMNQNLsukIDYkirAvGu # ABsA1OC6XOP/5f8nwIaYAg0k2gBFsAsgTgcZkvnfDugFEeK9AFKsCPMG0CU6eZJgQ4R1eP8H7LLEivWyFJANQcQCLPBAmkGG4MJohmA6C6QA5gI5OxEUDNII # MwSvASBFIA3ociCxkWQAKMDICkSQIpgh2LDnSmP80YhsCFEJiRIMADpmeUOpqgjRAAAAAElFTkSuQmCC # priority: optional |
︙ | ︙ | |||
import re
from config import *
from channels import *
import ahttp
import action


# obsolete: hook special JSON format in to avoid grepping images by generic handler
action.playlist_fmt_prio.insert(5, "rnjs")
action.playlist_content_map.insert(7, ("rnjs", r'"logo175x175rounded"'))
action.extract_playlist.extr_urls["rnjs"] = dict(
    url   = r" (?x) \"streamUrl\" \s*:\s* \"(\w+:\\?/\\?/[^\"]+)\" ",
    title = r" (?x) \"(?:description|seoTitle)\" \s*:\s* \"([^\"]+)\" ",
    unesc = "json",
)


# Radio.net
#
# · Uses HTML block-wise regex extraction.
#   → <div class="sc-1crnfmg-11 sc-1crnfmg-12 cYzyuZ"><a href="/s/kissfmuk">
#   → basically just title/url, images in a separate json blob
#
# · Currently using an urn: to resolve stream urls at play time.
#
# previously:
# · There's an API key in each page listing, contained in a script block
#   as `apiKey: '…'?`
# · Which is needed for generating the station info JSON urls:
#   → https://api.radio.net/info/v2/search/station?apikey=…&pageindex=1&station=STNAME
# · To extract these JSON info targets, a custom extraction recipe is injected
#   into the action module.
#   → "streamUrl": and "description": are scanned for.
#
# todo:
# · https://prod.radio-api.net/stations/local?count=10
#
class radionet (ChannelPlugin):

    # control flags
    has_search = False
    audioformat = "audio/mpeg"
    listformat = "href"
    titles = dict(listeners=False, playing="Description")
    img_resize = 33

    # sources
    apiPrefix = "https://api.radio.net/info/v2"
    genre_url = "http://www.radio.net/genre/{}"
    apiKey = None


    # Retrieve cat list and map
    def update_categories(self):
        """Fetch the genre overview page and rebuild `self.categories`.

        The list always starts with the fixed entry "Top 40 and Charts",
        followed by the link titles of all `/genre/<word>` anchors found
        in the page. Also refreshes `self.apiKey` from the same HTML.
        """
        html = ahttp.get("http://www.radio.net/genre")
        self.set_key(html)
        # raw string: the pattern contains `\w`, which is not a valid str escape
        ls = re.findall(r"""<a class="[^"]+" href="/genre/(\w+)">([^<]+)</a>""", html)
        self.categories = ["Top 40 and Charts"] + [title for _slug, title in ls]


    # Fetch entries
    def update_streams(self, cat, search=None):
        """Scrape the station listing for genre `cat`.

        `search` is accepted for interface compatibility but not used here.
        Returns a list of station dicts with the keys name, genre, title,
        playing, url, homepage and img. The `url` is an `urn:radionet:<id>`
        placeholder which `resolve_urn()` turns into a stream address later.
        """
        # category page, get key
        urlcat = cat.replace(" ", "-").lower()
        page_url = self.genre_url.format(urlcat)
        html = ahttp.get(page_url)
        # append follow-up pages 2 and 3 when the accumulated html links to them
        for p in range(2, 4):
            if '?p={}"'.format(p) in html:
                html += ahttp.get(page_url + "?p={}".format(p))
        self.set_key(html)
        r = []

        # prefetch images from embedded json (genres and location would also be sourceable from "playables":[…])
        imgs = dict(re.findall(r'\],"id":"(\w+)","logo100x100":"(htt[^"]+)",', html))
        #log.DATA(imgs)

        # top 100 of the most horrible html serializations
        #
        #   </div></a></div>
        #   <div class="sc-1crnfmg-11 sc-1crnfmg-12 cYzyuZ"><a href="/s/kissfmuk">
        #     <div class="sc-1crnfmg-8 cEmqZI">
        #       <div class="sc-1crnfmg-0 iDuFwr">
        #         <div class="lazyload-placeholder"></div>
        #         <div class="sc-1crnfmg-1 ezaTdn">
        #           <svg class="sc-1crnfmg-9 hQgRat" xmlns="http://www.w3.org/2000/svg" viewbox="0 0 32 32"><path d="M4 32l25.26-16L4 0z"></path></svg>
        #         </div>
        #       </div>
        #       <div class="sc-1crnfmg-2 incVhC">
        #         <div class="sc-1crnfmg-3 chXIOx">KISS FM UK</div>
        #         <div class="sc-1crnfmg-5 fitBQg">London, <!-- -->United Kingdom<!-- --> / Hits, Pop, R'n'B</div>
        #         <div class="sc-1crnfmg-6 jaxcgd"></div>
        #       </div>
        #
        #   {"city":"Hanover","country":"Germany","genres":["Pop","80s","Top 40 & Charts"],"id":"ndr2",
        #    "logo100x100":"https://d3kle7qwymxpcy.cloudfront.net/images/broadcasts/02/33/2262/1/c100.png",
        #    "logo300x300":"https://d3kle7qwymxpcy.cloudfront.net/images/broadcasts/02/33/2262/1/c300.png",
        #    "logo630x630":"","name":"NDR 2","type":"STATION"}')
        #
        rx = re.compile(r"""
            <a\s+href="(?:https?:)?(?://(?:[\w-]+)\.radio\.net)?/s/([^"]+)/?">
            .*?
            <div[^>]+> (\w[^<]+) </div>
            \s*
            <div[^>]+> (\w[^/]+) \s+ / \s+ (\w.+?)</div>
            """,
            re.X|re.S
        )

        # extract text fields
        for href, title, location, desc in rx.findall(html):
            #log.DATA_ROW(d)
            # The greedy `[^"]+` group swallows an optional trailing slash, so
            # the `/?` in the pattern never removes it; strip it here to keep
            # the station id usable as urn, homepage path and image lookup key.
            href = href.rstrip("/")

            # refurbish extracted strings
            r.append(dict(
                name = href,
                genre = unhtml(desc),
                title = unhtml(title),
                playing = unhtml(location),
                url = "urn:radionet:"+href,
                homepage = "http://www.radio.net/s/{}".format(href),
                img = imgs.get(href, "https://www.radio.net/favicon.ico"),
            ))
        return r


    # api search is gone, now requires to fetch streamUrl from per-radio homepage
    def resolve_urn(self, row):
        """Replace an `urn:radionet:` placeholder url with a stream url.

        Returns None without touching `row` when its "url" does not start
        with `urn:radionet:`. Otherwise downloads `row["homepage"]`, and if
        a stream url is found in it, stores the first hit in `row["url"]`.
        The (possibly unchanged) row is returned in that case.
        """
        if not row.get("url", "-").startswith("urn:radionet:"):
            return
        html = ahttp.get(row["homepage"])
        stream = re.findall(r'"stream[s:[{"\s]+url"[\s:]+"([^"]+)"', html, re.S|re.I)
        if stream:
            row["url"] = stream[0]
        return row


    # extract JavaScript key from any HTML blob (needed for station query)
    def set_key(self, html):
        """Store the first `apiKey: '…'` value found in `html` as `self.apiKey`.

        Leaves the current key untouched when the page contains none.
        """
        ls = re.findall(r"""apiKey: '(\w+)'""", html)
        if ls:
            self.apiKey = ls[0]