Check-in [2f83c61edc]
Overview
| Comment: | update regex extraction fallback |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk |
| Files: | files | file ages | folders |
| SHA1: | 2f83c61edcf9633c45ca0f035db8aa61 |
| User & Date: | mario on 2022-02-16 08:16:21 |
| Other Links: | manifest | tags |
Context
|
2022-02-16
| ||
| 08:20 | updated key mapping check-in: 01a94c1fb6 user: mario tags: trunk | |
| 08:16 | update regex extraction fallback check-in: 2f83c61edc user: mario tags: trunk | |
|
2022-02-15
| ||
| 22:05 | Extract JSON blob from __NEXT_DATA__ script section check-in: b8a37b9b5b user: mario tags: trunk | |
Changes
Modified contrib/radionet.py from [d0ea1042b5] to [5a89f5ed41].
1 2 3 4 5 | # encoding: UTF-8 # api: streamtuner2 # title: radio.net # description: Europe's biggest radio platform # url: http://radio.net/ | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 | # encoding: UTF-8 # api: streamtuner2 # title: radio.net # description: Europe's biggest radio platform # url: http://radio.net/ # version: 1.2 # type: channel # category: radio # png: # iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAAt0lEQVR42mNgYGD4r+Ar/F/BDwkD+SBxojBMs1mLPBArgGlFqEEENYMNQNLsukIDYkirAvGu # ABsA1OC6XOP/5f8nwIaYAg0k2gBFsAsgTgcZkvnfDugFEeK9AFKsCPMG0CU6eZJgQ4R1eP8H7LLEivWyFJANQcQCLPBAmkGG4MJohmA6C6QA5gI5OxEUDNII # MwSvASBFIA3ociCxkWQAKMDICkSQIpgh2LDnSmP80YhsCFEJiRIMADpmeUOpqgjRAAAAAElFTkSuQmCC # priority: optional |
| ︙ | ︙ | |||
110 111 112 113 114 115 116 |
# prefetch images from embedded json (genres and location would also be sourceable from "playables":[…])
imgs = dict(re.findall('\],"id":"(\w+)","logo100x100":"(htt[^"]+)",', html))
#log.DATA(imgs)
# top 100 of the most horrible html serializations
"""
| > | | < < < < | < < < | < < | | < | | < | < | | | | 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 |
# prefetch images from embedded json (genres and location would also be sourceable from "playables":[…])
imgs = dict(re.findall('\],"id":"(\w+)","logo100x100":"(htt[^"]+)",', html))
#log.DATA(imgs)
# top 100 of the most horrible html serializations
"""
<div class="sc-1crnfmg-9 sc-1crnfmg-10 kqrZgK"><a title="Listen
to the station 1LIVE online now" data-testid="list-item"
target="_self" href="/s/1live"><div class="sc-1crnfmg-7
hlHPMo"><div class="sc-1crnfmg-0 fBSron"><div
class="lazyload-wrapper "><div
class="lazyload-placeholder"></div></div></div><div
class="sc-1crnfmg-1 eOYJrC"><div class="sc-1crnfmg-2
eBaEwX">1LIVE</div><div class="sc-1crnfmg-4 cKweix">Cologne,
Pop</div><div class="sc-1crnfmg-5
, kTPZiR"></div></div></div></a></div>
"""
rx = re.compile("""
<a\s+[^>]*\\bhref="(?:https?:)?(?://(?:[\w-]+)\.radio\.net)?/s/([^"]+)/?"> .{0,500}?
<div[^>]+> (\w[^<]+) </div> .*?
<div[^>]+> (\w[^/,]+) \s* [,/] \s+ (\w.+?)</div>
""", re.X|re.S
)
# extract text fields
for d in re.findall(rx, html):
#log.DATA_ROW(d)
href, title, location, desc = d
|
| ︙ | ︙ | |||
162 163 164 165 166 167 168 |
# process json
def from_json(self, ls_json):
ls = []
for js in ls_json:
js = json.loads(js)
#print(json.dumps(js, indent=4))
ls += js["props"]["pageProps"]["data"]["stations"]["playables"]
| | | 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
# process json
def from_json(self, ls_json):
ls = []
for js in ls_json:
js = json.loads(js)
#print(json.dumps(js, indent=4))
ls += js["props"]["pageProps"]["data"]["stations"]["playables"]
#ls += js["data"]["stations"]["playables"]
r = []
for row in ls:
href = row["id"]
r.append(dict(
name = href,
title = row["name"],
genre = ",".join(row.get("genres", [])),
|
| ︙ | ︙ |