Check-in [c75d34fd1e]
Overview
Comment: | Elevate liveradio.ie channel to default plugin. Introduce support for PyQuery extraction (HTML5 microdata). |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | c75d34fd1e9e5db8e4f59333852034a4 |
User & Date: | mario on 2020-05-12 18:24:51 |
Other Links: | manifest | tags |
Context
2020-05-13
| ||
06:42 | Fix integer handling in config dialog (once more). See also: ticket #4163057c37 check-in: 369203acfe user: mario tags: trunk | |
2020-05-12
| ||
18:24 | Elevate liveradio.ie channel to default plugin. Introduce support for PyQuery extraction (HTML5 microdata). check-in: c75d34fd1e user: mario tags: trunk | |
16:17 | temporary fix for MyOggRadio being offline. check-in: 7efd6c6ea2 user: mario tags: trunk | |
Changes
Modified channels/liveradio.py from [cfea0c56fd] to [479933ebd6].
1 2 3 4 5 | # encoding: UTF-8 # api: streamtuner2 # title: Liveradio.ie # description: Irish/worldwide radio station directory # url: http://liveradio.ie/ | | | < > | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | # encoding: UTF-8 # api: streamtuner2 # title: Liveradio.ie # description: Irish/worldwide radio station directory # url: http://liveradio.ie/ # version: 0.4 # type: channel # category: radio # config: - # png: # iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAABB0lEQVR4nLWTQUpDMRCGv0lregDBI3gAfW/hRrp8ZOMh5PUMXkFcu7EbTxHd # CC4EhfQkQg/QR5txYQqvMdVHwdnMZJj555uQwH+YurpaNZUOqTWl5i5qGIusDxIAZgBGuBhCsiOgrq7WUa+tkReAjepHystQgmn8zt0As40y # skYa4HwfSS5w2otd8svtWurqHyvnCZcXAHRRW7v8nANnq6bSPk0ucFQS+M3G2fkduMqLrJF5d3zSTnyYATsXmhO89WLfix8A1NWjvwhek5+m # praLGibPC8knFwnEh4U1ct9FvUvoLk0uPbjiCgCPyd+KD0/WyKX4EPcJFLG2/8EaMeLDoE91sH0B3ERWq2CKMoYAAAAASUVORK5CYII= # priority: standard # extraction-method: regex, action-handler # # LiveRadio.ie, based in Ireland, is a radio station directory. It provides # genre or country browsing (not in this plugin). Already lists over 5550 # stations (more unique selections). Also accepts user submissions. # # This channel loads their station logos as favicons. Even allows to utilize # the live search function. # # However, station URLs have to be fetched in a second page request. Such # the listings are unsuitable for exporting right away. OTOH the website is # pretty fast; so no delay there or in fetching complete categories. 
# import re from pq import pq from config import * from channels import * import ahttp import action # Categorized directory, secondary URL lookup class liveradio (ChannelPlugin): # control flags has_search = True listformat = "srv" audioformat = "audio/mpeg" titles = dict(listeners=False, bitrate=False, playing="Location") fixed_size = 30 img_resize = 32 # data store categories = ["Top 20"] catmap = {"Top 20":"top-20"} base = "http://www.liveradio.ie/" |
︙ | ︙ | |||
74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 | elif search: add = ahttp.get(self.base + "stations" + page_sfx, { "text": search, "country_id": "", "genre_id": ""}) html += add if re.search('/\d+">Next</a>', add): page += 1 else: break # Extract all the things # # · entries utilize HTML5 microdata classification # · title and genre available right away # · img url is embedded # · keep station ID as `urn:liveradion:12345` # r = [] ls = re.findall(""" itemtype="http://schema.org/RadioStation"> .*? href="(?:https?://www.liveradio.ie)?/stations/([\w-]+) .*? <img\s+src="/(files/images/[^"]+)" .*? ="country">([^<]+)< .*? itemprop="name"><a[^>]+>([^<]+)</a> .*? class="genre">([^<]+)< """, html, re.X|re.S) for row in ls: log.DATA(row) id, img, country, title, genre = row r.append(dict( homepage = self.base + "stations/" + id, url = "urn:liveradio:" + id, playing = unhtml(country), title = unhtml(title), genre = unhtml(genre), | > > > > > > > > > > | > > | > > > > > > > > > > > > > > > > > > > > > > > > > | 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 | elif search: add = ahttp.get(self.base + "stations" + page_sfx, { "text": search, "country_id": "", "genre_id": ""}) html += add if re.search('/\d+">Next</a>', add): page += 1 else: break html = re.sub("</body>[\s\S]+<body[^>]*>", "", html) # dom or regex if conf.pyquery: try: return self.pq_extract(html) except Exception as e: log.ERR(e) return self.rx_extract(html) # Extract all the things # # · entries utilize HTML5 microdata classification # · title and genre available right away # · img url is embedded # · keep station ID as `urn:liveradion:12345` # def rx_extract(self, html): r = [] ls = re.findall(""" 
itemtype="http://schema.org/RadioStation"> .*? href="(?:https?://www.liveradio.ie)?/stations/([\w-]+) .*? <img\s+src="/(files/images/[^"]+)" .*? ="country">([^<]+)< .*? itemprop="name"><a[^>]+>([^<]+)</a> .*? class="genre">([^<]+)< """, html, re.X|re.S) for row in ls: log.DATA(row) id, img, country, title, genre = row r.append(dict( homepage = self.base + "stations/" + id, url = "urn:liveradio:" + id, playing = unhtml(country), title = unhtml(title), genre = unhtml(genre), img = self.base + img )) return r # using DOM extraction and itemtype/itemprop= attributes """ <div class="list_item" itemscope itemtype="http://schema.org/RadioStation"> <a class="image_outer" href="http://www.liveradio.ie/stations/soulconnexion-radio"> <span class="image"><img src="/files/images/368787/resized/140x134c/soulconnexion_radio.jpg" alt="Soulconnexion Radio" itemprop="image" /></span> <span class="overlay"><!-- --></span> <span class="country">United Kingdom</span> </a> <div class="name" itemprop="name"><a href="http://www.liveradio.ie/stations/soulconnexion-radio">Soulconnexion Radio</a></div> <div class="genre">Funk, Soul</div> """ def pq_extract(self, html): r = [] for radio in pq(html).find("*[itemscope][itemtype='http://schema.org/RadioStation']"): log.DATA(radio) radio = pq(radio) href = radio.find("*[itemprop='name'] a").attr("href") id = re.search("/([\w-]+)$", href).group(1) r.append(dict( homepage = self.base + "stations/" + id, url = "urn:liveradio:" + id, playing = radio.find("*.country").text(), title = radio.find("*[itemprop='name']").text(), genre = radio.find("*.genre").text(), img = self.base + re.sub("^/", "", radio.find("img[itemprop='image']").attr("src")) )) return r # Update `url` on station data access (incurs a delay for playing or recording) # # · utilizes action.handler["urn:liveradio"] → urn_resolve hook |
︙ | ︙ |