Check-in [0bc6843caf]
Overview
Comment: | Internet-Radio plugin restructured to alternate between regex (partially working again) and pyquery extraction (only misses a few advertised stations). Pages are now retrieved in one batch, which makes fetching a bit faster. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
0bc6843cafb24ab8aa2183ad56ec874a |
User & Date: | mario on 2014-05-27 19:17:29 |
Other Links: | manifest | tags |
Context
2014-05-27
| ||
21:46 | More genre categories added. check-in: ea07946943 user: mario tags: trunk | |
19:17 | Internet-Radio plugin restructured to alternate between regex (partially working again) and pyquery extraction (only misses a few advertised stations). Pages are now retrieved in one batch, which makes fetching a bit faster. check-in: 0bc6843caf user: mario tags: trunk | |
15:06 | Update default configuration, rename internet_radio_org_uk to just internet_radio module check-in: 99d4249ef4 user: mario tags: trunk | |
Changes
Modified channels/internet_radio.py from [323154e4b9] to [1aa6aca44b].
1 2 3 4 5 6 | # # api: streamtuner2 # title: Internet-Radio.com # description: Broad list of webradios from all genres. # type: channel # category: radio | | > > | > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 | # # api: streamtuner2 # title: Internet-Radio.com # description: Broad list of webradios from all genres. # type: channel # category: radio # version: 1.1 # priority: standard # # Internet-Radio.co.uk/.com is one of the largest directories of streams. # Available music genre classifications are mirrored verbatim and flatly. # # The new version of this plugin alternates between PyQuery and Regex # station extraction. Both overlook some paid or incomplete entries. # HTTP retrieval happens in one batch, determined by the number of pages # setting, rather than the global max_streams option. # # # # # from channels import * |
︙ | ︙ | |||
# NOTE: everything below is the interior of the Internet-Radio channel plugin
# class.  The class header is elided from this excerpt, so the members are
# shown without the enclosing class indentation.  The names `http`, `conf`,
# `re`, `pq`, `dbg` and `__print__` are not defined in this block; presumably
# they come from the file's `from channels import *` -- confirm.

# plugin description
title = "InternetRadio"
module = "internet_radio"
homepage = "http://www.internet-radio.org.uk/"
listformat = "audio/x-scpls"

# settings
config = [
    {
        "name": "internetradio_max_pages",
        "type": "int",
        "value": 5,
        "category": "limit",
        "description": "How many pages to fetch and read.",
    },
]

# category map
categories = []
current = ""
default = ""


# load genres
def update_categories(self):
    """Fetch the homepage and store the genre names found in its <option> list.

    The result replaces `self.categories`; nothing is returned.
    """
    html = http.get(self.homepage)
    # Raw string: \w and \s are regex classes, not Python string escapes.
    rx = re.compile(r"""<option[^>]+value="/stations/[-+&.\w\s%]+/">([^<]+)</option>""")
    self.categories = rx.findall(html)


# fetch station lists
def update_streams(self, cat, force=0):
    """Download the station pages for genre `cat` and extract the entries.

    Up to `conf.internetradio_max_pages` pages (at least one) are fetched in
    one batch; fetching stops early when a page carries no link to the next
    page number.  The collected HTML is then handed to the regex and the
    PyQuery extractor in turn (order chosen by `conf.pyquery`), and the first
    non-empty result is used.

    Parameters:
        cat   -- genre name; must be one of `self.categories`.
        force -- accepted for interface compatibility, not used here.

    Returns a list of station dicts, or an empty list if `cat` is unknown or
    neither extractor found anything.
    """
    entries = []

    if cat not in self.categories:
        return []

    rx_pages = re.compile(r'href="/stations/[-+\w%\d\s]+/page(\d+)">\d+</a>')

    # Fetch multiple pages at once
    html = []
    max_pages = max(int(conf.internetradio_max_pages), 1)
    # range() excludes its stop value, so the upper bound has to be
    # max_pages + 1.  The previous range(1, max_pages) read one page fewer
    # than configured and fetched nothing at all for a setting of 1.
    for page in range(1, max_pages + 1):

        # Append HTML source (the first page has no "pageN" suffix)
        html.append(
            http.get(
                self.homepage + "stations/" +
                cat.lower().replace(" ", "%20") +
                "/" + ("page" + str(page) if page > 1 else "")
            )
        )

        # Is there a next page?
        if str(page + 1) not in rx_pages.findall(html[-1]):
            break
        self.parent.status(float(page) / float(max_pages + 1))

    # Alternatively try regex or pyquery parsing
    #__print__(dbg.HTTP, html)
    for use_rx in [not conf.pyquery, conf.pyquery]:
        try:
            entries = (self.with_regex(html) if use_rx else self.with_dom(html))
            if entries:
                break
        except Exception as e:
            # Deliberately broad: a failing extractor must not prevent the
            # other one from being tried.
            __print__(dbg.ERR, e)
            continue

    # fin
    return entries


# Sample markup kept for reference while maintaining the extractors below.
# The strings are never used at runtime.

# Advertised
"""
<tr valign="top" class="stream">
<td class="listing1" width="120" align="center">
<a onClick="return popitup('/player/?mount=http://uk2.internet-radio.com:31076/listen.pls&title=Box Uk Radio Danceradiouk&website=http://danceradiouk.com ')"
href="/player/?mount=http://uk2.internet-radio.com:31076/listen.pls&title=Box Uk Radio Danceradiouk&website=http://danceradiouk.com ">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-flash" alt="Flash Player"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - M3U', 'http://uk2.internet-radio.com:31076/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://uk2.internet-radio.com:31076/listen.pls&t=.m3u">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-wmp" alt="Windows Media Player"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - PLS', 'http://uk2.internet-radio.com:31076/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://uk2.internet-radio.com:31076/listen.pls&t=.pls">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-winamp" alt="Winamp">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-itunes" alt="iTunes"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - RAM', 'http://uk2.internet-radio.com:31076/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://uk2.internet-radio.com:31076/listen.pls&t=.ram">
<img src="/images/blank.gif" class="sprite sprite-realplayer" alt="Realplayer"></a><br>
<div style="margin-top: 10px;"><a href="/stations/80s/">80s</a> <a href="/stations/90s/">90s</a> 00s
<a href="/stations/rock/">Rock</a> <a href="/stations/disco/">Disco</a> <a href="/stations/pop/">Pop</a> </div></td>
<td class="listing2" ><img src="/images/icons/award_star_silver_1.png" alt="Featured" width="16" height="16">
<a href="/station/danceradioukchatbox/" style="font-weight:bold;">Box Uk Radio Danceradiouk</a>
<br>Bow Wow Wow - I Want Candy
<br><a onClick="_gaq.push(['_trackEvent','Link', 'Station Link', 'http://danceradiouk.com ']);" class="url"
href="http://danceradiouk.com " title="Box Uk Radio Danceradiouk" target="_blank">http://danceradiouk.com </a>
</td><td class="listing1" align="right" width="100">
128 Kbps<br>22 Listeners<br>
<img src="/images/blank.gif" class="sprite sprite-de" alt="Germany"><img src="/images/blank.gif" class="sprite sprite-cy" alt="Cyprus">
<img src="/images/blank.gif" class="sprite sprite-se" alt="Sweden"><img src="/images/blank.gif" class="sprite sprite-gb" alt="United Kingdom">
<img src="/images/blank.gif" class="sprite sprite-rw" alt="Rwanda"><img src="/images/blank.gif" class="sprite sprite-mx" alt="Mexico">
<img src="/images/blank.gif" class="sprite sprite-ru" alt="Russian Federation"><img src="/images/blank.gif" class="sprite sprite-si" alt="Slovenia">
<img src="/images/blank.gif" class="sprite sprite-ca" alt="Canada"><img src="/images/blank.gif" class="sprite sprite-tt" alt="Trinidad and Tobago">
<img src="/images/blank.gif" class="sprite sprite-ch" alt="Switzerland"><img src="/images/blank.gif" class="sprite sprite-hu" alt="Hungary">
<img src="/images/blank.gif" class="sprite sprite-lt" alt="Lithuania">
</td></tr>
"""

# Normal
"""
<tr valign="top" class="stream">
<td class="listing1" width="120" align="center">
<img style="margin-right: 6px;" src="/images/icons/blank.png" alt="Blank">
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - M3U', 'http://80.86.106.136:80/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://80.86.106.136:80/listen.pls&t=.m3u">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-wmp" alt="Windows Media Player"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - PLS', 'http://80.86.106.136:80/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://80.86.106.136:80/listen.pls&t=.pls">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-winamp" alt="Winamp">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-itunes" alt="iTunes"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - RAM', 'http://80.86.106.136:80/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://80.86.106.136:80/listen.pls&t=.ram">
<img src="/images/blank.gif" class="sprite sprite-realplayer" alt="Realplayer"></a>
<br><div style="margin-top: 10px;">Top 40 </div></td>
<td class="listing2" ><img src="/images/icons/award_star_bronze_1.png" alt="Recommended" width="16" height="16">
<a href="/station/kissfmromania/" style="font-weight:bold;">KissFM Romania - www.kissfm.ro</a>
---ALTERNATIVELY---
<span style="color: #c00;"><b> TDI Radio MP3 48kbps</b></span>
<br><a onClick="_gaq.push(['_trackEvent','Link', 'Station Link', 'http://www.kissfm.ro']);" class="url"
href="http://www.kissfm.ro" title="KissFM Romania - www.kissfm.ro" target="_blank">http://www.kissfm.ro</a>
</td><td class="listing1" align="right" width="100">
32 Kbps<br>5716 Listeners<br>
</td></tr>
"""

# Variation
"""
<td class="listing1" width="120" align="center">
<img style="margin-right: 6px;" src="/images/icons/blank.png" alt="Blank">
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - M3U', 'http://colostreaming.com:8092/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://colostreaming.com:8092/listen.pls&t=.m3u">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-wmp" alt="Windows Media Player"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - PLS', 'http://colostreaming.com:8092/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://colostreaming.com:8092/listen.pls&t=.pls">
<img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-winamp" alt="Winamp"><img style="margin-right: 6px;" src="/images/blank.gif" class="sprite sprite-itunes" alt="iTunes"></a>
<a onClick="_gaq.push(['_trackEvent', 'TuneIn', 'Play - RAM', 'http://colostreaming.com:8092/listen.pls']);"
href="http://servers.internet-radio.com/tools/playlistgenerator/?u=http://colostreaming.com:8092/listen.pls&t=.ram"><img src="/images/blank.gif" class="sprite sprite-realplayer" alt="Realplayer"></a>
<br><div style="margin-top: 10px;">Poprock <a href="/stations/dance/">Dance</a> 50s Various </div></td>
<td class="listing2" ><img src="/images/icons/award_star_bronze_1.png" alt="Recommended" width="16" height="16">
<span style="color: #c00;"><b> Jack and Jill Radio Pop Rock Dance 50s Big Band Classical Country Folk Jazz Blue</b></span>
<br>Vince Gill - When Love Finds You - (Album)When Love Finds You - 1994 Countr
<br><a onClick="_gaq.push(['_trackEvent','Link', 'Station Link', 'http://www.jackandjillradio.com']);" class="url"
href="http://www.jackandjillradio.com" title="Jack and Jill Radio Pop Rock Dance 50s Big Band Classical Country Folk Jazz Blues Its All Here!" target="_blank">http://www.jackandjillradio.com</a>
</td><td class="listing1" align="right" width="100">
24 Kbps<br></td>
"""


# Regex extraction
def with_regex(self, html):
    """Extract station entries from a list of HTML pages using regexes.

    The pages are joined, cut into <tr>...</tr> blocks, and each block is
    matched against one verbose pattern.  Blocks that do not match are
    reported via `__print__(dbg.ERR, ...)` and skipped.

    Returns a list of dicts with the keys url, genre, homepage, title,
    playing, bitrate, listeners and format.
    """
    __print__(dbg.PROC, "internet-radio, regex")
    r = []
    html = "\n".join(html)

    # Break up into <tr> blocks before extracting bits
    rx_tr = re.compile(r"""<tr[^>]*>(.+?)</tr>""", re.S)
    # Verbose pattern: whitespace outside character classes is ignored.
    # Groups, in order: url, genres, title, playing, homepage, bitrate,
    # listeners.
    rx_data = re.compile(r"""
        \?u=(https?://[^'">]+/listen\.pls)
        .*?
        <div[^>]+10px[^>]+>(.+?)</div>
        .*?
        (?:href="/station/[^>]+|<b)>\s*([^<>]+)\s*</[ab]>
        .*?
        (?:<br>\s*([^<>]+)\s*<br>)+?
        .*?
        <a[^>]+class="url"[^>]+href="([^<">]+)"
        .*?
        (?:(\d+)\s+Kbps \s*<br>\s*)+?
        (?:(\d+)\s+Listeners \s*<br>\s*)+?
    """, re.S | re.X)

    for div in rx_tr.findall(html):
        #__print__(dbg.DATA, len(div))
        uu = rx_data.search(div)
        if uu:
            (url, genres, title, playing, homepage, bitrate, listeners) = uu.groups()

            # transform data
            r.append({
                "url": url,
                "genre": self.strip_tags(genres),
                "homepage": http.fix_url(homepage),
                "title": title.strip(),
                "playing": playing.strip(),
                "bitrate": int(bitrate if bitrate else 0),
                "listeners": int(listeners if listeners else 0),
                "format": "audio/mpeg",  # there is no stream info on that, but internet-radio.org.uk doesn't seem very ogg-friendly anyway, so we assume the default here
            })
        else:
            __print__(dbg.ERR, "rx missed", div)

    return r


# DOM traversing
def with_dom(self, html_list):
    """Extract station entries from a list of HTML pages using PyQuery.

    Every `tr.stream` row of every page becomes one dict with the keys
    title, homepage, url, genre, bitrate, listeners, format and playing.
    """
    __print__(dbg.PROC, "internet-radio, dom")
    rx_numbers = re.compile(r"(\d+)")
    r = []
    for html in html_list:
        # the streams are arranged in table rows
        doc = pq(html)
        # `row` instead of the former `dir`, which shadowed the builtin
        for row in (pq(e) for e in doc("tr.stream")):

            # Numbers found in the right-aligned cell; " 0 0" is appended so
            # that indexes 0 and 1 exist even when the cell has fewer numbers.
            bl = row.find("td[align=right]").text()
            bl = rx_numbers.findall(str(bl) + " 0 0")

            # "playing" must stay last in this literal: it removes the child
            # elements of the second cell, which the lookups above it still
            # need to see.
            r.append({
                "title": row.find("b").text(),
                "homepage": http.fix_url(row.find("a.url").attr("href")),
                "url": row.find("a").eq(2).attr("href"),
                "genre": row.find("td").eq(0).text(),
                "bitrate": int(bl[0]),
                "listeners": int(bl[1]),
                "format": "audio/mpeg",
                # NOTE(review): the [13:] slice drops a fixed-length prefix of
                # the remaining cell text -- its meaning is not visible here.
                "playing": row.find("td").eq(1).children().remove().end().text()[13:].strip(),
            })
    return r