Index: channels/internet_radio.py ================================================================== --- channels/internet_radio.py +++ channels/internet_radio.py @@ -2,15 +2,22 @@ # api: streamtuner2 # title: Internet-Radio.com # description: Broad list of webradios from all genres. # type: channel # category: radio -# version: 0.2 +# version: 1.1 # priority: standard # +# Internet-Radio.co.uk/.com is one of the largest directories of streams. +# Available music genre classifications are mirrored verbatim and flatly. +# +# The new version of this plugin alternates between PyQuery and Regex +# station extraction. Both overlook some paid or incomplete entries. +# HTTP retrieval happens in one batch, determined by the number of pages +# setting, rather than the global max_streams option. # -# Might become new main plugin +# # # # @@ -34,11 +41,17 @@ homepage = "http://www.internet-radio.org.uk/" listformat = "audio/x-scpls" # settings config = [ - {"name":"internetradio_max_pages", "type":"int", "value":5, "description":"How many pages to fetch and read."}, + { + "name": "internetradio_max_pages", + "type": "int", + "value": 5, + "category": "limit", + "description": "How many pages to fetch and read.", + }, ] # category map categories = [] @@ -53,99 +66,199 @@ rx = re.compile("""]+value="/stations/[-+&.\w\s%]+/">([^<]+)""") self.categories = rx.findall(html) - - - # fetch station lists def update_streams(self, cat, force=0): entries = [] if cat not in self.categories: return [] - - # regex - #rx_div = re.compile('(.+?)', re.S) - rx_data = re.compile(""" - (?:M3U|PLS)',\s*'(http://[^']+)' - .*? -

([^\n]*?) - .*? - (?:href="(http://[^"]+)"[^>]+target="_blank"[^>]*)? - >\s* - \s*(\w[^<]+)[<\n] - .*? - playing\s*:\s*([^<\n]+) - .*? - (\d+)\s*Kbps - (?:
(\d+)\s*Listeners)? - """, re.S|re.X) - #rx_homepage = re.compile('href="(http://[^"]+)"[^>]+target="_blank"') - rx_pages = re.compile('href="/stations/[-+\w%\d\s]+/page(\d+)">\d+') - rx_numbers = re.compile("(\d+)") - - - # multiple pages - max = max(int(conf.internetradio_max_pages), 1) - page = 1 - while page <= max: - - # fetch - html = http.get(self.homepage + "stations/" + cat.lower().replace(" ", "%20") + "/" + ("page"+str(page) if page>1 else "")) - - - # regex parsing? - if not conf.pyquery: - # step through - for uu in rx_data.findall(html): - (url, genre, homepage, title, playing, bitrate, listeners) = uu - - # transform data - entries.append({ - "url": url, - "genre": self.strip_tags(genre), - "homepage": http.fix_url(homepage), - "title": title, - "playing": playing, - "bitrate": int(bitrate), - "listeners": int(listeners if listeners else 0), - "format": "audio/mpeg", # there is no stream info on that, but internet-radio.org.uk doesn't seem very ogg-friendly anyway, so we assume the default here - }) - - # DOM parsing - else: - # the streams are arranged in table rows - doc = pq(html) - for dir in (pq(e) for e in doc("tr.stream")): - - bl = dir.find("td[align=right]").text() - bl = rx_numbers.findall(str(bl) + " 0 0") - - entries.append({ - "title": dir.find("b").text(), - "homepage": http.fix_url(dir.find("a.url").attr("href")), - "url": dir.find("a").eq(2).attr("href"), - "genre": dir.find("td").eq(0).text(), - "bitrate": int(bl[0]), - "listeners": int(bl[1]), - "format": "audio/mpeg", - "playing": dir.find("td").eq(1).children().remove().end().text()[13:].strip(), - }) - - # next page? - if str(page+1) not in rx_pages.findall(html): - max = 0 - else: - page = page + 1 - - # keep listview updated while searching - self.update_streams_partially_done(entries) - try: self.parent.status(float(page)/float(max)) - except: """there was a div by zero bug report despite max=1 precautions""" + + rx_pages = re.compile('href="/stations/[-+\w%\d\s]+/page(\d+)">\d+') + + # Fetch multiple pages at once + html = [] + max_pages = max(int(conf.internetradio_max_pages), 1) + for page in range(1, max_pages): + + # Append HTML source + html.append( + http.get( + self.homepage + "stations/" + + cat.lower().replace(" ", "%20") + + "/" + ("page"+str(page) if page>1 else "") + ) + ) + + # Is there a next page? + if str(page+1) not in rx_pages.findall(html[-1]): + break + self.parent.status(float(page)/float(max_pages+1)) + + # Alternatively try regex or pyquery parsing + #__print__(dbg.HTTP, html) + for use_rx in [not conf.pyquery, conf.pyquery]: + try: + entries = (self.with_regex(html) if use_rx else self.with_dom(html)) + if len(entries): + break + except Exception as e: + __print__(dbg.ERR, e) + continue # fin - self.parent.status() return entries + + # Advertised + """ + + + + Flash Player + + Windows Media Player + + Winamp + iTunes + + Realplayer
+
80s 90s 00s Rock Disco Pop
+ Featured + Box Uk Radio Danceradiouk +
Bow Wow Wow - I Want Candy +
http://danceradiouk.com + + 128 Kbps
22 Listeners
+ GermanyCyprus + SwedenUnited Kingdom + RwandaMexico + Russian FederationSlovenia + CanadaTrinidad and Tobago + SwitzerlandHungary + Lithuania + + """ + # Normal + """ + + + Blank + + Windows Media Player + + Winamp + iTunes + + Realplayer +
Top 40
+ Recommended + KissFM Romania - www.kissfm.ro + ---ALTERNATIVELY--- TDI Radio MP3 48kbps +
http://www.kissfm.ro + + 32 Kbps
5716 Listeners
+ + """ + # Variation + """ + + Blank + + Windows Media Player + + WinampiTunes + Realplayer +
Poprock Dance 50s Various
+ Recommended + Jack and Jill Radio Pop Rock Dance 50s Big Band Classical Country Folk Jazz Blue +
Vince Gill - When Love Finds You - (Album)When Love Finds You - 1994 Countr +
http://www.jackandjillradio.com + + 24 Kbps
+ """ + + # Regex extraction + def with_regex(self, html): + __print__(dbg.PROC, "internet-radio, regex") + r = [] + html = "\n".join(html) + + # Break up into blocks before extracting bits + rx_tr = re.compile("""]*>(.+?)""", re.S) + rx_data = re.compile(r""" + \?u=(https?://[^'">]+/listen\.pls) + .*? + ]+10px[^>]+>(.+?) + .*? + (?:href="/station/[^>]+|\s*([^<>]+)\s* + .*? + (?:
\s*([^<>]+)\s*
)+? + .*? + ]+class="url"[^>]+href="([^<">]+)" + .*? + (?:(\d+)\s+Kbps \s*
\s*)+? + (?:(\d+)\s+Listeners \s*
\s*)+? + """, re.S|re.X) + + for div in rx_tr.findall(html): + #__print__(dbg.DATA, len(div)) + uu = rx_data.search(div) + if uu: + (url, genres, title, playing, homepage, bitrate, listeners) = uu.groups() + + # transform data + r.append({ + "url": url, + "genre": self.strip_tags(genres), + "homepage": http.fix_url(homepage), + "title": title.strip(), + "playing": playing.strip(), + "bitrate": int(bitrate if bitrate else 0), + "listeners": int(listeners if listeners else 0), + "format": "audio/mpeg", # there is no stream info on that, but internet-radio.org.uk doesn't seem very ogg-friendly anyway, so we assume the default here + }) + else: + __print__(dbg.ERR, "rx missed", div) + return r + + + # DOM traversing + def with_dom(self, html_list): + __print__(dbg.PROC, "internet-radio, dom") + rx_numbers = re.compile("(\d+)") + r = [] + for html in html_list: + # the streams are arranged in table rows + doc = pq(html) + for dir in (pq(e) for e in doc("tr.stream")): + + bl = dir.find("td[align=right]").text() + bl = rx_numbers.findall(str(bl) + " 0 0") + + r.append({ + "title": dir.find("b").text(), + "homepage": http.fix_url(dir.find("a.url").attr("href")), + "url": dir.find("a").eq(2).attr("href"), + "genre": dir.find("td").eq(0).text(), + "bitrate": int(bl[0]), + "listeners": int(bl[1]), + "format": "audio/mpeg", + "playing": dir.find("td").eq(1).children().remove().end().text()[13:].strip(), + }) + return r +