Index: channels/shoutcast.py ================================================================== --- channels/shoutcast.py +++ channels/shoutcast.py @@ -3,11 +3,11 @@ # title: Shoutcast.com # description: Primary list of shoutcast servers (now managed by radionomy). # type: channel # category: radio # priority: default -# version: 1.3 +# version: 1.4 # depends: pq, re, http # author: Mario # original: Jean-Yves Lefort # # Shoutcast is a server software for audio streaming. It automatically spools @@ -86,130 +86,112 @@ __print__( dbg.PROC, self.categories ) conf.save("cache/categories_shoutcast", self.categories) pass - - #def strip_tags(self, s): - # rx = re.compile(""">(\w+)<""") - # return " ".join(rx.findall(s)) - # downloads stream list from shoutcast for given category def update_streams(self, cat, search=""): if (not cat or cat == self.empty): __print__( dbg.ERR, "nocat" ) return [] - ucat = urllib.quote(cat) - - - # loop - entries = [] - next = 0 - max = int(conf.max_streams) - count = max - rx_stream = None - - try: - if (next < max): - - - #/radiolist.cfm?action=sub&string=&cat=Oldies&_cf_containerId=radiolist&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0 - #/radiolist.cfm?start=19&action=sub&string=&cat=Oldies&amount=18&order=listeners - # page - url = "http://www.shoutcast.com/radiolist.cfm?action=sub&string=&cat="+ucat+"&order=listeners&amount="+str(count) - referer = "http://www.shoutcast.com/?action=sub&cat="+ucat - params = {} - html = http.get(url, params=params, referer=referer, ajax=1) - - #__print__(dbg.DATA, html) - #__print__(re.compile("id=(\d+)").findall(html)); - - - # With the new shallow lists it doesn't make much sense to use - # the pyquery DOM traversal. There aren't any sensible selectors to - # extract values; it's just counting the tags. - - - # regular expressions (default) - if not conf.get("pyquery") or not pq: - - # new html - """ - - Play - Schlagerhoelle - das Paradies fr Schlager und Discofox - Oldies - 955 - 128 - MP3 - - """ - - # new extraction regex - if not rx_stream: - rx_stream = re.compile( - """ - ]+ href="http://yp.shoutcast.com/sbin/tunein-station.pls\? - id=(\d+)"> ([^<>]+) - \s+ ]+ >([^<>]+) - \s+ ]+ >(\d+) - \s+ ]+ >(\d+) - \s+ ]+ >(\w+) - """, - re.S|re.I|re.X - ) - - - # extract entries - self.parent.status("parsing document...") - __print__(dbg.PROC, "channels.shoutcast.update_streams: regex scraping mode") - - for m in rx_stream.findall(html): - #__print__(m) - (id, title, genre, listeners, bitrate, fmt) = m - entries += [{ - "id": id, - "url": "http://yp.shoutcast.com/sbin/tunein-station.pls?id=" + id, - "title": self.entity_decode(title), - #"homepage": http.fix_url(homepage), - #"playing": self.entity_decode(playing), - "genre": genre, - "listeners": int(listeners), - "max": 0, #int(uu[6]), - "bitrate": int(bitrate), - "format": self.mime_fmt(fmt), - }] - - - # PyQuery parsing - else: - # iterate over DOM - for div in (pq(e) for e in pq(html).find("tr")): - - entries.append({ - "title": div.find("a.transition").text(), - "url": div.find("a.transition").attr("href"), - "homepage": "", - "listeners": int(div.find("td:eq(3)").text()), - "bitrate": int(div.find("td:eq(4)").text()), - "format": self.mime_fmt(div.find("td:eq(5)").text()), - "max": 0, - "genre": cat, - }) - - - # display partial results (not strictly needed anymore, because we fetch just one page) - self.update_streams_partially_done(entries) - - # more pages to load? - next = 99999 - - except Exception as e: - __print__(dbg.ERR, e) - return entries - - #fin - #__print__(dbg.DATA, entries) + + #/radiolist.cfm?action=sub&string=&cat=Oldies&_cf_containerId=radiolist&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0 + #/radiolist.cfm?start=19&action=sub&string=&cat=Oldies&amount=18&order=listeners + # page + url = "http://www.shoutcast.com/radiolist.cfm" + params = { + "action": "sub", + "string": "", + "cat": cat, + "order": "listeners", + "amount": conf.max_streams, + } + referer = "http://www.shoutcast.com/?action=sub&cat="+cat + html = http.get(url, params=params, referer=referer, ajax=1) + + #__print__(dbg.DATA, html) + #__print__(re.compile("id=(\d+)").findall(html)); + # new html + """ + + Play + Schlagerhoelle - das Paradies fr Schlager und Discofox + Oldies + 955 + 128 + MP3 + + """ + + # With the new shallow lists it doesn't make much sense to use + # the pyquery DOM traversal. There aren't any sensible selectors to + # extract values; it's just counting the tags. + # And there's a bug in PyQuery 1.2.4 and CssSelector. So make two + # attempts, alternate between regex and DOM; user preference first. + use_regex = not conf.get("pyquery") or not pq + retry = 2 + while retry: + retry -= 1 + try: + if use_regex: + return self.with_regex(html) + else: + return self.with_dom(html) + except Exception as e: + use_regex ^= 1 + __print__(dbg.ERR, e) + return [] + + + # Extract using regex + def with_regex(self, html): + __print__(dbg.PROC, "channels.shoutcast.update_streams: regex scraping mode") + rx_stream = re.compile( + """ + ]+ href="http://yp.shoutcast.com/sbin/tunein-station.pls\? + id=(\d+)"> ([^<>]+) + \s+ ]+ >([^<>]+) + \s+ ]+ >(\d+) + \s+ ]+ >(\d+) + \s+ ]+ >(\w+) + """, + re.S|re.I|re.X + ) + # extract entries + entries = [] + for m in rx_stream.findall(html): + #__print__(m) + (id, title, genre, listeners, bitrate, fmt) = m + entries += [{ + "id": id, + "url": "http://yp.shoutcast.com/sbin/tunein-station.pls?id=" + id, + "title": self.entity_decode(title), + #"homepage": http.fix_url(homepage), + #"playing": self.entity_decode(playing), + "genre": genre, + "listeners": int(listeners), + "max": 0, #int(uu[6]), + "bitrate": int(bitrate), + "format": self.mime_fmt(fmt), + }] + return entries + + + # Iterate over DOM instead + def with_dom(self, html): + __print__(dbg.PROC, "channels.shoutcast.update_streams: attempt DOM/PyQuery processing") + entries = [] + for div in (pq(e) for e in pq(html).find("tr")): + entries.append({ + "title": div.find("a.transition").text(), + "url": div.find("a.transition").attr("href"), + "homepage": "", + "listeners": int(div.find("td:eq(3)").text()), + "bitrate": int(div.find("td:eq(4)").text()), + "format": self.mime_fmt(div.find("td:eq(5)").text()), + "max": 0, + "genre": cat, + }) return entries