Index: channels/shoutcast.py
==================================================================
--- channels/shoutcast.py
+++ channels/shoutcast.py
@@ -3,11 +3,11 @@
# title: Shoutcast.com
# description: Primary list of shoutcast servers (now managed by radionomy).
# type: channel
# category: radio
# priority: default
-# version: 1.3
+# version: 1.4
# depends: pq, re, http
# author: Mario
# original: Jean-Yves Lefort
#
# Shoutcast is a server software for audio streaming. It automatically spools
@@ -86,130 +86,112 @@
__print__( dbg.PROC, self.categories )
conf.save("cache/categories_shoutcast", self.categories)
pass
-
- #def strip_tags(self, s):
- # rx = re.compile(""">(\w+)<""")
- # return " ".join(rx.findall(s))
-
# downloads stream list from shoutcast for given category
def update_streams(self, cat, search=""):
if (not cat or cat == self.empty):
__print__( dbg.ERR, "nocat" )
return []
- ucat = urllib.quote(cat)
-
-
- # loop
- entries = []
- next = 0
- max = int(conf.max_streams)
- count = max
- rx_stream = None
-
- try:
- if (next < max):
-
-
- #/radiolist.cfm?action=sub&string=&cat=Oldies&_cf_containerId=radiolist&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0
- #/radiolist.cfm?start=19&action=sub&string=&cat=Oldies&amount=18&order=listeners
- # page
- url = "http://www.shoutcast.com/radiolist.cfm?action=sub&string=&cat="+ucat+"&order=listeners&amount="+str(count)
- referer = "http://www.shoutcast.com/?action=sub&cat="+ucat
- params = {}
- html = http.get(url, params=params, referer=referer, ajax=1)
-
- #__print__(dbg.DATA, html)
- #__print__(re.compile("id=(\d+)").findall(html));
-
-
- # With the new shallow lists it doesn't make much sense to use
- # the pyquery DOM traversal. There aren't any sensible selectors to
- # extract values; it's just counting the tags.
-
-
- # regular expressions (default)
- if not conf.get("pyquery") or not pq:
-
- # new html
- """
- |
- |
- Schlagerhoelle - das Paradies fr Schlager und Discofox |
- Oldies |
- 955 |
- 128 |
- MP3 |
-
- """
-
- # new extraction regex
- if not rx_stream:
- rx_stream = re.compile(
- """
- ]+ href="http://yp.shoutcast.com/sbin/tunein-station.pls\?
- id=(\d+)"> ([^<>]+)
- \s+ ]+ >([^<>]+) |
- \s+ ]+ >(\d+) |
- \s+ ]+ >(\d+) |
- \s+ ]+ >(\w+) |
- """,
- re.S|re.I|re.X
- )
-
-
- # extract entries
- self.parent.status("parsing document...")
- __print__(dbg.PROC, "channels.shoutcast.update_streams: regex scraping mode")
-
- for m in rx_stream.findall(html):
- #__print__(m)
- (id, title, genre, listeners, bitrate, fmt) = m
- entries += [{
- "id": id,
- "url": "http://yp.shoutcast.com/sbin/tunein-station.pls?id=" + id,
- "title": self.entity_decode(title),
- #"homepage": http.fix_url(homepage),
- #"playing": self.entity_decode(playing),
- "genre": genre,
- "listeners": int(listeners),
- "max": 0, #int(uu[6]),
- "bitrate": int(bitrate),
- "format": self.mime_fmt(fmt),
- }]
-
-
- # PyQuery parsing
- else:
- # iterate over DOM
- for div in (pq(e) for e in pq(html).find("tr")):
-
- entries.append({
- "title": div.find("a.transition").text(),
- "url": div.find("a.transition").attr("href"),
- "homepage": "",
- "listeners": int(div.find("td:eq(3)").text()),
- "bitrate": int(div.find("td:eq(4)").text()),
- "format": self.mime_fmt(div.find("td:eq(5)").text()),
- "max": 0,
- "genre": cat,
- })
-
-
- # display partial results (not strictly needed anymore, because we fetch just one page)
- self.update_streams_partially_done(entries)
-
- # more pages to load?
- next = 99999
-
- except Exception as e:
- __print__(dbg.ERR, e)
- return entries
-
- #fin
- #__print__(dbg.DATA, entries)
+
+        # Fetches one result page for `cat` and returns the extracted list of
+        # stream dicts, or [] if both extraction attempts raise.
+        #
+        # Example request URLs:
+        #/radiolist.cfm?action=sub&string=&cat=Oldies&_cf_containerId=radiolist&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0
+        #/radiolist.cfm?start=19&action=sub&string=&cat=Oldies&amount=18&order=listeners
+
+        url = "http://www.shoutcast.com/radiolist.cfm"
+        params = {
+            "action": "sub",
+            "string": "",
+            "cat": cat,
+            "order": "listeners",
+            "amount": conf.max_streams,
+        }
+        # the referer is assembled by hand here, so URL-quote the category
+        referer = "http://www.shoutcast.com/?action=sub&cat=" + urllib.quote(cat)
+        html = http.get(url, params=params, referer=referer, ajax=1)
+
+        # Each result row lists title, genre, listeners, bitrate and format, e.g.
+        #   Schlagerhoelle - das Paradies fr Schlager und Discofox | Oldies | 955 | 128 | MP3
+
+        # With the new shallow lists it doesn't make much sense to use
+        # the pyquery DOM traversal. There aren't any sensible selectors to
+        # extract values; it's just counting the tags.
+        # And there's a bug in PyQuery 1.2.4 and CssSelector. So make two
+        # attempts, alternate between regex and DOM; user preference first.
+        extractors = [self.with_regex, self.with_dom]
+        if conf.get("pyquery") and pq:
+            extractors.reverse()
+
+        for extract in extractors:
+            try:
+                entries = extract(html)
+            except Exception as e:
+                # log and fall through to the other extractor
+                __print__(dbg.ERR, e)
+                continue
+            # rows that carry no genre of their own belong to the requested category
+            for row in entries:
+                if not row.get("genre"):
+                    row["genre"] = cat
+            return entries
+
+        # neither extractor succeeded
+        return []
+
+
+    # Extract using regex: scans the fetched HTML for station rows and returns a list of stream dicts.
+    def with_regex(self, html):
+        __print__(dbg.PROC, "channels.shoutcast.update_streams: regex scraping mode")
+        # Verbose pattern (re.X): whitespace inside it is ignored, so every tag is
+        # spelled out; a bare "|" here would act as alternation, not as cell text.
+        rx_stream = re.compile(
+            r"""
+            <a [^>]+ href="http://yp.shoutcast.com/sbin/tunein-station.pls\?
+                     id=(\d+)"> ([^<>]+) </a> \s* </td>
+            \s+ <td [^>]+ >([^<>]+) </td>
+            \s+ <td [^>]+ >(\d+) </td>
+            \s+ <td [^>]+ >(\d+) </td>
+            \s+ <td [^>]+ >(\w+) </td>
+            """,
+            re.S|re.I|re.X
+        )
+        # groups per row: station id, title, genre, listeners, bitrate, format
+        entries = []
+        for m in rx_stream.findall(html):
+            (station_id, title, genre, listeners, bitrate, fmt) = m
+            entries.append({
+                "id": station_id,
+                "url": "http://yp.shoutcast.com/sbin/tunein-station.pls?id=" + station_id,
+                "title": self.entity_decode(title),
+                # homepage and playing are not captured by the pattern above
+                "genre": genre,
+                "listeners": int(listeners),
+                "max": 0,  # no maximum-listener figure is captured either
+                "bitrate": int(bitrate),
+                "format": self.mime_fmt(fmt),
+            })
+        return entries
+
+
+    # Iterate over DOM instead: one stream dict per <tr>; optional `cat` is stored as each row's genre.
+    def with_dom(self, html, cat=""):
+        __print__(dbg.PROC, "channels.shoutcast.update_streams: attempt DOM/PyQuery processing")
+        entries = []
+        for div in (pq(e) for e in pq(html).find("tr")):
+            entries.append({
+                "title": div.find("a.transition").text(),
+                "url": div.find("a.transition").attr("href"),
+                "homepage": "",
+                "listeners": int(div.find("td:eq(3)").text()),
+                "bitrate": int(div.find("td:eq(4)").text()),
+                "format": self.mime_fmt(div.find("td:eq(5)").text()),
+                "max": 0,
+                "genre": cat,  # genre is not read from the row; it comes from the parameter
+            })
         return entries