Check-in [696a0ab060]
Overview
Comment: | Retry regex after PyQuery extraction mode (or other way round). |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
696a0ab060625b386e6b4b0cbca905a3 |
User & Date: | mario on 2014-05-26 19:59:29 |
Other Links: | manifest | tags |
Context
2014-05-26
| ||
20:21 | Shoutcast: retry regex/dom really alternatively now (not just on exceptions, but also empty result sets) check-in: 61a51c29f9 user: mario tags: trunk | |
19:59 | Retry regex after PyQuery extraction mode (or other way round). check-in: 696a0ab060 user: mario tags: trunk | |
16:32 | Comment out dbg.DATA prints, add some statusbar updating calls. check-in: fd4a1b208c user: mario tags: trunk | |
Changes
Modified channels/shoutcast.py from [2a36c113c8] to [ab1cca6f24].
1 2 3 4 5 6 7 | # # api: streamtuner2 # title: Shoutcast.com # description: Primary list of shoutcast servers (now managed by radionomy). # type: channel # category: radio # priority: default | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | # # api: streamtuner2 # title: Shoutcast.com # description: Primary list of shoutcast servers (now managed by radionomy). # type: channel # category: radio # priority: default # version: 1.4 # depends: pq, re, http # author: Mario # original: Jean-Yves Lefort # # Shoutcast is a server software for audio streaming. It automatically spools # station information on shoutcast.com, which this plugin can read out. # |
︙ | ︙ | |||
# Diff context: unchanged tail of the preceding method (file lines 84-90),
# kept here for reference only.
#     # it's done
#     __print__( dbg.PROC, self.categories )
#     conf.save("cache/categories_shoutcast", self.categories)
#     pass
#
# The three definitions below are methods (they take `self`); the enclosing
# class header is not part of this excerpt.


# downloads stream list from shoutcast for given category
def update_streams(self, cat, search=""):
    """Fetch the station list for category `cat` from shoutcast.com.

    The HTML fragment is downloaded once and then handed to one of two
    extractors (regex or PyQuery/DOM).  The user-preferred extractor is
    tried first; if it raises or yields nothing, the other one is tried.

    Parameters:
        cat    -- category name; an empty value or `self.empty` aborts early.
        search -- accepted for interface compatibility; not used here.

    Returns a list of station dicts, or [] if no category was given or
    neither extractor produced any entries.
    """
    if not cat or cat == self.empty:
        __print__( dbg.ERR, "nocat" )
        return []

    #/radiolist.cfm?action=sub&string=&cat=Oldies&_cf_containerId=radiolist&_cf_nodebug=true&_cf_nocache=true&_cf_rc=0
    #/radiolist.cfm?start=19&action=sub&string=&cat=Oldies&amount=18&order=listeners

    # page
    url = "http://www.shoutcast.com/radiolist.cfm"
    params = {
        "action": "sub",
        "string": "",
        "cat": cat,
        "order": "listeners",
        "amount": conf.max_streams,
    }
    referer = "http://www.shoutcast.com/?action=sub&cat="+cat
    html = http.get(url, params=params, referer=referer, ajax=1)

    #__print__(dbg.DATA, html)
    #__print__(re.compile("id=(\d+)").findall(html));

    # new html (sample of one result row):
    #
    # <tr>
    #   <td width="6%"><a href="#" onClick="window.open('player/?radname=Schlagerhoelle%20%2D%20das%20Paradies%20fr%20Schlager%20%20und%20Discofox&stationid=14687&coding=MP3','radplayer','height=232,width=776')"><img class="icon transition" src="/img/icon-play.png"
    #   alt="Play"></a></td>
    #   <td width="30%"><a class="transition" href="http://yp.shoutcast.com/sbin/tunein-station.pls?id=14687">Schlagerhoelle - das Paradies fr Schlager und Discofox</a></td>
    #   <td width="12%" style="text-align:left;" width="10%">Oldies</td>
    #   <td width="12%" style="text-align:left;" width="10%">955</td>
    #   <td width="12%" style="text-align:left;" width="10%">128</td>
    #   <td width="12%" style="text-align:left;" width="10%">MP3</td>
    # </tr>

    # With the new shallow <td> lists it doesn't make much sense to use
    # the pyquery DOM traversal. There aren't any sensible selectors to
    # extract values; it's just counting the tags.
    # And there's a bug in PyQuery 1.2.4 and CssSelector. So make two
    # attempts, alternate between regex and DOM; user preference first.
    use_regex = not conf.get("pyquery") or not pq
    for _attempt in range(2):
        if not use_regex and not pq:
            # DOM mode needs PyQuery; nothing left to try without it.
            break
        try:
            if use_regex:
                entries = self.with_regex(html)
            else:
                entries = self.with_dom(html, cat)
        except Exception as e:
            # Best-effort scraping: log the failure and fall through to
            # the alternative extractor instead of aborting the update.
            __print__(dbg.ERR, e)
            entries = []
        if entries:
            return entries
        # Exception or empty result: switch to the other extraction mode.
        use_regex = not use_regex
    return []


# Extract using regex
def with_regex(self, html):
    """Scrape station rows out of `html` with a verbose regular expression.

    Each match yields station id, title, genre, listener count, bitrate
    and format from consecutive <td> cells.  Returns a list of station
    dicts (empty if nothing matched).
    """
    __print__(dbg.PROC, "channels.shoutcast.update_streams: regex scraping mode")
    # Raw string so that \? \d \s \w reach the regex engine unchanged;
    # with re.X the whitespace inside the pattern is ignored.
    rx_stream = re.compile(
        r"""
        <a [^>]+ href="http://yp\.shoutcast\.com/sbin/tunein-station\.pls\?
                 id=(\d+)">
           ([^<>]+)
        </a> </td>
        \s+ <td [^>]+ >([^<>]+)</td>
        \s+ <td [^>]+ >(\d+)</td>
        \s+ <td [^>]+ >(\d+)</td>
        \s+ <td [^>]+ >(\w+)</td>
        """,
        re.S|re.I|re.X
    )

    # extract entries
    entries = []
    for (station_id, title, genre, listeners, bitrate, fmt) in rx_stream.findall(html):
        entries.append({
            "id": station_id,
            "url": "http://yp.shoutcast.com/sbin/tunein-station.pls?id=" + station_id,
            "title": self.entity_decode(title),
            #"homepage": http.fix_url(homepage),
            #"playing": self.entity_decode(playing),
            "genre": genre,
            "listeners": int(listeners),
            "max": 0, #int(uu[6]),
            "bitrate": int(bitrate),
            "format": self.mime_fmt(fmt),
        })
    return entries


# Iterate over DOM instead
def with_dom(self, html, cat=""):
    """Scrape station rows out of `html` by walking <tr> elements with PyQuery.

    Parameters:
        html -- the downloaded result table markup.
        cat  -- category name stored as each entry's "genre".  Previously
                this method read an undefined name `cat`, which raised
                NameError on the first row; it is now an optional argument.

    Returns a list of station dicts.  A row whose listener or bitrate cell
    is not an integer raises from int(); the caller treats that as a
    failed extraction attempt.
    """
    __print__(dbg.PROC, "channels.shoutcast.update_streams: attempt DOM/PyQuery processing")
    entries = []
    for row in (pq(e) for e in pq(html).find("tr")):
        link = row.find("a.transition")
        entries.append({
            "title": link.text(),
            "url": link.attr("href"),
            "homepage": "",
            "listeners": int(row.find("td:eq(3)").text()),
            "bitrate": int(row.find("td:eq(4)").text()),
            "format": self.mime_fmt(row.find("td:eq(5)").text()),
            "max": 0,
            "genre": cat,
        })
    return entries