Check-in [e3c98a4a73]
Overview
| Comment: | Updated DOM extraction for Internet-Radio.com (more crude than regex mode, as there's not much to anchor for.) |
|---|---|
| Downloads: | Tarball | ZIP archive | SQL archive |
| Timelines: | family | ancestors | descendants | both | trunk |
| Files: | files | file ages | folders |
| SHA1: |
e3c98a4a73a4fe183ebd5a468e150ae6 |
| User & Date: | mario on 2015-03-24 21:10:58 |
| Other Links: | manifest | tags |
Context
|
2015-03-24
| ||
| 21:11 | Omit help/html/* pages from distribution. check-in: e6d2486ae3 user: mario tags: trunk | |
| 21:10 | Updated DOM extraction for Internet-Radio.com (more crude than regex mode, as there's not much to anchor for.) check-in: e3c98a4a73 user: mario tags: trunk | |
| 21:10 | Recent logo Inkscape remake. check-in: f81ad7cdb5 user: mario tags: trunk | |
Changes
Modified channels/internet_radio.py from [583351e59d] to [411d4564a5].
1 2 3 4 5 6 | # # api: streamtuner2 # title: Internet-Radio.com # description: Broad list of webradios from all genres. # type: channel # category: radio | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 | # # api: streamtuner2 # title: Internet-Radio.com # description: Broad list of webradios from all genres. # type: channel # category: radio # version: 1.2 # priority: standard # # Internet-Radio.co.uk/.com is one of the largest directories of streams. # Available music genre classifications are mirrored verbatim and flatly. # # The new version of this plugin alternates between PyQuery and Regex # station extraction. Both overlook some paid or incomplete entries. |
| ︙ | ︙ | |||
106 107 108 109 110 111 112 |
except Exception as e:
__print__(dbg.ERR, e)
continue
# fin
return entries
| < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 |
except Exception as e:
__print__(dbg.ERR, e)
continue
# fin
return entries
# Regex extraction
def with_regex(self, html):
__print__(dbg.PROC, "internet-radio, regex")
r = []
html = "\n".join(html)
# Break up into <tr> blocks before extracting bits
rx_tr = re.compile("""<tr[^>]*>(.+?)</tr>""", re.S)
rx_data = re.compile(r"""
playjp',\s*'(https?://[^'">]+)
.*? <h4.*?>([^<>]+)</
.*? <b>([^<>]*)</b>
(?: .*? href="(.*?)" )?
(?: .*? Genres:((?:</?a[^>]+>|\w+|\s+)+) )?
.*? (\d+)\s*Listeners
.*? (\d+)\s*Kbps
""", re.S|re.X)
for div in rx_tr.findall(html):
|
| ︙ | ︙ | |||
197 198 199 200 201 202 203 |
def with_dom(self, html_list):
__print__(dbg.PROC, "internet-radio, dom")
rx_numbers = re.compile("(\d+)")
r = []
for html in html_list:
# the streams are arranged in table rows
doc = pq(html)
| | > | > > > > > > > > > > > | | | | | | | | 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 |
def with_dom(self, html_list):
    """Extract station entries from result pages via PyQuery DOM traversal.

    html_list -- iterable of HTML page sources; each one is parsed
                 separately and every <tr> in it is treated as one station.

    Returns a list of dicts with the keys "title", "homepage", "url",
    "genre", "listeners", "bitrate", "format" and "playing".  Rows without
    a recognizable stream URL are still returned, with "url" set to "".
    """
    __print__(dbg.PROC, "internet-radio, dom")
    rx_numbers = re.compile(r"(\d+)")
    # Accept http and https, same as the pattern used by with_regex();
    # the URL ends at the first quote or '>' of the onclick handler.
    rx_url = re.compile(r"""(https?://[^'">]+)""")
    r = []
    for html in html_list:
        # the streams are arranged in table rows
        doc = pq(html)
        for row in (pq(e) for e in doc("tr")):
            # bitrate/listeners: first two numbers found in the <p> text;
            # the " 0 0" padding guarantees both indexes exist
            bl = row.find("p").text()
            bl = rx_numbers.findall(str(bl) + " 0 0")
            # stream url is embedded in the onclick handler of the first <i>
            onclick = row.find("i").eq(0).attr("onclick")
            match = rx_url.search(onclick) if onclick else None
            url = match.group(0) if match else ""
            r.append({
                "title": row.find("h4").text(),
                "homepage": http.fix_url(row.find("a.small").attr("href")),
                "url": url,
                "genre": row.find("a[href^='/stations/']").text(),
                "listeners": int(bl[0]),
                "bitrate": int(bl[1]),
                "format": "audio/mpeg",
                "playing": row.find("b").text(),
            })
    return r
|