@@ -53,14 +53,15 @@
    "video/x-ms-asf": "asx",
    "application/xspf+xml": "xspf",
    "*/*": "href",  # "href" for unknown responses
    "url/direct": "srv",
    "url/youtube": "href",
    "url/http": "href",
    "audio/x-pn-realaudio": "ram",
    "application/json": "json",
    "application/smil": "smil",
    "application/vnd.ms-wpl": "smil",
    "audio/x-ms-wax": "asx",
    "video/x-ms-asf": "asx",
    "x-urn/st2-script": "script",  # unused
    "application/x-shockwave-flash": "href",  # fallback
}
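The hunk above ends the mime-type to list-format map. As a minimal illustration (not part of the changeset; the dict name mime_fmt and the helper guess_playlist_fmt are placeholders invented for this sketch), this is roughly how such a map gets consulted, with the "*/*" entry acting as the fallback for unknown Content-Type responses:

# Illustrative sketch only - mime_fmt stands in for the real mapping above.
mime_fmt = {
    "audio/x-scpls": "pls",
    "video/x-ms-asf": "asx",
    "application/xspf+xml": "xspf",
    "*/*": "href",   # catch-all for unknown responses
}

def guess_playlist_fmt(content_type):
    # Strip parameters such as "; charset=utf-8", normalize case,
    # then fall back to the "*/*" entry.
    base = content_type.split(";")[0].strip().lower()
    return mime_fmt.get(base, mime_fmt["*/*"])

print(guess_playlist_fmt("audio/x-scpls; charset=ISO-8859-1"))  # -> "pls"
print(guess_playlist_fmt("text/plain"))                         # -> "href"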
@@ -89,15 +90,15 @@
    srv = "(%srv | %d | %s) \\b",
)

# Playlist format content probing (assert type)
playlist_content_map = [
    ("pls", r""" (?i)\[playlist\].*NumberOfEntries """),
    ("xspf", r""" <\?xml .* <playlist .* ((?i)http://xspf\.org)/ns/0/ """),
-   ("m3u", r""" ^ \s* #(EXT)?M3U """),
+   ("m3u", r""" ^ \s* \#(EXT)?M3U """),
    ("asx" , r""" <asx\b """),
    ("smil", r""" <smil[^>]*> .* <seq> """),
    ("html", r""" (?i)<(audio|video)\b[^>]+\bsrc\s*=\s*["']?https?:// """),
    ("wpl", r""" <\?wpl \s+ version="1\.0" \s* \?> """),
    ("b4s", r""" <WinampXML> """),  # http://gonze.com/playlists/playlist-format-survey.html
    ("jspf", r""" ^ \s* \{ \s* "playlist": \s* \{ """),
    ("asf", r""" ^ \[Reference\] .*? ^Ref\d+= """),
@@ -230,18 +232,18 @@
    for probe,rx in playlist_content_map:
        if re.search(rx, cnt, re.X|re.S):
            probe = listfmt(probe)
            break  # with `probe` set

    # Check ambiguity (except pseudo extension)
    if len(set([source, mime, probe])) > 1:
-       debug(dbg.ERR, "Possible playlist format mismatch:", (source, mime, probe, ext))
+       debug(dbg.ERR, "Possible playlist format mismatch:", "listformat={}, http_mime={}, rx_probe={}, ext={}".format(source, mime, probe, ext))

    # Extract URLs from content
-   for fmt in ["pls", "xspf", "asx", "smil", "jspf", "m3u", "json", "asf", "jamj", "raw"]:
+   for fmt in [id[0] for id in extract_playlist.extr_urls]:
        if not urls and fmt in (source, mime, probe, ext, "raw"):
            urls = extract_playlist(cnt).format(fmt)
            debug(dbg.DATA, "conversion from:", source, " with extractor:", fmt, "got URLs=", urls)

    # Return original, or asis for srv targets
    if not urls:
        return [url]
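The rewritten loop header stops repeating the extractor names and derives them from extract_playlist.extr_urls itself, so adding an extractor no longer requires touching this list. A trimmed-down sketch (the stub class and its three entries are placeholders standing in for the full table shown in the next hunk):

# Illustrative stand-in for the real extr_urls tuple.
class extract_playlist_stub(object):
    extr_urls = (
        ("pls", (r"(?im) ^ \s*File\d* \s*=\s* (\w+://[^\s]+) ", None)),
        ("m3u", (r" (?m) ^( \w+:// [^#\n]+ )", None)),
        ("raw", (r" (?i) ( [\w+]+:// [^\s\"\'\>\#]+ ) ", "*")),
    )

# Same shape as `[id[0] for id in extract_playlist.extr_urls]` in the diff:
print([entry[0] for entry in extract_playlist_stub.extr_urls])  # ['pls', 'm3u', 'raw']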
@@ -287,47 +289,57 @@
    content = "\n".join(str.decode(errors='replace') for str in r.iter_lines())
    return (mime, content)


# Extract URLs from playlist formats:
#
# It's entirely regex-based at the moment, because that's more
# resilient against malformed XSPF or JSON.
# Needs proper extractors later for real playlist *imports*.
#
class extract_playlist(object):

    # Content of playlist file
    src = ""
    def __init__(self, text):
        self.src = text

    # Extract only URLs from given source type
    def format(self, fmt):
-       debug(dbg.DATA, "input regex:", fmt, len(self.src))
-       # regex
-       urls = re.findall(self.extr_urls[fmt], self.src, re.X)
-       # xml entities
-       urls = [xmlunescape(url) for url in urls]
-       # json escaping
-       urls = [url.replace("\\/", "/") for url in urls]
-       # uniques
-       urls = list(set(urls))
+       debug(dbg.DATA, "input extractor/regex:", fmt, len(self.src))
+       # find extractor
+       if fmt in dir(self):
+           return self.__dict__[fmt]()
+       # regex scheme
+       rx, decode = dict(self.extr_urls)[fmt]
+       urls = re.findall(rx, self.src, re.X)
+       # decode urls
+       if decode in ("xml", "*"):
+           urls = [xmlunescape(url) for url in urls]
+       if decode in ("json", "*"):
+           urls = [url.replace("\\/", "/") for url in urls]
+       # only uniques
+       return list(set(urls))
-       return urls

-   # Only look out for URLs, not local file paths
-   extr_urls = {
-       "pls": r"(?im) ^ \s*File\d* \s*=\s* (\w+://[^\s]+) ",
-       "m3u": r" (?m) ^( \w+:// [^#\n]+ )",
-       "xspf": r" (?x) <location> (\w+://[^<>\s]+) </location> ",
-       "asx": r" (?x) <ref \b[^>]+\b href \s*=\s* [\'\"] (\w+://[^\s\"\']+) [\'\"] ",
-       "smil": r" (?x) <(?:audio|video|media)\b [^>]+ \b src \s*=\s* [^\"\']? \s* (\w+://[^\"\'\s]+) ",
-       "jspf": r" (?x) \"location\" \s*:\s* \"(\w+://[^\"\s]+)\" ",
-       "jamj": r" (?x) \"audio\" \s*:\s* \"(\w+:\\?/\\?/[^\"\s]+)\" ",
-       "json": r" (?x) \"url\" \s*:\s* \"(\w+://[^\"\s]+)\" ",
-       "asf": r" (?m) ^ \s*Ref\d+ = (\w+://[^\s]+) ",
-       "raw": r" (?i) ( [\w+]+:// [^\s\"\'\>\#]+ ) ",
-   }
+   # Only look out for URLs, not local file paths, nor titles
+   extr_urls = (
+       ("pls", (r"(?im) ^ \s*File\d* \s*=\s* (\w+://[^\s]+) ", None)),
+       ("m3u", (r" (?m) ^( \w+:// [^#\n]+ )", None)),
+       ("xspf", (r" (?x) <location> (\w+://[^<>\s]+) </location> ", "xml")),
+       ("asx", (r" (?x) <ref \b[^>]+\b href \s*=\s* [\'\"] (\w+://[^\s\"\']+) [\'\"] ", "xml")),
+       ("smil", (r" (?x) <(?:audio|video|media)\b [^>]+ \b src \s*=\s* [^\"\']? \s* (\w+://[^\"\'\s]+) ", "xml")),
+       ("jspf", (r" (?x) \"location\" \s*:\s* \"(\w+://[^\"\s]+)\" ", "json")),
+       ("jamj", (r" (?x) \"audio\" \s*:\s* \"(\w+:\\?/\\?/[^\"\s]+)\" ", "json")),
+       ("json", (r" (?x) \"url\" \s*:\s* \"(\w+://[^\"\s]+)\" ", "json")),
+       ("asf", (r" (?m) ^ \s*Ref\d+ = (\w+://[^\s]+) ", "xml")),
+       ("raw", (r" (?i) ( [\w+]+:// [^\s\"\'\>\#]+ ) ", "*")),
+   )
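Below is a minimal usage sketch of the new (regex, decode-hint) scheme, reduced to two table entries and a standalone helper. The names here are illustrative stand-ins: xmlunescape is taken from xml.sax.saxutils for self-containment, whereas the real module provides its own debug/dbg/xmlunescape helpers elsewhere in the file. One review note on the hunk above: the extractor dispatch reads self.__dict__[fmt], which only sees instance attributes, so a format-specific method defined on the class would need getattr(self, fmt) instead.

# Self-contained sketch of the regex path of format(); stand-in names only.
import re
from xml.sax.saxutils import unescape as xmlunescape

extr_urls = (
    ("pls", (r"(?im) ^ \s*File\d* \s*=\s* (\w+://[^\s]+) ", None)),
    ("xspf", (r" (?x) <location> (\w+://[^<>\s]+) </location> ", "xml")),
)

def extract(fmt, content):
    # Same steps as the new format(): look up (regex, decode hint),
    # findall with re.X, then apply only the decoding the hint asks for.
    rx, decode = dict(extr_urls)[fmt]
    urls = re.findall(rx, content, re.X)
    if decode in ("xml", "*"):
        urls = [xmlunescape(u) for u in urls]
    if decode in ("json", "*"):
        urls = [u.replace("\\/", "/") for u in urls]
    return list(set(urls))

print(extract("pls", "[playlist]\nFile1=http://example.org/a.mp3\n"))
# -> ['http://example.org/a.mp3']
print(extract("xspf", "<location>http://example.org/b?x=1&amp;y=2</location>"))
# -> ['http://example.org/b?x=1&y=2']  (the "xml" hint unescaped &amp;)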
# Save rows in one of the export formats.
#
# The export() version uses urls[]+row/title= as input, converts it into
# a list of rows{} beforehand.
#