@@ -24,21 +24,20 @@
 always_google = 1 # use favicon service for speed
 only_google = 1 # if that fails, try our other/slower methods?
 delete_google_stub = 1 # don't keep placeholder images
 google_placeholder_filesizes = (726,896)
 
 import os, os.path
-import urllib
+from compat2and3 import xrange, urllib
 import re
-import urlparse
 from config import conf
 try: from processing import Process as Thread
 except: from threading import Thread
-import http
+import ahttp
 
 # ensure that we don't try to download a single favicon twice per session,
 # if it's not available the first time, we won't get it after switching stations back and forth
 tried_urls = []
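The import hunk above routes the Python-2/3 differences through the project's own wrapper modules: compat2and3 for xrange and urllib, ahttp instead of the old http helper. As a rough illustration only (not the actual contents of compat2and3), a shim covering the two names imported here could look like this:

# hypothetical compat shim, NOT the real compat2and3 module
import sys

if sys.version_info[0] >= 3:
    xrange = range                    # Py3 has no xrange; range is already lazy
    import urllib.request as urllib   # urlopen() moved to urllib.request in Py3
else:
    xrange = xrange                   # re-export the Py2 builtin under the same name
    import urllib                     # Py2 urllib provides urlopen() directly

Callers then write "from compat2and3 import xrange, urllib" and keep calling urllib.urlopen() unchanged on either interpreter.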
@@ -85,20 +84,20 @@
     # extract first title parts
     title = rx_t.search(row["title"])
     if title:
         title = title.group(0).replace(" ", "%20")
 
         # do a google search
-        html = http.ajax("http://www.google.de/search?hl=de&q="+title, None)
+        html = ahttp.ajax("http://www.google.de/search?hl=de&q="+title, None)
 
         # find first URL hit
         url = rx_u.search(html)
         if url:
-            row["homepage"] = http.fix_url(url.group(1))
+            row["homepage"] = ahttp.fix_url(url.group(1))
     pass
 
 
 #-----------------
 # extract domain name
 def domain(url):
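This hunk only swaps the http helper module for ahttp; the search query itself is still built by hand, escaping nothing but spaces. As a side illustration (not part of the patch), the standard library can percent-encode the full set of unsafe characters in a title:

# illustration only: full URL-escaping of a search term with the stdlib
try:
    from urllib import quote            # Python 2
except ImportError:
    from urllib.parse import quote      # Python 3

title = "Radio X - Jazz & Blues"
query = "http://www.google.de/search?hl=de&q=" + quote(title)
print(query)   # spaces, '&' and other reserved characters are percent-encoded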
@@ -191,17 +190,17 @@
     # try:
     # URL download
     r = urllib.urlopen(favicon)
     headers = r.info()
 
     # abort on
     if r.getcode() >= 300:
-        raise "HTTP error", r.getcode()
+        raise Error("HTTP error" + r.getcode())
     if not headers["Content-Type"].lower().find("image/"):
-        raise "can't use text/* content"
+        raise Error("can't use text/* content")
 
     # save file
     fn_tmp = fn+".tmp"
     f = open(fn_tmp, "wb")
     f.write(r.read(32768))
     f.close()
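String exceptions ("raise "HTTP error", r.getcode()") are gone in Python 3, hence the switch to raising an Error instance. Two details are worth noting: "HTTP error" + r.getcode() still fails at runtime because getcode() returns an int, and the find() test also trips when "image/" sits at position 0. A hedged sketch of the intended checks, assuming Error is a plain Exception subclass defined elsewhere in the module, wrapped in a hypothetical check_response() helper:

# sketch under the assumption that Error is a plain Exception subclass
class Error(Exception):
    pass

def check_response(r, headers):                          # hypothetical helper, not in the patch
    if r.getcode() >= 300:
        raise Error("HTTP error %s" % r.getcode())       # format the int status code
    if headers["Content-Type"].lower().find("image/") < 0:
        raise Error("can't use text/* content")          # explicit "not found" comparison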
@@ -232,15 +231,15 @@
     favicon = "".join(rx.findall(html))
     # url or
     if favicon.startswith("http://"):
         None
     # just /pathname
     else:
-        favicon = urlparse.urljoin(url, favicon)
+        favicon = ahttp.urlparse.urljoin(url, favicon)
         #favicon = "http://" + domain(url) + "/" + favicon
     # download
     direct_download(favicon, file(url))
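Only the module reference changes here: the stdlib urlparse is now reached through the ahttp wrapper, presumably re-exported there so the 2/3 compatibility handling stays in one place. For reference, the urljoin() behaviour the else branch relies on, shown with a hypothetical page URL:

# stdlib behaviour that the branch above relies on
try:
    from urlparse import urljoin        # Python 2
except ImportError:
    from urllib.parse import urljoin    # Python 3

page = "http://example.org/listen/stream.html"
print(urljoin(page, "/favicon.ico"))        # -> http://example.org/favicon.ico
print(urljoin(page, "img/favicon.png"))     # -> http://example.org/listen/img/favicon.png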
@@ -262,15 +261,15 @@
 #
 import operator
 import struct
 try:
     from PIL import BmpImagePlugin, PngImagePlugin, Image
-except Exception, e:
+except Exception as e:
     print("no PIL", e)
     always_google = 1
     only_google = 1
 
 
 def load_icon(file, index=None):
     '''
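"except Exception, e:" is Python-2-only syntax; the "as e" form used in the new line is accepted by Python 2.6+ and Python 3 alike, which is what makes this hunk portable. A minimal, self-contained illustration:

# the 'as' form binds the exception instance on Python 2.6+ and Python 3;
# the old comma form ('except ValueError, e:') is a SyntaxError on Python 3
try:
    value = int("not a number")
except ValueError as e:
    value = None
    print("fallback:", e)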