Check-in [4753576db5]
Many hyperlinks are disabled. Use anonymous login to enable hyperlinks.
Overview
Comment: | Add tags, submitter and urls for extraction; default to `name-releases.json` for export file. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | 4753576db56df0e8e4af1c1e5c45f5b9 |
User & Date: | mario 2014-08-18 22:21:59 |
Context
2014-08-20
| ||
00:38 | remove obsolete include(lib/openid.php) check-in: 620195ac76 user: mario tags: trunk | |
2014-08-18
| ||
22:21 | Add tags, submitter and urls for extraction; default to `name-releases.json` for export file. check-in: 4753576db5 user: mario tags: trunk | |
04:00 | Project extraction cmdline tool for freecode.com produces a `releases.json` for easy importing of previous releases per Autoupdate. check-in: 3e9edceae0 user: mario tags: trunk | |
Changes
Changes to doc/freecode2releases.py.
1 2 3 4 5 6 7 | #!/usr/bin/env python # encoding: utf-8 # api: cli # type: main # title: freecode-to-releases # description: Extracts project descriptions+history from Freecode.com into releases.JSON # category: scraping | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 | #!/usr/bin/env python # encoding: utf-8 # api: cli # type: main # title: freecode-to-releases # description: Extracts project descriptions+history from Freecode.com into releases.JSON # category: scraping # version: 0.7 # config: # <env-unused name=XDG_CONFIG_HOME value=~/.config description="user config base dir"> # license: MITL # doc: http://fossil.include-once.org/freshcode/wiki/freecode2releases # # # Fetches prior freshmeat/freecode.com project listings, and extracts |
︙ | ︙ | |||
37 38 39 40 41 42 43 | except: import json # scrape from freecode.com def freecode_fetch(name): | > | | > > > | > > > > > > > > > > > > > > | | 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 | except: import json # scrape from freecode.com def freecode_fetch(name): try: url = "http://freecode.com/projects/%s" % name html = bs(requests.get(url).text) except: print("project not found, %s" % url) return None # le basics r = collections.OrderedDict([ ("$feed-license", "author/editor"), ("$feed-origin", url), ("name", name), ("title", html.find("meta", {"property": "og:title"})["content"]), ("oneliner", html.find("meta", {"property": "og:description"})["content"]), #("image", "http://freshcode.com" + html.find("meta", {"property": "og:image"})["content"]), ("keywords", html.find("meta", {"name": "keywords"})["content"]), ("description", html.select("div.project-detail p")[0].string), ("tags", freecode_tags(html.select("#project-tag-cloud a"))), ("submitter", html.select("li.avatar a.avatar")[0]["title"]), ("urls", freecode_urls(html.select(".sub-navigation li.url a"))), ("releases", freecode_releases(name)), ]) return r # extract tag basename from <a> link list def freecode_tags(li): return ", ".join([ a["href"][6:] for a in li ]) # convert url list <li> <a> into dict def freecode_urls(li): r = [ (a.string, "http://freecode.com" + a["href"]) for a in li ] return collections.OrderedDict(r) # fetch releases pages def freecode_releases(name): last_page = 1 page = 1 r = [] while page <= last_page: # iterate through /releases pages url = "http://freecode.com/projects/%s/releases%s" % (name, ("?page=%s" % page if page else "")) html = bs(requests.get(url).text) for ver in html.select("div.release.clearfix"): # remove changelog gimmicks |
︙ | ︙ | |||
83 84 85 86 87 88 89 | }) # next page try: last_page = int(html.select(".pagination a")[-2].string) except: last_page = 1 page = page + 1 | < | > | 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 | }) # next page try: last_page = int(html.select(".pagination a")[-2].string) except: last_page = 1 page = page + 1 return r # try to deduce time from different formats def strftime(s): for fmt in [ "%d %b %Y %H:%M", "%Y-%m-%d %H:%M" ]: try: return datetime.strptime(s, fmt) except: None pass # process CLI arguments, invoke retrieval methods def freecode_cli(argv0="f2r", name="", output=""): if name: output = output or "%s-releases.json" % name json.dump(freecode_fetch(name), open(output, "wt"), indent=4) else: print("synopsis: freecode2releases.py [projectname [output.json]]"); print("[31mPlease only download and resubmit project data you initially wrote yourself.[0m"); return # argv to main if __name__ == "__main__": freecode_cli(*sys.argv) |