Branch: freshcode


Check-in [4753576db5]

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add tags, submitter and urls for extraction; default to `name-releases.json` for export file.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 4753576db56df0e8e4af1c1e5c45f5b978c1bd50
User & Date: mario 2014-08-18 22:21:59
Context
2014-08-20
00:38
remove obsolete include(lib/openid.php) check-in: 620195ac76 user: mario tags: trunk
2014-08-18
22:21
Add tags, submitter and urls for extraction; default to `name-releases.json` for export file. check-in: 4753576db5 user: mario tags: trunk
04:00
Project extraction cmdline tool for freecode.com produces a `releases.json` for easy importing of previous releases per Autoupdate. check-in: 3e9edceae0 user: mario tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to doc/freecode2releases.py.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#!/usr/bin/env python
# encoding: utf-8
# api: cli
# type: main
# title: freecode-to-releases     
# description: Extracts project descriptions+history from Freecode.com into releases.JSON
# category: scraping
# version: 0.5
# config:
#   <env-unused name=XDG_CONFIG_HOME value=~/.config description="user config base dir"> 
# license: MITL
# doc: http://fossil.include-once.org/freshcode/wiki/freecode2releases
# 
#
# Fetches prior freshmeat/freecode.com project listings, and extracts   







|







1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#!/usr/bin/env python
# encoding: utf-8
# api: cli
# type: main
# title: freecode-to-releases     
# description: Extracts project descriptions+history from Freecode.com into releases.JSON
# category: scraping
# version: 0.7
# config:
#   <env-unused name=XDG_CONFIG_HOME value=~/.config description="user config base dir"> 
# license: MITL
# doc: http://fossil.include-once.org/freshcode/wiki/freecode2releases
# 
#
# Fetches prior freshmeat/freecode.com project listings, and extracts   
37
38
39
40
41
42
43

44
45



46
47
48
49
50
51
52
53
54
55



56
57
58
59











60
61
62
63
64
65
66
67
68
69
70
71
except:
    import json



# scrape from freecode.com
def freecode_fetch(name):
    """Scrape the freecode.com project page for `name` into an OrderedDict.

    No error handling here: any network or parse failure (missing page,
    absent meta tags) propagates to the caller.
    """
    url = "http://freecode.com/projects/%s" % name
    html = bs(requests.get(url).text)



    # le basics
    r = collections.OrderedDict([
        # provenance markers for the generated feed ($-prefixed keys)
        ("$feed-license", "CC author/editors"),
        ("$feed-origin", url),
        ("name", name),
        # og:* meta tags carry the display title and one-line summary
        ("title", html.find("meta", {"property": "og:title"})["content"]),
        ("oneliner", html.find("meta", {"property": "og:description"})["content"]),
        #("image", "http://freshcode.com" + html.find("meta", {"property": "og:image"})["content"]),
        ("keywords", html.find("meta", {"name": "keywords"})["content"]),
        # first paragraph of the project detail box
        ("description", html.select("div.project-detail p")[0].string),



        # full release history, fetched page by page
        ("releases", freecode_releases(name)),
    ])
    return r













# fetch releases pages
def freecode_releases(name):
    last_page = 1
    page = 0
    r = []
    while page <= last_page:
        # iterate through /releases pages
        url = "http://freecode.com/projects/%s/releases%s" % (name, ("?page=%s" % page if page else ""))
        html = bs(requests.get(url).text)
        for ver in html.select("div.release.clearfix"):
            # remove changelog gimmicks







>
|
|
>
>
>


|







>
>
>




>
>
>
>
>
>
>
>
>
>
>




|







37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
except:
    import json



# scrape from freecode.com
def freecode_fetch(name):
    """Scrape the freecode.com project page for `name`.

    Returns an OrderedDict with feed provenance markers, project basics,
    tags, submitter, urls and the full release history — or None when
    the page cannot be fetched.
    """
    url = "http://freecode.com/projects/%s" % name
    try:
        # NOTE(review): requests.get() does not raise on HTTP 404 by itself;
        # a missing project typically surfaces as a parse failure instead.
        html = bs(requests.get(url).text)
    except Exception:  # narrowed from bare `except:` so Ctrl-C still works
        print("project not found, %s" % url)
        return None
    # le basics
    r = collections.OrderedDict([
        # provenance markers for the generated feed ($-prefixed keys)
        ("$feed-license", "author/editor"),
        ("$feed-origin", url),
        ("name", name),
        # og:* meta tags carry the display title and one-line summary
        ("title", html.find("meta", {"property": "og:title"})["content"]),
        ("oneliner", html.find("meta", {"property": "og:description"})["content"]),
        #("image", "http://freshcode.com" + html.find("meta", {"property": "og:image"})["content"]),
        ("keywords", html.find("meta", {"name": "keywords"})["content"]),
        # first paragraph of the project detail box
        ("description", html.select("div.project-detail p")[0].string),
        ("tags", freecode_tags(html.select("#project-tag-cloud a"))),
        ("submitter", html.select("li.avatar a.avatar")[0]["title"]),
        ("urls", freecode_urls(html.select(".sub-navigation li.url a"))),
        ("releases", freecode_releases(name)),
    ])
    return r


# extract tag basename from <a> link list
def freecode_tags(li):
    """Collapse a list of tag anchor elements into one comma-separated string."""
    names = []
    for anchor in li:
        # hrefs look like "/tags/<name>"; drop the 6-char "/tags/" prefix
        names.append(anchor["href"][6:])
    return ", ".join(names)


# convert url list <li> <a> into dict
def freecode_urls(li):
    """Map each link's text label onto its absolute freecode.com URL, keeping order."""
    return collections.OrderedDict(
        (anchor.string, "http://freecode.com" + anchor["href"]) for anchor in li
    )


# fetch releases pages
def freecode_releases(name):
    last_page = 1
    page = 1
    r = []
    while page <= last_page:
        # iterate through /releases pages
        url = "http://freecode.com/projects/%s/releases%s" % (name, ("?page=%s" % page if page else ""))
        html = bs(requests.get(url).text)
        for ver in html.select("div.release.clearfix"):
            # remove changelog gimmicks
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109

110
111
112
113
114
115
116
117
118
119
120
121
            })
        # next page
        try:
            last_page = int(html.select(".pagination a")[-2].string)
        except:
            last_page = 1
        page = page + 1
        print page
    return r


# try to deduce time from different formats
def strftime(s):
    """Parse a freecode date string, trying each known format in turn.

    Despite the name this *parses* (strptime); the name is kept because
    callers reference it.  Returns a datetime, or None when no format
    matches.
    """
    for fmt in [
        "%d %b %Y %H:%M",
        "%Y-%m-%d %H:%M"
    ]:
        try:
            return datetime.strptime(s, fmt)
        except ValueError:  # wrong format — try the next candidate
            continue
    return None


# process CLI arguments, invoke retrieval methods
def freecode_cli(argv0="f2r", name="", output="releases.json"):
    """CLI entry point: fetch project `name` and dump its data as JSON.

    Without a project name a usage synopsis is printed instead.
    Returns None in either case.
    """
    if name:
        # close the output file deterministically instead of leaking the handle
        with open(output, "wt") as fh:
            json.dump(freecode_fetch(name), fh, indent=4)
    else:
        print("synopsis: freecode2releases.py [projectname [output.json]]")
        print("Please only download and resubmit project data you initially wrote yourself.")
    return


# argv to main
if __name__ == "__main__":
    # argv[0] → argv0, argv[1] → project name, argv[2] → output file;
    # NOTE(review): more than three argv entries raise TypeError — confirm intended
    freecode_cli(*sys.argv)









<

















|

>












101
102
103
104
105
106
107

108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
            })
        # next page
        try:
            last_page = int(html.select(".pagination a")[-2].string)
        except:
            last_page = 1
        page = page + 1

    return r


# try to deduce time from different formats
def strftime(s):
    """Parse a freecode date string, trying each known format in turn.

    Despite the name this *parses* (strptime); the name is kept because
    callers reference it.  Returns a datetime, or None when no format
    matches.
    """
    for fmt in [
        "%d %b %Y %H:%M",
        "%Y-%m-%d %H:%M"
    ]:
        try:
            return datetime.strptime(s, fmt)
        except ValueError:  # wrong format — try the next candidate
            continue
    return None


# process CLI arguments, invoke retrieval methods
def freecode_cli(argv0="f2r", name="", output=""):
    """CLI entry point: fetch project `name` and dump its data as JSON.

    `output` defaults to "<name>-releases.json"; without a project name
    a usage synopsis is printed instead.  Returns None in either case.
    """
    if name:
        output = output or "%s-releases.json" % name
        # close the output file deterministically instead of leaking the handle
        with open(output, "wt") as fh:
            json.dump(freecode_fetch(name), fh, indent=4)
    else:
        print("synopsis: freecode2releases.py [projectname [output.json]]")
        print("Please only download and resubmit project data you initially wrote yourself.")
    return


# argv to main
if __name__ == "__main__":
    # argv[0] → argv0, argv[1] → project name, argv[2] → output file;
    # NOTE(review): more than three argv entries raise TypeError — confirm intended
    freecode_cli(*sys.argv)