#!/usr/bin/env python3
# description: search all cookiecutter repos on GH
#
# Search all repos. But use sliding date window, to get all.
# So tedious. Yay for proprietary baskets!
#
# to recreate db:
# · gh_find
# · gh_tree
# · gh_conv
#
import os, re, requests, json, time, dotenv
from random import randint
dotenv.load_dotenv()
# get list of repos for search term and sliding `created` window, optionally page_no
def fetch(page=0, created="2021-03", per_page=50):
	"""Query the GitHub repository search API for cookiecutter repos.

	Args:
		page: result page number (GitHub search pages are 1-based; 0 is
		      served the same as page 1 — TODO confirm against API docs).
		created: year-month window for the `created:` qualifier, e.g. "2021-03".
		per_page: results per page (the API caps this at 100).

	Returns:
		Decoded JSON dict; an error response carries a "message" key
		instead of an "items" list (callers check for this).
	"""
	url = "https://api.github.com/search/repositories"
	params = dict(
		q = f"cookiecutter created:{created}",
		sort = "updated",
		order = "desc",
		per_page = per_page,
		page = page
	)
	headers = {
		"User-Agent": "cookiedough/0.3.0 (Python; amd64; requests)",
		"Accept": "application/vnd.github.preview",
	}
	# Only send Authorization when a token is configured; the original
	# `"token " + os.getenv(...)` raised TypeError when the env var was unset.
	token = os.getenv("GITHUB_API_TOKEN")
	if token:
		headers["Authorization"] = "token " + token
	print("SEARCH_Q=%r" % params)
	# params by keyword (positional works but is fragile); timeout so a
	# stalled connection cannot hang the crawl forever
	r = requests.get(url, params=params, headers=headers, timeout=30)
	return r.json()
def write(results):
	"""Persist the accumulated repo dict to github.json, pretty-printed."""
	with open("github.json", "w", encoding="utf-8") as fh:
		json.dump(results, fh, indent=4)
def read():
	"""Load previously saved results from github.json.

	Returns:
		Dict of full_name → repo entry; {} when the file is missing,
		unreadable, or not valid JSON.
	"""
	try:
		with open("github.json", "r", encoding="utf-8") as f:
			return json.load(f)
	# narrow exceptions: the original bare `except:` also swallowed
	# KeyboardInterrupt/SystemExit, hiding real bugs
	except (OSError, json.JSONDecodeError):
		return {}
# iterate per year+month, and optionally pages if more than 50 results
def year_month_page(ys, ms, pg, per_page=100):
	"""Yield one search-result batch per fetched page, sliding over (year, month).

	Args:
		ys: iterable of years, e.g. range(2012, 2022)
		ms: iterable of months (1-12)
		pg: iterable of page numbers to try within each month
		per_page: page size requested from the API; a short page ends the month

	Yields:
		Decoded search responses (dicts with an "items" list).
	"""
	for year in ys:
		for month in ms:
			for page in pg:
				d = fetch(created=f"{year:04}-{month:02}", page=page, per_page=per_page)
				# API error (rate limit etc.) → give up on this month
				if "message" in d or "items" not in d:
					print("**NO_RESULTS**=%r" % d)
					break
				print("len_items=%s" % len(d["items"]))
				time.sleep(1)  # throttle to stay under the search-API rate limit
				yield d
				# a short page means the month's results are exhausted
				if len(d["items"]) < per_page:
					break
# Crawl loop: merge every fetched batch into the persistent results dict,
# keyed by repo full_name (this dedupes overlapping pages and windows).
results = read()
print(len(results))
for batch in year_month_page(range(2012, 2022), range(1, 13), range(0, 19)):
	if not batch["items"]:
		continue
	for repo in batch["items"]:
		results[repo["full_name"]] = repo
	# checkpoint on roughly every 21st batch so a crash loses little work
	if not randint(0, 20):
		write(results)
	print("len_results=%s" % len(results))
write(results)