Files
assets-bot/scrape_repos.py

92 lines
2.5 KiB
Python

"""
GitHub API repository scraper
Uses RabbitMQ as the results backend; the scraping state (the last repository id) is saved in a plain text file.
"""
import json
import os
import time
import logging
from requests.exceptions import Timeout
from github import Github, Repository, RateLimitExceededException, GithubException
from src.rabbit import get_connection, get_channel, send_repository
from src.settings import init_logging, SCRAPING_LAST_ID_PATH
# main() writes the last repository id to disk each time the processed
# counter reaches a multiple of this value.
SAVE_ID_EVERY = 1000
# Seconds to sleep after a requests Timeout before main() is restarted (5 minutes).
TIMEOUT_SLEEP = 5 * 60
# 3 KiB. NOTE(review): not referenced in the visible part of this file —
# confirm whether it is still needed.
HEADER_SIZE_TO_READ = 3 * 1024
# Module-level GitHub client shared by main(); created from the GITHUB_KEY
# environment variable (os.getenv returns None when it is unset) and asking
# for 100 items per page.
g = Github(os.getenv("GITHUB_KEY"), per_page=100)
def _epoch_seconds(moment) -> int:
    """Convert a datetime to whole seconds since the Unix epoch.

    Naive datetimes are treated as UTC instead of local time, so the result
    does not depend on the time zone of the machine running the scraper.
    Aware datetimes are converted using their own offset.
    """
    from datetime import timezone  # local import keeps this block self-contained

    if moment.tzinfo is None:
        # datetime.timestamp() would interpret a naive value as *local* time.
        # Assumes the naive values handed over by PyGithub are UTC (GitHub's
        # API reports timestamps in UTC) — confirm for the PyGithub version in use.
        moment = moment.replace(tzinfo=timezone.utc)
    return int(moment.timestamp())


def get_repository_data(r: Repository):
    """Collect the fields of a repository that are published to the queue.

    Args:
        r: repository object exposing ``id``, ``full_name``, ``fork``, ``size``,
            ``default_branch``, ``stargazers_count``, ``updated_at``,
            ``created_at``, ``private`` and ``archived``.

    Returns:
        A JSON-serialisable dict with those fields (timestamps as integer epoch
        seconds), or ``None`` when reading an attribute raised a
        ``GithubException`` other than a rate-limit error.

    Raises:
        RateLimitExceededException: propagated so the caller can back off,
            instead of being logged as an error of this single repository.
    """
    try:
        return {
            "id": r.id,
            "name": r.full_name,
            "fork": r.fork,
            "size": r.size,
            "default_branch": r.default_branch,
            "stargazers_count": r.stargazers_count,
            "updated_at": _epoch_seconds(r.updated_at),
            "created_at": _epoch_seconds(r.created_at),
            "private": r.private,
            "archived": r.archived,
        }
    except RateLimitExceededException:
        # RateLimitExceededException is a GithubException subclass; without this
        # clause a rate-limit hit would be swallowed below and the repository
        # skipped, rather than triggering the caller's rate-limit handling.
        raise
    except GithubException:
        logging.info(f"error with {r}")
        # Short pause before the caller moves on to the next repository.
        time.sleep(2)
        return None
def get_last_id() -> int:
    """Read the id of the last scraped repository from the state file.

    Returns:
        The integer stored in ``SCRAPING_LAST_ID_PATH``; surrounding
        whitespace is ignored.

    Raises:
        FileNotFoundError: if the state file does not exist.
        ValueError: if the file content is not a valid integer.
    """
    try:
        # The context manager closes the handle right away instead of leaving
        # it to the garbage collector; opening directly also avoids the race
        # between an existence check and the read.
        with open(SCRAPING_LAST_ID_PATH) as state_file:
            content = state_file.read()
    except FileNotFoundError as err:
        raise FileNotFoundError(f"No last_id file at: {SCRAPING_LAST_ID_PATH}") from err
    return int(content.strip())
def save_last_id(val: int):
    """Overwrite the state file at ``SCRAPING_LAST_ID_PATH`` with *val*.

    The value is logged first and then written as its plain string form,
    replacing any previous content of the file.
    """
    logging.info(f"Saving last_id={val}")
    with open(SCRAPING_LAST_ID_PATH, "w") as state_file:
        text = str(val)
        state_file.write(text)
def main():
    """Scrape repositories from GitHub and publish each one to RabbitMQ.

    Starts after the id stored in the state file, iterates ``g.get_repos``
    and sends every successfully read repository as UTF-8 encoded JSON via
    ``send_repository``. The id of the last published repository is written
    to the state file:

    * every ``SAVE_ID_EVERY`` processed repositories,
    * when the GitHub rate limit is hit (followed by a 30 minute sleep),
    * and whenever the function is left through an exception, so that a
      restart resumes from the latest position instead of a stale checkpoint.

    The function loops forever; it only ends by raising.
    """
    rabbit_connection = get_connection()
    rabbit_channel = get_channel(rabbit_connection)
    processed_count = 0
    last_item_id = get_last_id()
    try:
        while True:
            try:
                for r in g.get_repos(since=last_item_id):
                    repository_data = get_repository_data(r)
                    processed_count += 1
                    if repository_data:
                        send_repository(rabbit_channel, json.dumps(repository_data).encode("utf-8"))
                        # Only advance past repositories that were actually published.
                        last_item_id = repository_data["id"]
                    # Checkpoint on the counter even when this repository failed,
                    # so a failure landing on the boundary does not skip the save.
                    if processed_count % SAVE_ID_EVERY == 0:
                        save_last_id(last_item_id)
            except RateLimitExceededException:
                save_last_id(last_item_id)
                logging.info(f"waiting after {last_item_id}, processed: {processed_count}")
                time.sleep(30 * 60)
    finally:
        # Reached only when an exception escapes (e.g. a network Timeout or
        # Ctrl-C): persist progress made since the last periodic checkpoint.
        save_last_id(last_item_id)
if __name__ == "__main__":
    # Script entry point: configure logging, report the stored resume id,
    # then keep the scraper running.
    init_logging()
    logging.info(f"last_id = {get_last_id()}")
    while True:
        try:
            main()
        except Timeout:
            # A request timed out: log it, wait TIMEOUT_SLEEP seconds and let
            # the loop start main() again. Any other exception ends the script.
            logging.warning("Timeout")
            time.sleep(TIMEOUT_SLEEP)