""" GitHub API repository scraper RabbitMQ as results backend, saving state just in text file """ import json import os import time import logging from requests.exceptions import Timeout from github import Github, Repository, RateLimitExceededException, GithubException from src.rabbit import get_connection, get_channel, send_repository from src.settings import init_logging, SCRAPING_LAST_ID_PATH SAVE_ID_EVERY = 1000 TIMEOUT_SLEEP = 5 * 60 HEADER_SIZE_TO_READ = 3 * 1024 g = Github(os.getenv("GITHUB_KEY"), per_page=100) def get_repository_data(r: Repository): try: return { "id": r.id, "name": r.full_name, "fork": r.fork, "size": r.size, "default_branch": r.default_branch, "stargazers_count": r.stargazers_count, "updated_at": int(r.updated_at.timestamp()), "created_at": int(r.created_at.timestamp()), "private": r.private, "archived": r.archived, } except GithubException: logging.info(f"error with {r}") time.sleep(2) return def get_last_id() -> int: if not os.path.exists(SCRAPING_LAST_ID_PATH): raise Exception(f"No last_id file at: {SCRAPING_LAST_ID_PATH}") last_id = int(open(SCRAPING_LAST_ID_PATH).read().strip()) return last_id def save_last_id(val: int): logging.info(f"Saving last_id={val}") with open(SCRAPING_LAST_ID_PATH, "w") as f: f.write(str(val)) def main(): rabbit_connection = get_connection() rabbit_channel = get_channel(rabbit_connection) processed_count = 0 last_item_id = get_last_id() while True: try: for r in g.get_repos(since=last_item_id): repository_data = get_repository_data(r) processed_count += 1 if not repository_data: continue send_repository(rabbit_channel, json.dumps(repository_data).encode("utf-8")) last_item_id = repository_data["id"] if processed_count % SAVE_ID_EVERY == 0: save_last_id(last_item_id) except RateLimitExceededException: save_last_id(last_item_id) logging.info(f"waiting after {last_item_id}, processed: {processed_count}") time.sleep(30 * 60) if __name__ == "__main__": init_logging() logging.info(f"last_id = {get_last_id()}") while True: try: main() except Timeout: logging.warning("Timeout") time.sleep(TIMEOUT_SLEEP)