92 lines
2.5 KiB
Python
92 lines
2.5 KiB
Python
"""
GitHub API repository scraper.

Scraped repositories are published to RabbitMQ; the scraping state (the id
of the last processed repository) is saved in a plain text file.
"""
|
|
|
|
import json
|
|
import os
|
|
import time
|
|
import logging
|
|
|
|
from requests.exceptions import Timeout
|
|
from github import Github, Repository, RateLimitExceededException, GithubException
|
|
|
|
from src.rabbit import get_connection, get_channel, send_repository
|
|
from src.settings import init_logging, SCRAPING_LAST_ID_PATH
|
|
|
|
# Persist the last scraped repository id once per this many processed
# repositories (see main()).
SAVE_ID_EVERY = 1000

# Seconds to pause after a requests Timeout before the scraper is restarted.
TIMEOUT_SLEEP = 5 * 60
# NOTE(review): not referenced in the visible part of this module; presumably
# a size in bytes (3 KiB) used elsewhere - confirm before changing/removing.
HEADER_SIZE_TO_READ = 3 * 1024

# Module-level PyGithub client shared by the scraper. The credential is read
# from the GITHUB_KEY environment variable (None when it is unset) and list
# endpoints are asked for 100 items per page.
g = Github(os.getenv("GITHUB_KEY"), per_page=100)
|
|
|
|
|
|
def get_repository_data(r: "Repository.Repository"):
    """Collect the fields of repository ``r`` that are published downstream.

    NOTE(review): ``from github import Repository`` binds the PyGithub
    *module* ``github.Repository``, so the class is spelled
    ``Repository.Repository``. The annotation is a string and is never
    evaluated at runtime.

    Args:
        r: PyGithub repository object to read from.

    Returns:
        A JSON-serialisable dict with the repository's id, full name, flags,
        size, default branch, star count and the ``updated_at`` /
        ``created_at`` moments as integer Unix timestamps; or ``None`` when
        reading the repository fails with a ``GithubException`` (the error
        is logged and the function pauses for 2 seconds first).

    Raises:
        RateLimitExceededException: propagated unchanged so the caller can
            apply its rate-limit back-off instead of skipping the repository.
    """
    try:
        return {
            "id": r.id,
            "name": r.full_name,
            "fork": r.fork,
            "size": r.size,
            "default_branch": r.default_branch,
            "stargazers_count": r.stargazers_count,
            "updated_at": int(r.updated_at.timestamp()),
            "created_at": int(r.created_at.timestamp()),
            "private": r.private,
            "archived": r.archived,
        }
    except RateLimitExceededException:
        # RateLimitExceededException is a subclass of GithubException; without
        # this clause it would be swallowed below and every remaining
        # repository would be skipped (with a 2 s sleep each) while the quota
        # is exhausted.
        raise
    except GithubException:
        # Best effort: a single broken repository must not stop the scrape.
        logging.info(f"error with {r}")
        time.sleep(2)
        return None
|
|
|
|
|
|
def get_last_id() -> int:
    """Read the id of the last scraped repository from the state file.

    The state file lives at ``SCRAPING_LAST_ID_PATH`` and is expected to
    contain a single integer (surrounding whitespace is ignored).

    Returns:
        The stored repository id.

    Raises:
        Exception: if the state file does not exist.
        ValueError: if the file content is not an integer.
    """
    try:
        # The context manager closes the handle deterministically; the
        # previous bare open(...).read() left that to the garbage collector.
        with open(SCRAPING_LAST_ID_PATH) as state_file:
            raw_value = state_file.read()
    except FileNotFoundError as err:
        raise Exception(f"No last_id file at: {SCRAPING_LAST_ID_PATH}") from err
    return int(raw_value.strip())
|
|
|
|
|
|
def save_last_id(val: int):
    """Overwrite the state file at ``SCRAPING_LAST_ID_PATH`` with ``val``.

    The id is logged first and then written as its decimal string, replacing
    any previous content of the file.
    """
    logging.info(f"Saving last_id={val}")
    serialized = str(val)
    with open(SCRAPING_LAST_ID_PATH, "w") as state_file:
        state_file.write(serialized)
|
|
|
|
|
|
def main():
    """Scrape repositories from the GitHub API and publish them to RabbitMQ.

    Starting after the id stored in the state file, every repository
    returned by ``g.get_repos`` is converted with ``get_repository_data``
    and sent to the RabbitMQ channel as UTF-8 encoded JSON. Repositories
    whose data could not be read are counted but not sent.

    Progress is persisted with ``save_last_id``:
      * every ``SAVE_ID_EVERY`` processed repositories,
      * when the GitHub rate limit is hit (followed by a 30 minute pause),
      * whenever the function is left through an exception (for example the
        ``Timeout`` handled by the script entry point), so a restart does
        not re-send repositories that were already published.

    The function never returns normally; it only ends by raising.
    """
    rabbit_connection = get_connection()
    rabbit_channel = get_channel(rabbit_connection)

    processed_count = 0
    last_item_id = get_last_id()
    try:
        while True:
            try:
                for r in g.get_repos(since=last_item_id):
                    repository_data = get_repository_data(r)
                    processed_count += 1
                    if repository_data:
                        send_repository(
                            rabbit_channel,
                            json.dumps(repository_data).encode("utf-8"),
                        )
                        # Only advance past repositories that were actually
                        # published.
                        last_item_id = repository_data["id"]
                    # Checked for every processed repository so the periodic
                    # checkpoint is not skipped when the N-th one failed.
                    if processed_count % SAVE_ID_EVERY == 0:
                        save_last_id(last_item_id)
            except RateLimitExceededException:
                save_last_id(last_item_id)
                logging.info(f"waiting after {last_item_id}, processed: {processed_count}")
                time.sleep(30 * 60)
    finally:
        # Reached only via an exception (the loop above has no normal exit):
        # persist the progress made since the last checkpoint.
        save_last_id(last_item_id)
|
|
|
|
|
|
# Script entry point: configure logging, report the stored starting id and
# keep the scraper running.
if __name__ == "__main__":
    init_logging()
    logging.info(f"last_id = {get_last_id()}")
    while True:
        try:
            # main() loops forever; control only comes back here when it
            # raises.
            main()
        except Timeout:
            # A requests Timeout is treated as transient: log it, wait
            # TIMEOUT_SLEEP seconds and start main() again. Any other
            # exception terminates the script.
            logging.warning("Timeout")
            time.sleep(TIMEOUT_SLEEP)
|