From 68ab5418cf570f12d107de2f47040257ca86c967 Mon Sep 17 00:00:00 2001
From: Oleg Morozenkov
Date: Sun, 21 Feb 2016 20:53:43 +0300
Subject: [PATCH] Audio analyzer

---
# Review notes (commentary only; not part of the commit message or the diff).
#
# What this patch does:
#   * Adds audio-analyzer.py. The script reads a JSON object from the file
#     named by sys.argv[1] and uses its keys as ids. For every id it queries
#     the "audio" collection of the "vk" database on the MongoDB host
#     "goto.reproducible.work" for documents whose "owner_id" equals that id,
#     and counts how often each artist+title string occurs. The result is
#     written to the file named by sys.argv[2], most frequent first, as the
#     name followed by tab-indented "count:" and "url:" lines.
#   * Swaps the command-line argument order in status-analyzer.py: the pazans
#     file is now sys.argv[1] and the users file is sys.argv[2], which matches
#     the position of the pazans file in audio-analyzer.py.
#
# Things to confirm before relying on the new script:
#   * json.loads() always yields str keys for a JSON object, so "owner_id" is
#     queried with a string. NOTE(review): if the collection stores owner_id
#     as a number, the query will match nothing -- confirm the stored type.
#   * The counting key is audio["artist"] + audio["title"] with no separator
#     between the two parts; this is also the text written to the output.
#   * The "url" kept for a key is the one from the first document seen with
#     that key; later documents only increase "count".
#   * The input file is opened without an explicit encoding, while the output
#     file is opened with encoding="utf-8".
#
# Formatting caveat: this copy of the patch was re-wrapped from a version
# whose line breaks and indentation had been lost. Indentation is assumed to
# be 4 spaces, and the position of the blank context line in each
# status-analyzer.py hunk (placed directly before the changed line) is
# inferred -- TODO confirm against the repository before applying.
 audio-analyzer.py  | 28 ++++++++++++++++++++++++++++
 status-analyzer.py |  4 ++--
 2 files changed, 30 insertions(+), 2 deletions(-)
 create mode 100644 audio-analyzer.py

diff --git a/audio-analyzer.py b/audio-analyzer.py
new file mode 100644
index 0000000..376cefa
--- /dev/null
+++ b/audio-analyzer.py
@@ -0,0 +1,28 @@
+import json
+import sys
+import pymongo
+
+pazanIds = None
+
+pazansFileName = sys.argv[1]
+with open(pazansFileName) as file:
+    pazanIds = json.loads(file.read()).keys()
+
+artistStats = dict()
+
+audioCollection = pymongo.MongoClient("goto.reproducible.work")["vk"]["audio"]
+for pazanId in pazanIds:
+    for audio in audioCollection.find({"owner_id": pazanId}, {"artist": 1, "title": 1, "url": 1}):
+        audioName = audio["artist"] + audio["title"]
+        artistStatsItem = artistStats.get(audioName, {
+            "url": audio["url"],
+            "count": 0
+        })
+        artistStatsItem["count"] += 1
+        artistStats[audioName] = artistStatsItem
+
+with open(sys.argv[2], "w", encoding="utf-8") as file:
+    for item in sorted(artistStats.items(), key=lambda item: item[1]["count"], reverse=True):
+        file.write(item[0] + "\n")
+        file.write("\tcount: " + str(item[1]["count"]) + "\n")
+        file.write("\turl: " + str(item[1]["url"]) + "\n")
diff --git a/status-analyzer.py b/status-analyzer.py
index 6e6fbff..c103485 100644
--- a/status-analyzer.py
+++ b/status-analyzer.py
@@ -6,7 +6,7 @@ from nltk.tokenize import RegexpTokenizer
 # load pazans
 pazansGroups = None
 
-pazansFileName = sys.argv[2]
+pazansFileName = sys.argv[1]
 with open(pazansFileName) as file:
     pazansGroups = json.loads(file.read())
 
@@ -16,7 +16,7 @@ statusStats = dict()
 tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
 stemmer = RussianStemmer()
 
-usersFileName = sys.argv[2]
+usersFileName = sys.argv[2]
 with open(usersFileName) as file:
     for line in file:
         user = json.loads(line)