diff --git a/status-analyzer.py b/status-analyzer.py index 8d50f01..b6901eb 100644 --- a/status-analyzer.py +++ b/status-analyzer.py @@ -1,3 +1,6 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + import json import sys from collections import OrderedDict @@ -13,41 +16,43 @@ def dictWithoutOneKey(d, key): # load pazans -pazansGroups = None +pazans_groups = None -pazansFileName = sys.argv[1] -with open(pazansFileName, "r") as file: - pazansGroups = json.loads(file.read()) +pazans_file_name = sys.argv[1] +with open(pazans_file_name, "r") as file: + pazans_groups = json.loads(file.read()) # analyze statues -statusStats = dict() +status_stats = dict() tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+") -stemmer = RussianStemmer() +stemmer = RussianStemmer() -usersFileName = sys.argv[2] -with open(usersFileName, "r") as file: +users_file_name = sys.argv[2] +with open(users_file_name, "r") as file: for line in file: user = json.loads(line) - id = str(user["_id"]) - if id in pazansGroups: - pazanGroups = pazansGroups[id] - statusText = user.get("status", "") - filteredStatusText = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(statusText)]) - if len(filteredStatusText) > 1: - statusStatsItem = statusStats.get(filteredStatusText, { - "full": statusText, + uid = str(user["_id"]) + if uid in pazans_groups: + pazan_groups = pazans_groups[uid] + status_text = user.get("status", "") + filtered_status_text = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(status_text)]) + if len(filtered_status_text) > 1: + status_stats_item = status_stats.get(filtered_status_text, { + "full": status_text, "count-boys": 0, "count-girls": 0, }) - statusStatsItem["count-boys"] += len(pazanGroups) * (1 if user["sex"] == 2 else 0) - statusStatsItem["count-girls"] += len(pazanGroups) * (1 if user["sex"] == 1 else 0) - statusStats[filteredStatusText] = statusStatsItem + if user["sex"] == 2: + status_stats_item["count-boys"] += len(pazan_groups) + if user["sex"] == 
1: + status_stats_item["count-girls"] += len(pazan_groups) + status_stats[filtered_status_text] = status_stats_item # print result -destFileName = sys.argv[3] -with open(destFileName, "w", encoding="utf8") as file: +dest_file_name = sys.argv[3] +with open(dest_file_name, "w", encoding="utf-8") as file: sortKeyGetter = lambda item: item[1]["count-boys"] + item[1]["count-girls"] - sortedStatues = [item[1] for item in sorted(statusStats.items(), key=sortKeyGetter, reverse=True)] + sortedStatues = [item[1] for item in sorted(status_stats.items(), key=sortKeyGetter, reverse=True)] data = OrderedDict([(item["full"], dictWithoutOneKey(item, "full")) for item in sortedStatues]) file.write(json.dumps(data, ensure_ascii=False, indent=4))