#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import json
import sys
from collections import OrderedDict

from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import RegexpTokenizer


def dict_without_one_key(d, key):
    """Return a shallow copy of d with the given key removed."""
    new_d = d.copy()
    new_d.pop(key)
    return new_d


# Load pazans: a JSON object mapping user-id strings to lists of groups.
pazans_file_name = sys.argv[1]
with open(pazans_file_name, "r", encoding="utf-8") as pazans_file:
    pazans_groups = json.load(pazans_file)

# Analyze statuses: normalize each status to a stemmed key so that
# morphological variants of the same phrase are counted together.
status_stats = dict()
tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")  # keep only Latin/Cyrillic words
stemmer = RussianStemmer()

users_file_name = sys.argv[2]
with open(users_file_name, "r", encoding="utf-8") as users_file:
    for line in users_file:
        user = json.loads(line)  # one JSON user object per line
        uid = str(user["_id"])
        if uid not in pazans_groups:
            continue
        pazan_groups = pazans_groups[uid]
        status_text = user.get("status", "")
        # Normalized key: lowercased stems of all tokens, concatenated.
        filtered_status_text = "".join(
            stemmer.stem(token).lower() for token in tokenizer.tokenize(status_text)
        )
        if len(filtered_status_text) > 1:
            status_stats_item = status_stats.get(filtered_status_text, {
                "full": status_text,  # first-seen original text for this key
                "count-boys": 0,
                "count-girls": 0,
            })
            # VK-style sex codes: 1 = female, 2 = male.
            if user["sex"] == 2:
                status_stats_item["count-boys"] += len(pazan_groups)
            if user["sex"] == 1:
                status_stats_item["count-girls"] += len(pazan_groups)
            status_stats[filtered_status_text] = status_stats_item

# Write the result, sorted by total count in descending order.
dest_file_name = sys.argv[3]
with open(dest_file_name, "w", encoding="utf-8") as f_out:
    def sort_key_getter(item):
        return item[1]["count-boys"] + item[1]["count-girls"]

    sorted_statuses = [
        item[1] for item in sorted(status_stats.items(), key=sort_key_getter, reverse=True)
    ]
    data = OrderedDict(
        (item["full"], dict_without_one_key(item, "full")) for item in sorted_statuses
    )
    f_out.write(json.dumps(data, ensure_ascii=False, indent=4))
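
# Usage sketch (file names below are hypothetical; formats are inferred
# from how the script reads its three positional arguments above):
#
#   python3 status_stats.py pazans.json users.jsonl stats.json
#
# pazans.json  - JSON object mapping user-id strings to lists of groups
# users.jsonl  - one JSON user object per line, with "_id", "sex", "status"
# stats.json   - output: statuses keyed by original text, sorted by total count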