Status analyzer

2016-02-21 18:22:31 +03:00
parent 6ca91dbba8
commit a8ff1c0459
1 changed files with 43 additions and 0 deletions
--- a/status-analyzer.py
+++ b/status-analyzer.py
@@ -0,0 +1,43 @@
+import json
+import sys
+from nltk.stem.snowball import RussianStemmer
+from nltk.tokenize import RegexpTokenizer
+
+# Load the pazans mapping: a JSON object keyed by user id (as a string); each
+# value is a sized collection (presumably that user's groups -- confirm).
+# Decode explicitly as UTF-8 so parsing does not depend on the locale default.
+pazansFileName = sys.argv[2]
+with open(pazansFileName, encoding="utf-8") as file:
+	pazansGroups = json.load(file)
+
+# analyze statuses: statusStats maps a normalized status key to the first raw
+# status text seen for it ("full") plus boy/girl counters weighted by group count.
+statusStats = dict()
+
+# Ё/ё sit outside the А-я code point range, so list them explicitly (else words containing them get split).
+tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-яЁё]+")
+stemmer = RussianStemmer()
+
+usersFileName = sys.argv[1]
+with open(usersFileName, encoding="utf-8") as file:  # one JSON user object per line
+	for line in file:
+		if not line.strip():
+			continue  # tolerate blank lines, e.g. a trailing newline
+		user = json.loads(line)
+		pazanGroups = pazansGroups.get(str(user["_id"]))
+		if pazanGroups is None:
+			continue  # user is not present in the pazans mapping
+		statusText = user.get("status") or ""  # the field may be missing or null
+		# Key: concatenated lowercased stems, so inflected variants of a status merge.
+		filteredStatusText = "".join(stemmer.stem(token).lower() for token in tokenizer.tokenize(statusText))
+		if len(filteredStatusText) > 1:  # skip empty and single-letter statuses
+			statusStatsItem = statusStats.setdefault(filteredStatusText, {"full": statusText, "count-boys": 0, "count-girls": 0})
+			statusStatsItem["count-boys"] += len(pazanGroups) * (1 if user.get("sex") == 2 else 0)  # sex 2 -> boys
+			statusStatsItem["count-girls"] += len(pazanGroups) * (1 if user.get("sex") == 1 else 0)  # sex 1 -> girls
+
+# Write the report: statuses ordered by total count (boys + girls), highest first.
+with open(sys.argv[3], "w", encoding="utf-8") as file:
+	ranked = sorted(statusStats.values(), key=lambda stats: stats["count-boys"] + stats["count-girls"], reverse=True)
+	for stats in ranked:
+		# Raw status text on its own line, then the two tab-indented counters.
+		file.write("{}\n\tboys: {}\n\tgirls: {}\n".format(stats["full"], stats["count-boys"], stats["count-girls"]))