"""Count how often each normalised "artist title" key occurs in a song dump.

Usage: audio-analyzer.py MUSIC_FILE

MUSIC_FILE is read line by line; every non-blank line is parsed as one JSON
object.  The first value of that object is iterated as a collection of song
mappings, each of which must provide the keys "artist" and "title".

For every song the artist and title are tokenised, stemmed, lower-cased and
concatenated into a single key.  The keys are counted and printed, most
frequent first, as ``(key, count)`` tuples.
"""
import json
import sys
from collections import Counter

# Ё/ё lie outside the contiguous А-я code-point range, so they have to be
# listed explicitly; otherwise words containing them are split in two.
TOKEN_PATTERN = r"[A-Za-zА-Яа-яЁё]+"


def song_key(song, tokenize, stem):
    """Return the normalised counting key for one song mapping.

    Args:
        song: Mapping providing the keys "artist" and "title".
        tokenize: Callable turning a string into an iterable of tokens.
        stem: Callable turning one token into its stem.

    Returns:
        The lower-cased stems of all tokens, joined without a separator.

    Raises:
        KeyError: If "artist" or "title" is missing from ``song``.
    """
    text = "{} {}".format(song["artist"], song["title"])
    # Stems are glued together without a separator so that spacing and
    # punctuation differences between otherwise equal names do not matter.
    return "".join(stem(token).lower() for token in tokenize(text))


def count_songs(lines, tokenize, stem):
    """Count normalised song keys over an iterable of JSON lines.

    Args:
        lines: Iterable of strings, one JSON object per non-blank string.
        tokenize: Callable turning a string into an iterable of tokens.
        stem: Callable turning one token into its stem.

    Returns:
        A ``Counter`` mapping each song key to its number of occurrences.

    Raises:
        ValueError: If a non-blank line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
    """
    counter = Counter()
    for line in lines:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        data = json.loads(line)
        # Dict views are not subscriptable on Python 3; take the first value
        # through an iterator and treat an empty object as "no songs".
        songs = next(iter(data.values()), ())
        for song in songs:
            counter[song_key(song, tokenize, stem)] += 1
    return counter


def main(argv=None):
    """Run the analysis for the file named by the first CLI argument.

    Args:
        argv: Argument vector including the program name; defaults to
            ``sys.argv``.

    Returns:
        Process exit status: 0 on success, 2 when the file argument is
        missing.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        program = argv[0] if argv else "audio-analyzer.py"
        print("usage: {} MUSIC_FILE".format(program), file=sys.stderr)
        return 2

    # Imported here so the counting helpers above stay importable (and
    # testable) on machines where NLTK is not installed.
    from nltk import RegexpTokenizer
    from nltk.stem.snowball import RussianStemmer

    tokenizer = RegexpTokenizer(TOKEN_PATTERN)
    stemmer = RussianStemmer()

    # argv[0] is the script itself; the data file is the first real argument.
    # The encoding is pinned so Cyrillic text does not depend on the locale.
    with open(argv[1], encoding="utf-8") as file:
        counter = count_songs(file, tokenizer.tokenize, stemmer.stem)

    for item in counter.most_common():
        print(item)
    return 0


if __name__ == "__main__":
    sys.exit(main())