Files
gotohack/audio-analyzer.py
Oleg Morozenkov 92aa431791 Audio analyzer
2016-02-22 11:25:57 +03:00

22 lines
597 B
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json
import sys
from collections import Counter
from nltk import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer
def song_key(song, tokenize, stem):
    """Return the normalized counting key for one song record.

    The artist and title are joined with a single space, split into tokens
    by ``tokenize``, and every token is passed through ``stem`` and
    lowercased. The resulting stems are concatenated WITHOUT a separator,
    so spelling variants of the same song collapse to one string.

    Args:
        song: Mapping with ``"artist"`` and ``"title"`` entries.
        tokenize: Callable mapping a string to an iterable of tokens.
        stem: Callable mapping one token to its stem.

    Returns:
        The concatenated lowercase stems as a single string.

    Raises:
        KeyError: If ``song`` lacks an ``"artist"`` or ``"title"`` entry.
    """
    text = "{} {}".format(song["artist"], song["title"])
    return "".join(stem(token).lower() for token in tokenize(text))


def count_songs(lines, tokenize, stem):
    """Count normalized song keys across an iterable of JSON lines.

    Each non-blank line must be a JSON object; only the FIRST value of that
    object is used and it is iterated as a sequence of song records.

    Args:
        lines: Iterable of strings, one JSON document per line.
        tokenize: Callable mapping a string to an iterable of tokens.
        stem: Callable mapping one token to its stem.

    Returns:
        A ``Counter`` mapping each song key to its number of occurrences.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    counts = Counter()
    for line in lines:
        # A blank line (e.g. a trailing newline at end of file) is not a
        # JSON document; skip it instead of failing in json.loads.
        if not line.strip():
            continue
        record = json.loads(line)
        # Dict views are not subscriptable on Python 3, so take the first
        # value through an iterator; an empty object contributes no songs.
        songs = next(iter(record.values()), ())
        counts.update(song_key(song, tokenize, stem) for song in songs)
    return counts


if __name__ == "__main__":
    # sys.argv[0] is the path of this script itself; the data file is the
    # first real command-line argument.
    if len(sys.argv) < 2:
        sys.exit("usage: {} MUSIC_FILE".format(sys.argv[0]))

    # NOTE(review): the ranges А-Я / а-я do not include the letters ё / Ё,
    # so words containing them are split at that letter - confirm whether
    # this is intended.
    tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
    stemmer = RussianStemmer()
    musicFileName = sys.argv[1]

    # Decode explicitly as UTF-8 rather than relying on the locale default
    # (json.loads no longer accepts an ``encoding`` argument).
    with open(musicFileName, encoding="utf-8") as file:
        counter = count_songs(file, tokenizer.tokenize, stemmer.stem)

    # Print (key, count) pairs, most frequent first.
    for item in counter.most_common():
        print(item)