Audio analyzer
This commit is contained in:
@@ -1,21 +1,64 @@
|
|||||||
import json
|
import json
|
||||||
import sys
|
import sys
|
||||||
from collections import Counter
|
|
||||||
|
|
||||||
from nltk import RegexpTokenizer
|
from nltk import RegexpTokenizer, OrderedDict
|
||||||
from nltk.stem.snowball import RussianStemmer
|
from nltk.stem.snowball import RussianStemmer
|
||||||
|
|
||||||
counter = Counter()
|
# Display names for numeric genre ids. Songs are labelled by looking up
# song["genre_id"] here; ids that are not listed fall back to "Other" at the
# lookup site.
# NOTE(review): the ids presumably follow the numbering of the service the
# song dump came from — confirm against its documentation.
genres = {
    1: "Rock",
    2: "Pop",
    3: "Rap & Hip - Hop",
    4: "Easy Listening",
    5: "Dance & House",
    6: "Instrumental",
    7: "Metal",
    21: "Alternative",
    8: "Dubstep",
    9: "Jazz & Blues",
    10: "Drum & Bass",
    11: "Trance",
    12: "Chanson",
    13: "Ethnic",
    14: "Acoustic & Vocal",
    15: "Reggae",
    16: "Classical",
    17: "Indie Pop",
    19: "Speech",
    22: "Electropop & Disco",
    18: "Other",
}
|
||||||
|
|
||||||
|
|
||||||
|
def dictWithoutOneKey(d, key):
    """Return a shallow copy of ``d`` with ``key`` removed.

    The input mapping is never modified. If ``key`` is not present the
    result is simply an unchanged copy instead of raising ``KeyError``.

    Args:
        d: Mapping to copy; must provide ``copy()`` returning a mutable
            mapping (e.g. ``dict`` or ``OrderedDict``).
        key: Key to leave out of the copy.

    Returns:
        A new mapping of the same type as ``d.copy()`` without ``key``.
    """
    new_d = d.copy()
    # A default makes pop() a no-op for a missing key.
    new_d.pop(key, None)
    return new_d
|
||||||
|
|
||||||
|
|
||||||
|
# Per-song statistics keyed by the normalized song name (tokenized, stemmed,
# lower-cased, concatenated). Songs whose names normalize to the same key are
# counted together; the first occurrence supplies name, url and genre.
audioStats = dict()

# Tokens are runs of Latin or Cyrillic letters; everything else is dropped.
# NOTE(review): the ranges А-Я / а-я do not contain Ё/ё, so a word containing
# that letter is split at it — confirm whether this is intended.
tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
stemmer = RussianStemmer()

# Both paths are required; fail early with a readable message instead of an
# IndexError after the whole input has already been processed.
if len(sys.argv) < 3:
    sys.exit("usage: {} <music-file> <dest-file>".format(sys.argv[0]))

musicFileName = sys.argv[1]
destFileName = sys.argv[2]

with open(musicFileName, "r", encoding="utf8") as file:
    for line in file:
        # Blank lines (e.g. a trailing newline) are not valid JSON documents.
        if not line.strip():
            continue
        # The file is already decoded as UTF-8; json.loads() no longer
        # accepts an ``encoding`` keyword on Python 3.9+.
        jsonData = json.loads(line)
        # Only the first value of each JSON object is used; it is iterated
        # as a list of song records. An empty object contributes nothing.
        for song in next(iter(jsonData.values()), []):
            songName = "{} - {}".format(song["artist"], song["title"])
            filteredSongName = "".join(
                stemmer.stem(token).lower()
                for token in tokenizer.tokenize(songName)
            )
            # Names that normalize to at most one character are ignored.
            if len(filteredSongName) <= 1:
                continue
            audioStatsItem = audioStats.get(filteredSongName)
            if audioStatsItem is None:
                audioStatsItem = audioStats[filteredSongName] = {
                    "name": songName,
                    "url": song["url"],
                    # A missing or unknown genre id is reported as "Other".
                    "genre": genres.get(song.get("genre_id"), "Other"),
                    "count": 0,
                }
            audioStatsItem["count"] += 1

# Most frequent songs first; sorted() is stable, so ties keep first-seen order.
sortedSongs = sorted(
    audioStats.values(), key=lambda item: item["count"], reverse=True
)
# Output maps the display name to the remaining fields (url, genre, count).
data = OrderedDict(
    (item["name"], dictWithoutOneKey(item, "name")) for item in sortedSongs
)

# Open the destination only once the data is ready, so a failure above does
# not leave behind a truncated file.
with open(destFileName, "w", encoding="utf8") as file:
    file.write(json.dumps(data, ensure_ascii=False, indent=4))
|
||||||
|
|||||||
Reference in New Issue
Block a user