From 6b0b3c9300bd60388c079486e1567bcec8d6a7a9 Mon Sep 17 00:00:00 2001
From: Oleg Morozenkov
Date: Mon, 22 Feb 2016 13:16:19 +0300
Subject: [PATCH] Audio analyzer

---
 audio-analyzer.py | 76 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 66 insertions(+), 10 deletions(-)

diff --git a/audio-analyzer.py b/audio-analyzer.py
index 823794c..8a9d847 100644
--- a/audio-analyzer.py
+++ b/audio-analyzer.py
@@ -1,21 +1,77 @@
 import json
 import sys
 
-from collections import Counter
+from collections import OrderedDict
 from nltk import RegexpTokenizer
 from nltk.stem.snowball import RussianStemmer
 
-counter = Counter()
+# Display names for the numeric "genre_id" values read from the input;
+# ids missing from this table are reported as "Other".
+genres = {
+    1: "Rock",
+    2: "Pop",
+    3: "Rap & Hip - Hop",
+    4: "Easy Listening",
+    5: "Dance & House",
+    6: "Instrumental",
+    7: "Metal",
+    21: "Alternative",
+    8: "Dubstep",
+    9: "Jazz & Blues",
+    10: "Drum & Bass",
+    11: "Trance",
+    12: "Chanson",
+    13: "Ethnic",
+    14: "Acoustic & Vocal",
+    15: "Reggae",
+    16: "Classical",
+    17: "Indie Pop",
+    19: "Speech",
+    22: "Electropop & Disco",
+    18: "Other"
+}
+
+
+def dictWithoutOneKey(d, key):
+    """Return a shallow copy of d without key (KeyError if key is absent)."""
+    new_d = d.copy()
+    new_d.pop(key)
+    return new_d
+
+
+# De-duplication key -> {"name", "url", "genre", "count"}; name, url and
+# genre are taken from the first song seen for that key.
+audioStats = dict()
+
 tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
 stemmer = RussianStemmer()
 
-musicFileName = sys.argv[0]
-with open(musicFileName, "r") as file:
+musicFileName = sys.argv[1]
+with open(musicFileName, "r", encoding="utf8") as file:
     for line in file:
-        jsonData = json.loads(line, encoding="utf8")
-        for song in jsonData.values()[0]:
-            key = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize("{} {}".format(song["artist"], song["title"]))])
-            counter[key] += 1
+        # The file is already decoded as UTF-8 text, and json.loads() rejects
+        # the "encoding" keyword since Python 3.9, so it is not passed here.
+        jsonData = json.loads(line)
+        for song in list(jsonData.values())[0]:
+            songName = "{} - {}".format(song["artist"], song["title"])
+            # Stem and lowercase every word, then concatenate them without
+            # separators; the result is the de-duplication key.
+            filteredSongName = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(songName)])
+            if len(filteredSongName) > 1:
+                audioStatsItem = audioStats.get(filteredSongName, {
+                    "name": songName,
+                    "url": song["url"],
+                    # A song without "genre_id" falls back to "Other"
+                    # instead of aborting the run with KeyError.
+                    "genre": genres.get(song.get("genre_id"), "Other"),
+                    "count": 0
+                })
+                audioStatsItem["count"] += 1
+                audioStats[filteredSongName] = audioStatsItem
 
-for item in counter.most_common():
-    print(item)
+destFileName = sys.argv[2]
+with open(destFileName, "w", encoding="utf8") as file:
+    # Highest count first; each entry is keyed by its display name.
+    sortedSongs = [item[1] for item in sorted(audioStats.items(), key=lambda item: item[1]["count"], reverse=True)]
+    data = OrderedDict([(item["name"], dictWithoutOneKey(item, "name")) for item in sortedSongs])
+    file.write(json.dumps(data, ensure_ascii=False, indent=4))