From 7b3d97460350c3ef2593d609c6fca4f15905d207 Mon Sep 17 00:00:00 2001 From: Aleksey Lobanov Date: Wed, 24 Feb 2016 00:23:47 +0300 Subject: [PATCH] Some refactoring (also PEP-8) to audio-analyzer.py --- audio-analyzer.py | 74 +++++++++++++++++++++++++---------------------- 1 file changed, 40 insertions(+), 34 deletions(-) diff --git a/audio-analyzer.py b/audio-analyzer.py index 8a9d847..6d7940f 100644 --- a/audio-analyzer.py +++ b/audio-analyzer.py @@ -1,20 +1,23 @@ +#!/usr/bin/python3 +# -*- coding: utf-8 -*- + import json import sys -from nltk import RegexpTokenizer, OrderedDict +from nltk import RegexpTokenizer, OrderedDict from nltk.stem.snowball import RussianStemmer genres = { - 1: "Rock", - 2: "Pop", - 3: "Rap & Hip - Hop", - 4: "Easy Listening", - 5: "Dance & House", - 6: "Instrumental", - 7: "Metal", + 1: "Rock", + 2: "Pop", + 3: "Rap & Hip - Hop", + 4: "Easy Listening", + 5: "Dance & House", + 6: "Instrumental", + 7: "Metal", 21: "Alternative", - 8: "Dubstep", - 9: "Jazz & Blues", + 8: "Dubstep", + 9: "Jazz & Blues", 10: "Drum & Bass", 11: "Trance", 12: "Chanson", @@ -34,31 +37,34 @@ def dictWithoutOneKey(d, key): new_d.pop(key) return new_d +if __name__ == '__main__': + musicFileName = sys.argv[1] + destFileName = sys.argv[2] -audioStats = dict() + tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+") + stemmer = RussianStemmer() -tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+") -stemmer = RussianStemmer() + audioStats = dict() -musicFileName = sys.argv[1] -with open(musicFileName, "r", encoding="utf8") as file: - for line in file: - jsonData = json.loads(line, encoding="utf8") - for song in list(jsonData.values())[0]: - songName = "{} - {}".format(song["artist"], song["title"]) - filteredSongName = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(songName)]) - if len(filteredSongName) > 1: - audioStatsItem = audioStats.get(filteredSongName, { - "name": songName, - "url": song["url"], - "genre": genres.get(song["genre_id"], "Other"), - "count": 0 - }) - audioStatsItem["count"] += 1 - audioStats[filteredSongName] = audioStatsItem + with open(musicFileName, "r", encoding="utf8") as f_music: + for line in f_music: + jsonData = json.loads(line, encoding="utf8") + for song in list(jsonData.values())[0]: + songName = "{} - {}".format(song["artist"], song["title"]) + filteredSongName = "".join( + [stemmer.stem(token).lower() for token in tokenizer.tokenize(songName)] + ) + if len(filteredSongName) > 1: + audioStatsItem = audioStats.get(filteredSongName, { + "name": songName, + "url": song["url"], + "genre": genres.get(song["genre_id"], "Other"), + "count": 0 + }) + audioStatsItem["count"] += 1 + audioStats[filteredSongName] = audioStatsItem -destFileName = sys.argv[2] -with open(destFileName, "w", encoding="utf8") as file: - sortedSongs = [item[1] for item in sorted(audioStats.items(), key=lambda item: item[1]["count"], reverse=True)] - data = OrderedDict([(item["name"], dictWithoutOneKey(item, "name")) for item in sortedSongs]) - file.write(json.dumps(data, ensure_ascii=False, indent=4)) + with open(destFileName, "w", encoding="utf-8") as f_out: + sortedSongs = [item[1] for item in sorted(audioStats.items(), key=lambda item: item[1]["count"], reverse=True)] + data = OrderedDict([(item["name"], dictWithoutOneKey(item, "name")) for item in sortedSongs]) + f_out.write(json.dumps(data, ensure_ascii=False, indent=4))