This commit is contained in:
Omrigan
2016-02-23 01:31:27 +03:00
10 changed files with 267039 additions and 11 deletions

22
all-good-ids.py Normal file
View File

@@ -0,0 +1,22 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
import json
import os
from os.path import join
# Count how many times each user id occurs across every file found under the
# directory given as the first argument, then dump the tally as JSON into the
# file given as the second argument.
all_ids = {}
for dirpath, dirnames, filenames in os.walk(sys.argv[1]):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        # Close every input file promptly: the tree may hold many of them.
        with open(fp) as ids_file:
            for line in ids_file:
                line = line.strip()
                if not line:
                    # int("") raises ValueError; tolerate blank lines.
                    continue
                uid = int(line)
                all_ids[uid] = all_ids.get(uid, 0) + 1
# The context manager guarantees the JSON is flushed and the handle closed.
with open(sys.argv[2], 'w') as f_out:
    f_out.write(json.dumps(all_ids))

64
audio-analyzer.py Normal file
View File

@@ -0,0 +1,64 @@
import json
import sys
from nltk import RegexpTokenizer, OrderedDict
from nltk.stem.snowball import RussianStemmer
# Numeric genre id -> human-readable genre name.
# NOTE(review): ids presumably follow the VK audio genre list - confirm.
# Ids missing from this table are expected to be resolved by the caller
# (e.g. via ``genres.get(genre_id, "Other")``).
genres = {
    1: "Rock",
    2: "Pop",
    3: "Rap & Hip - Hop",
    4: "Easy Listening",
    5: "Dance & House",
    6: "Instrumental",
    7: "Metal",
    21: "Alternative",
    8: "Dubstep",
    9: "Jazz & Blues",
    10: "Drum & Bass",
    11: "Trance",
    12: "Chanson",
    13: "Ethnic",
    14: "Acoustic & Vocal",
    15: "Reggae",
    16: "Classical",
    17: "Indie Pop",
    19: "Speech",
    22: "Electropop & Disco",
    18: "Other",
}
def dictWithoutOneKey(d, key):
    """Return a shallow copy of ``d`` that lacks ``key``.

    ``d`` itself is left untouched. Raises ``KeyError`` when ``key`` is not
    present in ``d``.
    """
    trimmed = d.copy()
    del trimmed[key]
    return trimmed
# Aggregate how often each song occurs in the music dump (first argument) and
# write the songs, most frequent first, as JSON to the second argument.
#
# Songs are grouped by a normalised key: the "artist - title" string is
# tokenised into Latin/Cyrillic words, each word is stemmed and lower-cased,
# and the results are concatenated.
audioStats = dict()
tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
stemmer = RussianStemmer()

musicFileName = sys.argv[1]
with open(musicFileName, "r", encoding="utf8") as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines instead of failing in json.loads().
            continue
        # open() already decodes the file as UTF-8. json.loads() dropped its
        # ``encoding`` keyword in Python 3.9, so passing it raises TypeError.
        jsonData = json.loads(line)
        # Only the first value of the per-line object is read; it is iterated
        # as a list of song dicts.
        for song in list(jsonData.values())[0]:
            songName = "{} - {}".format(song["artist"], song["title"])
            filteredSongName = "".join(
                [stemmer.stem(token).lower() for token in tokenizer.tokenize(songName)]
            )
            # Skip names that normalise to nothing or to a single character.
            if len(filteredSongName) > 1:
                # The first occurrence of a key fixes the displayed name, url
                # and genre; later occurrences only bump the counter.
                audioStatsItem = audioStats.get(filteredSongName, {
                    "name": songName,
                    "url": song["url"],
                    "genre": genres.get(song["genre_id"], "Other"),
                    "count": 0
                })
                audioStatsItem["count"] += 1
                audioStats[filteredSongName] = audioStatsItem

destFileName = sys.argv[2]
with open(destFileName, "w", encoding="utf8") as file:
    sortedSongs = [item[1] for item in sorted(audioStats.items(), key=lambda item: item[1]["count"], reverse=True)]
    # Key the output by display name; OrderedDict keeps the by-count ordering.
    data = OrderedDict([(item["name"], dictWithoutOneKey(item, "name")) for item in sortedSongs])
    file.write(json.dumps(data, ensure_ascii=False, indent=4))

47
audio-fetcher.py Normal file
View File

@@ -0,0 +1,47 @@
import json
import sys
import time
import vk_api
# Load the pazans JSON object (first argument) and list its keys, ordered so
# that the keys whose value is longest come first.
pazansFileName = sys.argv[1]
with open(pazansFileName, "r") as file:
    jsonData = json.load(file)
    pazanIds = [
        pazanId
        for pazanId, _ in sorted(
            jsonData.items(),
            key=lambda entry: len(entry[1]),
            reverse=True,
        )
    ]
# getting music
def captcha_handler(captcha):
    """Ask the operator to solve a captcha and retry the blocked request.

    Prints the captcha URL in an interactive prompt, strips surrounding
    whitespace from the typed answer and returns whatever
    ``captcha.try_again(answer)`` returns.
    """
    prompt = "Enter Captcha {0}: ".format(captcha.get_url())
    answer = input(prompt).strip()
    return captcha.try_again(answer)
# Fetch the audio list of every pazan and append one JSON line per user,
# shaped {user_id: [songs]}, to the output file given as the second argument.
vk = vk_api.VkApi(token=sys.argv[3], app_id=sys.argv[4], captcha_handler=captcha_handler)

# Optional fifth argument: position in pazanIds to resume from. The output
# file is opened in append mode, so a resumed run must actually skip the
# users handled earlier. enumerate(start=...) on its own only renumbers the
# counter and would re-fetch (and duplicate) everyone from the beginning.
startIndex = int(sys.argv[5]) if len(sys.argv) > 5 else 0
for index, pazanId in enumerate(pazanIds[startIndex:], start=startIndex):
    done = False
    # Keep retrying the same user until the request succeeds or is skipped.
    while not done:
        try:
            print(index, pazanId)
            # jsonData = vk.method("audio.get", {"owner_id": pazanId, "need_user": 0, "count": 100})
            jsonData = vk.method("execute.getMusic", {"id": pazanId})
            pazanSongs = [
                {
                    "artist": audio["artist"],
                    "title": audio["title"],
                    "genre_id": audio.get("genre_id", None),
                    "url": audio["url"],
                }
                for audio in jsonData["items"]
            ]
            # Append immediately so finished users survive a later crash.
            with open(sys.argv[2], "a", encoding="utf-8") as file:
                file.write(json.dumps({pazanId: pazanSongs}, ensure_ascii=False) + "\n")
            done = True
        except vk_api.ApiError as e:
            # NOTE(review): code meanings below follow the VK API error
            # list (9 = flood control, 15/201 = access denied) - confirm.
            if e.code == 9:
                print("waiting")
                time.sleep(60)
            elif e.code in (201, 15):
                # Audio is not accessible for this user: skip them.
                done = True
            else:
                # Bare raise keeps the original traceback intact.
                raise

68
get-users-addresses.py Normal file
View File

@@ -0,0 +1,68 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
get_users_addresses.py SCHOOL_IDS_FILE ADDRESSES_FILE USERS_IDS_JSON FULL_BASE_FILE OUTPUT_FILE
"""
import sys
import csv
import json
from urllib.parse import quote_plus
import requests
def getCoords(name):
    """Geocode ``name`` through the Yandex geocoder and return a tuple of floats.

    The ``Point.pos`` string of the first returned match is split on
    whitespace and its values are returned in reversed order.
    NOTE(review): presumably this turns "lon lat" into (lat, lon) - confirm
    against the geocoder documentation.

    Raises ``IndexError`` when the geocoder returns no match; errors from
    ``requests`` and from JSON decoding propagate unchanged.
    """
    url = 'https://geocode-maps.yandex.ru/1.x/?geocode=' + quote_plus(name) + '&format=json'
    payload = requests.get(url).json()
    first_match = payload['response']['GeoObjectCollection']['featureMember'][0]
    pos = first_match['GeoObject']['Point']['pos']
    return tuple(float(part) for part in reversed(pos.split()))
# Build {school id (int, column 3) -> column 0 value} from the CSV given as
# the first argument. Column 0 is later used as the school name.
school_ids = {}
cnt = 0
with open(sys.argv[1], 'r') as f:
    for cnt, row in enumerate(csv.reader(f), start=1):
        if cnt == 1:
            # The first row is never loaded (presumably a header line).
            continue
        # Rows with an empty name or an empty id cannot be mapped.
        if not row[0].strip() or not row[3].strip():
            continue
        school_ids[int(row[3])] = row[0]
# Build {column 0 value -> {"coord": ..., "address": column 1}} from the CSV
# given as the second argument. Coordinates come from columns 3 and 2 when
# either is filled in; otherwise the address is geocoded on the fly.
addresses = {}
cnt = 0
with open(sys.argv[2], 'r') as f:
    for cnt, row in enumerate(csv.reader(f), start=1):
        if cnt == 1:
            # The first row is never loaded (presumably a header line).
            continue
        addresses[row[0]] = {
            "coord": (
                (row[3], row[2])
                if (row[3] + row[2]).strip()
                else getCoords('Москва ' + row[1])
            ),
            "address": row[1],
        }
# Attach a school address to every pazan that has one:
#   argv[3] - JSON object whose keys are the pazan user ids
#   argv[4] - full user base, one JSON document per line
#   argv[5] - output file, receives {user id: address record} as JSON
# Every file is opened through a context manager so the handles are closed
# (and the output flushed) deterministically.
with open(sys.argv[3]) as pazans_file:
    pazans = json.load(pazans_file)
pazan_ids = {int(i) for i in pazans}
pazan_schools = {}
print(sys.argv[4])
with open(sys.argv[4]) as users_file:
    for line in users_file:
        json_line = json.loads(line)
        uid = int(json_line['_id'])
        if uid not in pazan_ids:
            continue
        # A missing, empty or null school list means nothing to map.
        schools = json_line.get('schools')
        if not schools:
            continue
        # Only the last entry of the user's school list is considered.
        school_id = int(schools[-1]['id'])
        if school_id not in school_ids:
            continue
        school_name = school_ids[school_id]
        if school_name not in addresses:
            continue
        pazan_schools[uid] = addresses[school_name]
with open(sys.argv[5], 'w') as f_out:
    f_out.write(json.dumps(pazan_schools))
print("There are {} pazans with school".format(len(pazan_schools)))

33
group-ids-downloader.py Normal file
View File

@@ -0,0 +1,33 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
group_ids_downloader.py VK_LOGIN VK_PASSWORD APP_ID INPUT_FILENAME
where INPUT_FILENAME is file with one id of group per line
'''
import sys
import vk_api
def getIdsByGroup(group_id):
    """Return the member ids of ``group_id``, fetched 1000 at a time.

    The first ``groups.getMembers`` response supplies the total member count;
    further pages are requested at offsets 1000, 2000, ... below that total.
    Uses the module-level ``vk`` session.
    """
    first_page = vk.method("groups.getMembers", {"group_id":group_id,"count":1000})
    member_ids = []
    member_ids.extend(first_page['items'])
    total = first_page['count']
    # Follow-up pages request an explicit ascending-id sort.
    for page_offset in range(1000, total, 1000):
        page = vk.method("groups.getMembers", {"group_id":group_id,
                         "count":1000,"offset":page_offset, "sort":"id_asc"})
        member_ids.extend(page['items'])
    return member_ids
# Log in to VK, then dump the member ids of every group listed in the input
# file (fourth argument, one group id per line) to out/<group id>, one member
# id per line. The out/ directory must already exist.
vk_login, vk_password = sys.argv[1], sys.argv[2]
vk = vk_api.VkApi(vk_login, vk_password, app_id=sys.argv[3])
vk.authorization()
with open(sys.argv[4]) as groups_file:
    for group_id in groups_file:
        group_id = group_id.strip()
        if not group_id:
            # A blank line would otherwise trigger an API call with an empty id.
            continue
        good_ids = getIdsByGroup(group_id)
        # Close each output file explicitly so its contents are flushed.
        with open('out/' + str(group_id), 'w') as out_file:
            out_file.write('\n'.join(str(i) for i in good_ids))

View File

@@ -1,6 +1,7 @@
import os
import re import re
import sys import sys
import os
import pymongo import pymongo
dirWithIds = sys.argv[1] dirWithIds = sys.argv[1]
@@ -36,7 +37,7 @@ else:
for fileName in os.listdir(dirWithIds): for fileName in os.listdir(dirWithIds):
print("parsing", fileName) print("parsing", fileName)
with open(os.path.join(dirWithIds, fileName)) as file: with open(os.path.join(dirWithIds, fileName), "r") as file:
for line in file: for line in file:
id = int(line) id = int(line)
if id in ids: if id in ids:

199237
results/pazan_music.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

BIN
results/presentation.pdf Normal file

Binary file not shown.

View File

@@ -1,13 +1,22 @@
import json import json
import sys import sys
from collections import OrderedDict
from nltk.stem.snowball import RussianStemmer from nltk.stem.snowball import RussianStemmer
from nltk.tokenize import RegexpTokenizer from nltk.tokenize import RegexpTokenizer
def dictWithoutOneKey(d, key):
    """Return a shallow copy of ``d`` that lacks ``key``.

    ``d`` itself is left untouched. Raises ``KeyError`` when ``key`` is not
    present in ``d``.
    """
    trimmed = d.copy()
    del trimmed[key]
    return trimmed
# load pazans # load pazans
pazansGroups = None pazansGroups = None
pazansFileName = sys.argv[2] pazansFileName = sys.argv[1]
with open(pazansFileName) as file: with open(pazansFileName, "r") as file:
pazansGroups = json.loads(file.read()) pazansGroups = json.loads(file.read())
# analyze statues # analyze statues
@@ -16,8 +25,8 @@ statusStats = dict()
tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+") tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
stemmer = RussianStemmer() stemmer = RussianStemmer()
usersFileName = sys.argv[1] usersFileName = sys.argv[2]
with open(usersFileName) as file: with open(usersFileName, "r") as file:
for line in file: for line in file:
user = json.loads(line) user = json.loads(line)
id = str(user["_id"]) id = str(user["_id"])
@@ -36,8 +45,9 @@ with open(usersFileName) as file:
statusStats[filteredStatusText] = statusStatsItem statusStats[filteredStatusText] = statusStatsItem
# print result # print result
with open(sys.argv[3], "w", encoding="utf-8") as file: destFileName = sys.argv[3]
for item in sorted(statusStats.items(), key=lambda item: item[1]["count-boys"] + item[1]["count-girls"], reverse=True): with open(destFileName, "w", encoding="utf8") as file:
file.write(item[1]["full"] + "\n") sortKeyGetter = lambda item: item[1]["count-boys"] + item[1]["count-girls"]
file.write("\tboys: " + str(item[1]["count-boys"]) + "\n") sortedStatues = [item[1] for item in sorted(statusStats.items(), key=sortKeyGetter, reverse=True)]
file.write("\tgirls: " + str(item[1]["count-girls"]) + "\n") data = OrderedDict([(item["full"], dictWithoutOneKey(item, "full")) for item in sortedStatues])
file.write(json.dumps(data, ensure_ascii=False, indent=4))