Merge branch 'master' of https://github.com/AlekseyLobanov/gotohack
all-good-ids.py (new file, 22 lines)
@@ -0,0 +1,22 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import sys
import json
import os

# Count how often each user id appears across all id files
# in the directory given as the first argument.
all_ids = {}

for dirpath, dirnames, filenames in os.walk(sys.argv[1]):
    for f in filenames:
        fp = os.path.join(dirpath, f)
        with open(fp) as ids_file:
            for uid in ids_file:
                uid = int(uid)
                if uid in all_ids:
                    all_ids[uid] += 1
                else:
                    all_ids[uid] = 1

# Write the {id: occurrence count} mapping to the output file (second argument).
with open(sys.argv[2], 'w') as f_out:
    f_out.write(json.dumps(all_ids))
audio-analyzer.py (new file, 64 lines)
@@ -0,0 +1,64 @@
import json
import sys
from collections import OrderedDict

from nltk.tokenize import RegexpTokenizer
from nltk.stem.snowball import RussianStemmer

# VK audio genre ids and their human-readable names.
genres = {
    1: "Rock",
    2: "Pop",
    3: "Rap & Hip-Hop",
    4: "Easy Listening",
    5: "Dance & House",
    6: "Instrumental",
    7: "Metal",
    21: "Alternative",
    8: "Dubstep",
    9: "Jazz & Blues",
    10: "Drum & Bass",
    11: "Trance",
    12: "Chanson",
    13: "Ethnic",
    14: "Acoustic & Vocal",
    15: "Reggae",
    16: "Classical",
    17: "Indie Pop",
    19: "Speech",
    22: "Electropop & Disco",
    18: "Other"
}


def dictWithoutOneKey(d, key):
    new_d = d.copy()
    new_d.pop(key)
    return new_d


audioStats = dict()

tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
stemmer = RussianStemmer()

# Each input line is a JSON object {user_id: [songs]} produced by audio-fetcher.py.
musicFileName = sys.argv[1]
with open(musicFileName, "r", encoding="utf8") as file:
    for line in file:
        jsonData = json.loads(line)
        for song in list(jsonData.values())[0]:
            songName = "{} - {}".format(song["artist"], song["title"])
            # Normalize the name (letters only, stemmed, lowercased) so that
            # near-duplicate titles are counted together.
            filteredSongName = "".join([stemmer.stem(token).lower() for token in tokenizer.tokenize(songName)])
            if len(filteredSongName) > 1:
                audioStatsItem = audioStats.get(filteredSongName, {
                    "name": songName,
                    "url": song["url"],
                    "genre": genres.get(song["genre_id"], "Other"),
                    "count": 0
                })
                audioStatsItem["count"] += 1
                audioStats[filteredSongName] = audioStatsItem

# Write the songs sorted by popularity as pretty-printed JSON.
destFileName = sys.argv[2]
with open(destFileName, "w", encoding="utf8") as file:
    sortedSongs = [item[1] for item in sorted(audioStats.items(), key=lambda item: item[1]["count"], reverse=True)]
    data = OrderedDict([(item["name"], dictWithoutOneKey(item, "name")) for item in sortedSongs])
    file.write(json.dumps(data, ensure_ascii=False, indent=4))
audio-fetcher.py (new file, 47 lines)
@@ -0,0 +1,47 @@
import json
import sys
import time

import vk_api

# Load pazan ids from the input JSON, sorted by the size of each id's value list (largest first).
pazanIds = None
pazansFileName = sys.argv[1]
with open(pazansFileName, "r") as file:
    jsonData = json.loads(file.read())
    pazanIds = [item[0] for item in sorted(jsonData.items(), key=lambda item: len(item[1]), reverse=True)]

# Fetch the music of every id.
def captcha_handler(captcha):
    key = input("Enter Captcha {0}: ".format(captcha.get_url())).strip()
    return captcha.try_again(key)

vk = vk_api.VkApi(token=sys.argv[3], app_id=sys.argv[4], captcha_handler=captcha_handler)

# The optional fifth argument only offsets the printed index; it does not skip ids.
for index, pazanId in enumerate(pazanIds, start=(int(sys.argv[5]) if len(sys.argv) > 5 else 0)):
    done = False
    while not done:
        try:
            print(index, pazanId)
            pazanSongs = []
            # jsonData = vk.method("audio.get", {"owner_id": pazanId, "need_user": 0, "count": 100})
            jsonData = vk.method("execute.getMusic", {"id": pazanId})
            for audio in jsonData["items"]:
                pazanSong = {
                    "artist": audio["artist"],
                    "title": audio["title"],
                    "genre_id": audio.get("genre_id", None),
                    "url": audio["url"],
                }
                pazanSongs.append(pazanSong)
            # Append one JSON object per user: {id: [songs]}.
            with open(sys.argv[2], "a", encoding="utf-8") as file:
                file.write(json.dumps({pazanId: pazanSongs}, ensure_ascii=False) + "\n")
            done = True
        except vk_api.ApiError as e:
            if e.code == 9:
                # Flood control: wait and retry the same id.
                print("waiting")
                time.sleep(60)
            elif e.code == 201 or e.code == 15:
                # Access denied: skip this user.
                done = True
            else:
                raise e
get-users-addresses.py (new file, 68 lines)
@@ -0,0 +1,68 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
get_users_addresses.py SCHOOL_IDS_FILE ADDRESSES_FILE USERS_IDS_JSON FULL_BASE_FILE OUTPUT_FILE
"""
import sys
import csv
import json
from urllib.parse import quote_plus

import requests


def getCoords(name):
    # Resolve an address to (lat, lon) via the Yandex geocoder.
    r = requests.get('https://geocode-maps.yandex.ru/1.x/?geocode=' + quote_plus(name) + '&format=json')
    points = r.json()['response']['GeoObjectCollection']['featureMember'][0]['GeoObject']['Point']['pos'].split()[::-1]
    return tuple([float(x) for x in points])


# Map school id -> school name from the SCHOOL_IDS_FILE CSV (header row skipped).
school_ids = {}
cnt = 0
with open(sys.argv[1], 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        cnt += 1
        if cnt == 1:
            continue
        if len(row[0].strip()) == 0 or len(row[3].strip()) == 0:
            continue
        school_ids[int(row[3])] = row[0]

# Map school name -> {"coord", "address"}; rows without coordinates are geocoded
# with a "Москва" (Moscow) prefix to keep the lookup inside the city.
addresses = {}
cnt = 0
with open(sys.argv[2], 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        cnt += 1
        if cnt == 1:
            continue
        if len((row[3] + row[2]).strip()) > 0:
            addresses[row[0]] = {"coord": (row[3], row[2]), "address": row[1]}
        else:
            addresses[row[0]] = {"coord": getCoords('Москва ' + row[1]), "address": row[1]}

pazans = json.loads(open(sys.argv[3]).read())

pazan_ids = set([int(i) for i in pazans.keys()])

pazan_schools = {}

# Walk the full user base and keep the address of the last listed school
# for every user in the pazan id set.
print(sys.argv[4])
for line in open(sys.argv[4]):
    json_line = json.loads(line)
    uid = int(json_line['_id'])
    if uid in pazan_ids:
        if 'schools' not in json_line or len(json_line['schools']) == 0:
            continue
        school_id = int(json_line['schools'][-1]['id'])
        if school_id not in school_ids:
            continue
        school_name = school_ids[school_id]
        if school_name not in addresses:
            continue
        pazan_schools[uid] = addresses[school_name]

with open(sys.argv[5], 'w') as f_out:
    f_out.write(json.dumps(pazan_schools))
print("There are {} pazans with school".format(len(pazan_schools)))
group-ids-downloader.py (new file, 33 lines)
@@ -0,0 +1,33 @@
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''
group_ids_downloader.py VK_LOGIN VK_PASSWORD APP_ID INPUT_FILENAME

where INPUT_FILENAME is a file with one group id per line
'''
import sys

import vk_api


def getIdsByGroup(group_id):
    # Page through groups.getMembers 1000 ids at a time.
    ids = []
    res = vk.method("groups.getMembers", {"group_id": group_id, "count": 1000})
    count = res['count'] - 1000
    ids += res['items']
    cur_offset = 1000
    while count > 0:
        res = vk.method("groups.getMembers", {"group_id": group_id,
                        "count": 1000, "offset": cur_offset, "sort": "id_asc"})
        count = count - 1000
        cur_offset += 1000
        ids += res['items']
    return ids


vk_login, vk_password = sys.argv[1], sys.argv[2]
vk = vk_api.VkApi(vk_login, vk_password, app_id=sys.argv[3])

vk.authorization()

# Dump the member ids of every group, one output file per group, into out/.
for group_id in open(sys.argv[4]):
    group_id = group_id.strip()
    good_ids = getIdsByGroup(group_id)
    with open('out/' + str(group_id), 'w') as f:
        f.write('\n'.join([str(i) for i in good_ids]))
@@ -1,6 +1,7 @@
+import os
 import re
 import sys
-import os
+
 import pymongo
 
 dirWithIds = sys.argv[1]
@@ -36,7 +37,7 @@ else:
     for fileName in os.listdir(dirWithIds):
         print("parsing", fileName)
 
-        with open(os.path.join(dirWithIds, fileName)) as file:
+        with open(os.path.join(dirWithIds, fileName), "r") as file:
            for line in file:
                id = int(line)
                if id in ids:
results/pazan_music.json (new file, 199237 lines)
File diff suppressed because it is too large
results/pazanskaya_mudrost.json (new file, 67546 lines)
File diff suppressed because it is too large
results/presentation.pdf (new binary file)
Binary file not shown.
@@ -1,13 +1,22 @@
 import json
 import sys
+from collections import OrderedDict
+
 from nltk.stem.snowball import RussianStemmer
 from nltk.tokenize import RegexpTokenizer
 
+
+def dictWithoutOneKey(d, key):
+    new_d = d.copy()
+    new_d.pop(key)
+    return new_d
+
+
 # load pazans
 pazansGroups = None
 
-pazansFileName = sys.argv[2]
-with open(pazansFileName) as file:
+pazansFileName = sys.argv[1]
+with open(pazansFileName, "r") as file:
     pazansGroups = json.loads(file.read())
 
 # analyze statuses
@@ -16,8 +25,8 @@ statusStats = dict()
 tokenizer = RegexpTokenizer(r"[A-Za-zА-Яа-я]+")
 stemmer = RussianStemmer()
 
-usersFileName = sys.argv[1]
-with open(usersFileName) as file:
+usersFileName = sys.argv[2]
+with open(usersFileName, "r") as file:
     for line in file:
         user = json.loads(line)
         id = str(user["_id"])
@@ -36,8 +45,9 @@ with open(usersFileName) as file:
             statusStats[filteredStatusText] = statusStatsItem
 
 # print result
-with open(sys.argv[3], "w", encoding="utf-8") as file:
-    for item in sorted(statusStats.items(), key=lambda item: item[1]["count-boys"] + item[1]["count-girls"], reverse=True):
-        file.write(item[1]["full"] + "\n")
-        file.write("\tboys: " + str(item[1]["count-boys"]) + "\n")
-        file.write("\tgirls: " + str(item[1]["count-girls"]) + "\n")
+destFileName = sys.argv[3]
+with open(destFileName, "w", encoding="utf8") as file:
+    sortKeyGetter = lambda item: item[1]["count-boys"] + item[1]["count-girls"]
+    sortedStatues = [item[1] for item in sorted(statusStats.items(), key=sortKeyGetter, reverse=True)]
+    data = OrderedDict([(item["full"], dictWithoutOneKey(item, "full")) for item in sortedStatues])
+    file.write(json.dumps(data, ensure_ascii=False, indent=4))