diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..ef64266
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+/minidata/data/*.pkl
\ No newline at end of file
diff --git a/data/clean/tag.py b/data/clean/tag.py
new file mode 100644
index 0000000..5c37e49
--- /dev/null
+++ b/data/clean/tag.py
@@ -0,0 +1,75 @@
+# The tag table is very noisy: roughly 270k distinct tags to start with
+
+import json
+import mysql.connector
+from tqdm import tqdm
+import difflib
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+sql = "SELECT id, value FROM tag"
+cursor.execute(sql)
+tags = cursor.fetchall()
+tag_map = {tag[0]: tag[1] for tag in tags}
+
+ids = [tag[0] for tag in tags]
+
+tags = [tag[1] for tag in tags]
+
+tag_lst = []
+tag_dict = {}
+
+max_step = 10000
+threshold = 0.8
+
+with open('../minidata/data/tag_set.pkl', 'rb') as f:
+    tag_set = pickle.load(f)
+tags = list(tag_set)  # restrict to the tag ids actually used in the mini dataset
+tag_map = {i: tag_map[i] for i in tag_set}
+with open('../minidata/data/tag_map.pkl', 'wb') as f:
+    pickle.dump(tag_map, f)
+print(len(tags))
+# 270k tags: a naive pairwise double loop is far too slow
+# After optimization the runtime dropped from 3h+ to 24s
+# First preprocess the values: strip spaces, '-', apostrophes, etc.
+# tags = list(map(lambda x: x.replace(' ', '').replace('-', '').replace('\'', ''), tags))
+# Then sort so that similar values end up next to each other
+tags.sort(key=lambda i: tag_map[i].lower())
+# In the loop it is then enough to compare against the most recent max_step kept values
+
+cnt = 0
+for i in tqdm(range(len(tags))):
+    flag = True
+    x = tag_map[tags[i]]
+    for tag in tag_lst[-max_step:]:
+        # if ids[i] not in tag_set:
+        #     continue
+        # else:
+        #     cnt += 1
+
+        if difflib.SequenceMatcher(None, x.lower(), tag.lower()).quick_ratio() >= threshold:
+            flag = False
+            tag_dict[x] = tag
+            break
+
+    if flag:
+        tag_lst.append(x)
+        tag_dict[x] = x
+
+print(len(tag_lst))
+print(len(set(tag_dict.values())))
+import pickle
+
+with open('tag_dict.pkl', 'wb') as f:
+    pickle.dump(tag_dict, f)
+
+
+# print(len(tags))
+# print('\n'.join(tags))
+print(cnt)
\ No newline at end of file
diff --git a/data/minidata/load.py b/data/minidata/load.py
new file mode 100644
index 0000000..36dd92e
--- /dev/null
+++ b/data/minidata/load.py
@@ -0,0 +1,28 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+file = "data/user_ids.pkl"
+with open(file, "rb") as f:
+    user_ids = pickle.load(f)
+
+user_ids = set(user_ids)
+
+sql = "select distinct creator_id from playlist"
+cursor.execute(sql)
+creators = cursor.fetchall()
+creators = [creator[0] for creator in creators]
+
+users = set(creators) & user_ids
+print(len(users))
+print(len(creators))
+print(len(user_ids))
\ No newline at end of file
diff --git a/data/minidata/read.py b/data/minidata/read.py
new file mode 100644
index 0000000..799b70e
--- /dev/null
+++ b/data/minidata/read.py
@@ -0,0 +1,24 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+track_ids = pickle.load(open("data/track_ids.pkl", "rb"))
+
+sql = f'SELECT DISTINCT user_id FROM preference WHERE track_id IN ({",".join(map(str, track_ids))})'
+# print(sql)
+cursor.execute(sql)
+user_ids = cursor.fetchall()
+user_ids = [row[0] for row in user_ids]
+
+
+with open('data/user_ids.pkl', 'wb') as f:
+    pickle.dump(user_ids, f)
\ No newline at end of file
diff --git a/data/minidata/test.py b/data/minidata/test.py
new file mode 100644
index 0000000..1dfdf68
--- /dev/null
+++ b/data/minidata/test.py
@@ -0,0 +1,33 @@
+import pickle
+from tqdm import tqdm
+import json
+import mysql.connector
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+tag_set = set()
+filepath = "C:/Users/HP/Desktop/音乐推荐/entities/mini_tracks.idomaar"
+primary_key = set()  # (track_id, tag_id) pairs already inserted
+with open(filepath, "r") as file:
+    lines = file.readlines()
+    for line in tqdm(lines):
+        tags = json.loads(line.split("\t")[-1])["tags"]
+        track_id = line.split("\t")[1]
+        for tag in tags:
+            key = f"{track_id}-{tag['id']}"
+            if key in primary_key: continue
+            primary_key.add(key)
+
+            sql = "INSERT INTO track_tag (track_id, tag_id) VALUES (%s, %s)"
+            val = (track_id, tag["id"])
+            try:
+                cursor.execute(sql, val)
+            except Exception as e:
+                print(e)
+mydb.commit()
\ No newline at end of file
diff --git a/data/minidata/write.py b/data/minidata/write.py
new file mode 100644
index 0000000..b5df4c3
--- /dev/null
+++ b/data/minidata/write.py
@@ -0,0 +1,24 @@
+import mysql.connector
+import re
+import pickle
+from tqdm import tqdm
+
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+with open('./data/tag_map.pkl', 'rb') as f:
+    tag_map = pickle.load(f)
+
+for k, v in tqdm(tag_map.items()):
+    sql = "INSERT INTO tag (id, value) VALUES (%s, %s)"
+    val = (k, v)
+    cursor.execute(sql, val)
+
+
+mydb.commit()
\ No newline at end of file
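
For reference, below is a minimal, self-contained sketch of the windowed near-duplicate merge that data/clean/tag.py applies to the ~270k tag values. The sample values and the normalize() helper are illustrative only (in tag.py the strip-spaces/hyphens step is commented out and the values come from the tag table); the sorted input, the bounded window, the 0.8 threshold and the difflib quick_ratio() check mirror the script.

# Standalone sketch of the windowed tag dedup pass (sample data only).
import difflib

values = ["Hip-Hop", "hip hop", "hiphop", "Rock", "rock music", "jazz"]

def normalize(v):
    # Illustrative preprocessing: lowercase and strip spaces, '-', apostrophes.
    return v.lower().replace(" ", "").replace("-", "").replace("'", "")

max_step = 10000   # only compare against the most recent kept values
threshold = 0.8    # similarity at or above this merges the tag

kept = []          # canonical representatives, in insertion order
canonical = {}     # every value -> the representative it was merged into

# Sorting by the normalized form keeps similar values adjacent,
# which is what makes the bounded window safe to use.
for v in sorted(values, key=normalize):
    x = normalize(v)
    for rep in kept[-max_step:]:
        if difflib.SequenceMatcher(None, x, normalize(rep)).quick_ratio() >= threshold:
            canonical[v] = rep
            break
    else:
        kept.append(v)
        canonical[v] = v

print(canonical)
# 'hip hop' and 'hiphop' collapse onto 'Hip-Hop'; at this threshold
# 'Rock', 'rock music' and 'jazz' each remain their own representative.

quick_ratio() only compares character counts, so it is a cheap upper bound on ratio() and can over-merge slightly; that, together with checking each value only against a bounded window of recently kept neighbours, is what brought the pass from 3h+ down to about 24s in the script's own measurement.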