From 0c47ce2bd30fd61ecdb9daf57318d811c366c128 Mon Sep 17 00:00:00 2001
From: junbo <1428285274@qq.com>
Date: Sat, 9 Mar 2024 16:40:39 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=B8=85=E7=90=86?=
 =?UTF-8?q?=EF=BC=9Atag=E6=A0=87=E7=AD=BE=E8=BF=87=E4=BA=8E=E7=A8=80?=
 =?UTF-8?q?=E7=96=8F=EF=BC=8C=E4=BD=BF=E7=94=A8=E5=86=85=E7=BD=AEdifflib?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E5=90=88=E5=B9=B6=20&&=20=E6=9E=84=E5=BB=BAm?=
 =?UTF-8?q?inidataset?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (text between "---" and the first "diff --git" is skipped by
git am / git apply and never reaches the commit message):

 * Line structure and indentation were rebuilt from a whitespace-collapsed
   copy of this series and code comments were translated to English, so the
   "index" blob ids below are the original ones and no longer match the
   file contents. Regenerate the series with git format-patch after applying.
 * data/minidata/read.py: the IN (...) list is now bound through %s
   placeholders instead of being joined into the SQL text, the pickle file
   is opened with a context manager, and an empty id list skips the query
   (an empty "IN ()" is not valid SQL).
 * NOTE(review): every script hard-codes the MySQL host, user and password;
   consider loading them from the environment or an untracked config file.

 data/clean/tag.py      | 55 ++++++++++++++++++++++++++++++++++++++++++
 data/minidata/load.py  | 28 +++++++++++++++++++++
 data/minidata/read.py  | 27 +++++++++++++++++++++
 data/minidata/test.py  |  7 ++++++
 data/minidata/write.py | 36 +++++++++++++++++++++++++++
 5 files changed, 153 insertions(+)
 create mode 100644 data/clean/tag.py
 create mode 100644 data/minidata/load.py
 create mode 100644 data/minidata/read.py
 create mode 100644 data/minidata/test.py
 create mode 100644 data/minidata/write.py

diff --git a/data/clean/tag.py b/data/clean/tag.py
new file mode 100644
index 0000000..93623a7
--- /dev/null
+++ b/data/clean/tag.py
@@ -0,0 +1,55 @@
+# The tag table is very sparse: it starts out with about 270k tags
+
+import json
+import mysql.connector
+from tqdm import tqdm
+import difflib
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+sql = "SELECT value FROM tag"
+cursor.execute(sql)
+tags = cursor.fetchall()
+tags = [tag[0] for tag in tags]
+tag_set = []
+tag_dict = {}
+
+max_step = 10
+threshold = 0.6
+
+# A nested loop over 270k tags is far too expensive
+# After optimization: 3h+ -> 24s
+# First preprocess each value: strip spaces, "-", "'" and similar characters
+# tags = list(map(lambda x: x.replace(' ', '').replace('-', '').replace('\'', ''), tags))
+# Then sort
+# tags = list(map(lambda x: x.lower(), tags))
+# While looping, only compare against the most recent max_step elements
+
+for i in tqdm(range(len(tags))):
+    flag = True
+
+    for tag in tag_set[-max_step:]:
+        if difflib.SequenceMatcher(None, tags[i], tag).quick_ratio() >= threshold:
+            flag = False
+            tag_dict[tags[i]] = tag
+            break
+
+    if flag:
+        tag_set.append(tags[i])
+        tag_dict[tags[i]] = tags[i]
+
+print(len(tag_set))
+import pickle
+with open('tag_set.pkl', 'wb') as f:
+    pickle.dump(tag_set, f)
+with open('tag_dict.pkl', 'wb') as f:
+    pickle.dump(tag_dict, f)
+
+
+# print(len(tags))
+# print('\n'.join(tags))
diff --git a/data/minidata/load.py b/data/minidata/load.py
new file mode 100644
index 0000000..36dd92e
--- /dev/null
+++ b/data/minidata/load.py
@@ -0,0 +1,28 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+file = "data/user_ids.pkl"
+with open(file, "rb") as f:
+    user_ids = pickle.load(f)
+
+user_ids = set(user_ids)
+
+sql = "select distinct creator_id from playlist"
+cursor.execute(sql)
+creators = cursor.fetchall()
+creators = [creator[0] for creator in creators]
+
+users = set(creators) & user_ids
+print(len(users))
+print(len(creators))
+print(len(user_ids))
\ No newline at end of file
diff --git a/data/minidata/read.py b/data/minidata/read.py
new file mode 100644
index 0000000..799b70e
--- /dev/null
+++ b/data/minidata/read.py
@@ -0,0 +1,27 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+# Close the pickle file deterministically instead of leaking the handle.
+with open("data/track_ids.pkl", "rb") as f:
+    track_ids = list(pickle.load(f))
+
+user_ids = []
+if track_ids:  # an empty "IN ()" is not valid SQL, so skip the query
+    # Bind the ids as parameters rather than splicing them into the SQL text.
+    placeholders = ",".join(["%s"] * len(track_ids))
+    sql = f"SELECT DISTINCT user_id FROM preference WHERE track_id IN ({placeholders})"
+    cursor.execute(sql, tuple(track_ids))
+    user_ids = [row[0] for row in cursor.fetchall()]
+
+with open('data/user_ids.pkl', 'wb') as f:
+    pickle.dump(user_ids, f)
diff --git a/data/minidata/test.py b/data/minidata/test.py
new file mode 100644
index 0000000..20e5137
--- /dev/null
+++ b/data/minidata/test.py
@@ -0,0 +1,7 @@
+str = """INSERT INTO `preference` VALUES ('100', '42600', '1210711');"""
+
+# Extract '100', '42600' and '1210711' as separate groups
+import re
+pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
+result = pattern.findall(str)
+print(result[0])  # [('100', '42600', '1210711')]
\ No newline at end of file
diff --git a/data/minidata/write.py b/data/minidata/write.py
new file mode 100644
index 0000000..0756d80
--- /dev/null
+++ b/data/minidata/write.py
@@ -0,0 +1,36 @@
+import mysql.connector
+import re
+import pickle
+from tqdm import tqdm
+
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+file = "data/track_ids.pkl"
+with open(file, "rb") as f:
+    track_ids = pickle.load(f)
+file = "data/user_ids.pkl"
+with open(file, "rb") as f:
+    user_ids = pickle.load(f)
+
+track_ids = set(track_ids)
+user_ids = set(user_ids)
+
+filepath = "C:/Users/HP/Desktop/sql/preference.sql"
+pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
+with open(filepath, "r") as f:
+    lines = f.readlines()
+    for line in tqdm(lines):
+        _, user_id, track_id = pattern.findall(line)[0]
+        if track_id not in track_ids or user_id not in user_ids: continue
+
+        sql = line
+        cursor.execute(sql)
+
+mydb.commit()
\ No newline at end of file

From 5126a190fdddf0c188e7f964acd46c3e81716171 Mon Sep 17 00:00:00 2001
From: junbo <1428285274@qq.com>
Date: Sat, 9 Mar 2024 18:12:39 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=AF=BC=E5=85=A5tag=EF=BC=8Cuser=EF=BC=8C?=
 =?UTF-8?q?track=E5=92=8Ctrack=5Ftag=E8=A1=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Reviewer notes (skipped by git am / git apply, see PATCH 1/2):

 * data/minidata/test.py: the per-row insert now catches
   mysql.connector.Error instead of Exception, so only database errors are
   printed and skipped; the commit is issued once, after the input file has
   been processed.
 * NOTE(review) data/clean/tag.py: `cnt` is printed at the end but its only
   increment is commented out, and `ids` is referenced only from
   commented-out code.
 * NOTE(review) data/clean/tag.py: `tags` (the tag_set entries, which are
   also the tag_map keys) is lower-cased before `tag_map[tags[i]]`;
   presumably those keys are already lower-case strings - confirm, otherwise
   the lookup raises KeyError.
 * NOTE(review) data/clean/tag.py: the "Then sort" comment is not backed by
   any sort call in the script, while the loop only compares against the
   last max_step accepted values - confirm the input order is as intended.

 data/.gitignore        |  1 +
 data/clean/tag.py      | 48 ++++++++++++++++++++++++++++++------------
 data/minidata/test.py  | 38 +++++++++++++++++++++++++++------
 data/minidata/write.py | 24 ++++++---------------
 4 files changed, 73 insertions(+), 38 deletions(-)
 create mode 100644 data/.gitignore

diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..ef64266
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+/minidata/data/*.pkl
\ No newline at end of file
diff --git a/data/clean/tag.py b/data/clean/tag.py
index 93623a7..5c37e49 100644
--- a/data/clean/tag.py
+++ b/data/clean/tag.py
@@ -4,6 +4,7 @@
 import mysql.connector
 from tqdm import tqdm
 import difflib
+import pickle
 
 mydb = mysql.connector.connect(
     host="172.16.0.176",  # database host address
@@ -12,44 +13,63 @@
     database="bigdata"
 )
 cursor = mydb.cursor()
-sql = "SELECT value FROM tag"
+sql = "SELECT id, value FROM tag"
 cursor.execute(sql)
 tags = cursor.fetchall()
-tags = [tag[0] for tag in tags]
-tag_set = []
+tag_map = {tag[0]: tag[1] for tag in tags}
+
+ids = [tag[0] for tag in tags]
+
+tags = [tag[1] for tag in tags]
+
+tag_lst = []
 tag_dict = {}
 
-max_step = 10
-threshold = 0.6
+max_step = 10000
+threshold = 0.8
+with open('../minidata/data/tag_set.pkl', 'rb') as f:
+    tag_set = pickle.load(f)
+tags = list(tag_set)
+tag_map = {i: tag_map[i] for i in tag_set}
+with open('../minidata/data/tag_map.pkl', 'wb') as f:
+    pickle.dump(tag_map, f)
+print(len(tags))
 
 # A nested loop over 270k tags is far too expensive
 # After optimization: 3h+ -> 24s
 # First preprocess each value: strip spaces, "-", "'" and similar characters
 # tags = list(map(lambda x: x.replace(' ', '').replace('-', '').replace('\'', ''), tags))
 # Then sort
-# tags = list(map(lambda x: x.lower(), tags))
+tags = list(map(lambda x: x.lower(), tags))
 # While looping, only compare against the most recent max_step elements
+cnt = 0
 
 for i in tqdm(range(len(tags))):
     flag = True
+    x = tag_map[tags[i]]
+    for tag in tag_lst[-max_step:]:
+        # if ids[i] not in tag_set:
+        #     continue
+        # else:
+        #     cnt += 1
 
-    for tag in tag_set[-max_step:]:
-        if difflib.SequenceMatcher(None, tags[i], tag).quick_ratio() >= threshold:
+        if difflib.SequenceMatcher(None, x, tag).quick_ratio() >= threshold:
             flag = False
-            tag_dict[tags[i]] = tag
+            tag_dict[x] = tag
             break
 
     if flag:
-        tag_set.append(tags[i])
-        tag_dict[tags[i]] = tags[i]
+        tag_lst.append(x)
+        tag_dict[x] = x
 
-print(len(tag_set))
+print(len(tag_lst))
+print(len(set(tag_dict.values())))
 import pickle
-with open('tag_set.pkl', 'wb') as f:
-    pickle.dump(tag_set, f)
+
 with open('tag_dict.pkl', 'wb') as f:
     pickle.dump(tag_dict, f)
 
 
 # print(len(tags))
 # print('\n'.join(tags))
+print(cnt)
\ No newline at end of file
diff --git a/data/minidata/test.py b/data/minidata/test.py
index 20e5137..1dfdf68 100644
--- a/data/minidata/test.py
+++ b/data/minidata/test.py
@@ -1,7 +1,33 @@
-str = """INSERT INTO `preference` VALUES ('100', '42600', '1210711');"""
+import pickle
+from tqdm import tqdm
+import json
+import mysql.connector
 
-# Extract '100', '42600' and '1210711' as separate groups
-import re
-pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
-result = pattern.findall(str)
-print(result[0])  # [('100', '42600', '1210711')]
\ No newline at end of file
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+tag_set = set()
+filepath = "C:/Users/HP/Desktop/音乐推荐/entities/mini_tracks.idomaar"
+primary_key = set()
+with open(filepath, "r") as file:
+    lines = file.readlines()
+    for line in tqdm(lines):
+        tags = json.loads(line.split("\t")[-1])["tags"]
+        track_id = line.split("\t")[1]
+        for tag in tags:
+            key = f"{track_id}-{tag['id']}"
+            if key in primary_key: continue
+            primary_key.add(key)
+
+            sql = "INSERT INTO track_tag (track_id, tag_id) VALUES (%s, %s)"
+            val = (track_id, tag["id"])
+            try:
+                cursor.execute(sql, val)
+            except mysql.connector.Error as e:
+                print(e)
+mydb.commit()
\ No newline at end of file
diff --git a/data/minidata/write.py b/data/minidata/write.py
index 0756d80..b5df4c3 100644
--- a/data/minidata/write.py
+++ b/data/minidata/write.py
@@ -12,25 +12,13 @@
 )
 cursor = mydb.cursor()
 
-file = "data/track_ids.pkl"
-with open(file, "rb") as f:
-    track_ids = pickle.load(f)
-file = "data/user_ids.pkl"
-with open(file, "rb") as f:
-    user_ids = pickle.load(f)
+with open('./data/tag_map.pkl', 'rb') as f:
+    tag_map = pickle.load(f)
 
-track_ids = set(track_ids)
-user_ids = set(user_ids)
+for k, v in tqdm(tag_map.items()):
+    sql = "INSERT INTO tag (id, value) VALUES (%s, %s)"
+    val = (k, v)
+    cursor.execute(sql, val)
 
-filepath = "C:/Users/HP/Desktop/sql/preference.sql"
-pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
-with open(filepath, "r") as f:
-    lines = f.readlines()
-    for line in tqdm(lines):
-        _, user_id, track_id = pattern.findall(line)[0]
-        if track_id not in track_ids or user_id not in user_ids: continue
-
-        sql = line
-        cursor.execute(sql)
 
 mydb.commit()
\ No newline at end of file