From 0c47ce2bd30fd61ecdb9daf57318d811c366c128 Mon Sep 17 00:00:00 2001
From: junbo <1428285274@qq.com>
Date: Sat, 9 Mar 2024 16:40:39 +0800
Subject: [PATCH 1/2] =?UTF-8?q?=E6=95=B0=E6=8D=AE=E6=B8=85=E7=90=86?=
 =?UTF-8?q?=EF=BC=9Atag=E6=A0=87=E7=AD=BE=E8=BF=87=E4=BA=8E=E7=A8=80?=
 =?UTF-8?q?=E7=96=8F=EF=BC=8C=E4=BD=BF=E7=94=A8=E5=86=85=E7=BD=AEdifflib?=
 =?UTF-8?q?=E8=BF=9B=E8=A1=8C=E5=90=88=E5=B9=B6=20&&=20=E6=9E=84=E5=BB=BAm?=
 =?UTF-8?q?inidataset?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data/clean/tag.py      | 55 ++++++++++++++++++++++++++++++++++++++++++
 data/minidata/load.py  | 28 +++++++++++++++++++++
 data/minidata/read.py  | 24 ++++++++++++++++++
 data/minidata/test.py  |  7 ++++++
 data/minidata/write.py | 36 +++++++++++++++++++++++++++
 5 files changed, 150 insertions(+)
 create mode 100644 data/clean/tag.py
 create mode 100644 data/minidata/load.py
 create mode 100644 data/minidata/read.py
 create mode 100644 data/minidata/test.py
 create mode 100644 data/minidata/write.py

diff --git a/data/clean/tag.py b/data/clean/tag.py
new file mode 100644
index 0000000..93623a7
--- /dev/null
+++ b/data/clean/tag.py
@@ -0,0 +1,55 @@
+# The tag table is highly fragmented: it initially holds about 270k distinct tags.
+
+import json
+import mysql.connector
+from tqdm import tqdm
+import difflib
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # 数据库主机地址
+    user="root",  # 数据库用户名
+    passwd="admin",  # 数据库密码
+    database="bigdata"
+)
+cursor = mydb.cursor()
+sql = "SELECT value FROM tag"
+cursor.execute(sql)
+tags = cursor.fetchall()
+tags = [tag[0] for tag in tags]
+tag_set = []
+tag_dict = {}
+
+max_step = 10
+threshold = 0.6
+
+# 27w 双重循环复杂度太高
+# 优化后从 3h+ -> 24s
+# 首先对value预处理。去掉”空格、-、'等“
+# tags = list(map(lambda x: x.replace(' ', '').replace('-', '').replace('\'', ''), tags))
+# 然后进行排序
+# tags = list(map(lambda x: x.lower(), tags))
+# 循环时比较最近的max_step个元素即可
+
+for i in tqdm(range(len(tags))):
+    flag = True
+
+    for tag in tag_set[-max_step:]:
+        if difflib.SequenceMatcher(None, tags[i], tag).quick_ratio() >= threshold:
+            flag = False
+            tag_dict[tags[i]] = tag
+            break
+
+    if flag:
+        tag_set.append(tags[i])
+        tag_dict[tags[i]] = tags[i]
+
+print(len(tag_set))
+import pickle
+with open('tag_set.pkl', 'wb') as f:
+    pickle.dump(tag_set, f)
+with open('tag_dict.pkl', 'wb') as f:
+    pickle.dump(tag_dict, f)
+
+
+# print(len(tags))
+# print('\n'.join(tags))
diff --git a/data/minidata/load.py b/data/minidata/load.py
new file mode 100644
index 0000000..36dd92e
--- /dev/null
+++ b/data/minidata/load.py
@@ -0,0 +1,32 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+# NOTE(review): the connection settings below, including the root password, are
+# hard-coded in version control; consider loading them from configuration.
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+try:
+    # User ids previously pickled to data/user_ids.pkl.
+    with open("data/user_ids.pkl", "rb") as f:
+        user_ids = set(pickle.load(f))
+
+    cursor.execute("select distinct creator_id from playlist")
+    creators = [row[0] for row in cursor.fetchall()]
+finally:
+    # Release the cursor and the connection even when loading or querying fails.
+    cursor.close()
+    mydb.close()
+
+# Report how many playlist creators are also in user_ids, then both input sizes.
+users = set(creators) & user_ids
+print(len(users))
+print(len(creators))
+print(len(user_ids))
\ No newline at end of file
diff --git a/data/minidata/read.py b/data/minidata/read.py
new file mode 100644
index 0000000..799b70e
--- /dev/null
+++ b/data/minidata/read.py
@@ -0,0 +1,34 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+# NOTE(review): the connection settings below, including the root password, are
+# hard-coded in version control; consider loading them from configuration.
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+try:
+    with open("data/track_ids.pkl", "rb") as f:
+        # int() accepts ids stored as digit strings or as ints and rejects anything else.
+        track_ids = [int(track_id) for track_id in pickle.load(f)]
+
+    user_ids = []
+    if track_ids:  # an empty "IN ()" list is a MySQL syntax error, so skip the query
+        # One %s placeholder per id: the driver does the quoting, not string formatting.
+        placeholders = ",".join(["%s"] * len(track_ids))
+        sql = f"SELECT DISTINCT user_id FROM preference WHERE track_id IN ({placeholders})"
+        cursor.execute(sql, tuple(track_ids))
+        user_ids = [row[0] for row in cursor.fetchall()]
+finally:
+    # Release the cursor and the connection even when loading or querying fails.
+    cursor.close()
+    mydb.close()
+
+with open('data/user_ids.pkl', 'wb') as f:
+    pickle.dump(user_ids, f)
\ No newline at end of file
diff --git a/data/minidata/test.py b/data/minidata/test.py
new file mode 100644
index 0000000..20e5137
--- /dev/null
+++ b/data/minidata/test.py
@@ -0,0 +1,7 @@
+str = """INSERT INTO `preference` VALUES ('100', '42600', '1210711');"""
+
+# 分别提取出'100', '42600', '1210711'
+import re
+pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
+result = pattern.findall(str)
+print(result[0])  # [('100', '42600', '1210711')]
\ No newline at end of file
diff --git a/data/minidata/write.py b/data/minidata/write.py
new file mode 100644
index 0000000..0756d80
--- /dev/null
+++ b/data/minidata/write.py
@@ -0,0 +1,36 @@
+import mysql.connector
+import re
+import pickle
+from tqdm import tqdm
+
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # 数据库主机地址
+    user="root",  # 数据库用户名
+    passwd="admin",  # 数据库密码
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+file = "data/track_ids.pkl"
+with open(file, "rb") as f:
+    track_ids = pickle.load(f)
+file = "data/user_ids.pkl"
+with open(file, "rb") as f:
+    user_ids = pickle.load(f)
+
+track_ids = set(track_ids)
+user_ids = set(user_ids)
+
+filepath = "C:/Users/HP/Desktop/sql/preference.sql"
+pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
+with open(filepath, "r") as f:
+    lines = f.readlines()
+    for line in tqdm(lines):
+        _, user_id, track_id = pattern.findall(line)[0]
+        if track_id not in track_ids or user_id not in user_ids: continue
+
+        sql = line
+        cursor.execute(sql)
+
+mydb.commit()
\ No newline at end of file

From 5126a190fdddf0c188e7f964acd46c3e81716171 Mon Sep 17 00:00:00 2001
From: junbo <1428285274@qq.com>
Date: Sat, 9 Mar 2024 18:12:39 +0800
Subject: [PATCH 2/2] =?UTF-8?q?=E5=AF=BC=E5=85=A5tag=EF=BC=8Cuser=EF=BC=8C?=
 =?UTF-8?q?track=E5=92=8Ctrack=5Ftag=E8=A1=A8?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data/.gitignore        |  1 +
 data/clean/tag.py      | 48 ++++++++++++++++++++++++++++++------------
 data/minidata/test.py  | 38 +++++++++++++++++++++++++++------
 data/minidata/write.py | 24 ++++++---------------
 4 files changed, 73 insertions(+), 38 deletions(-)
 create mode 100644 data/.gitignore

diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..ef64266
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+/minidata/data/*.pkl
\ No newline at end of file
diff --git a/data/clean/tag.py b/data/clean/tag.py
index 93623a7..5c37e49 100644
--- a/data/clean/tag.py
+++ b/data/clean/tag.py
@@ -4,6 +4,7 @@
 import mysql.connector
 from tqdm import tqdm
 import difflib
+import pickle
 
 mydb = mysql.connector.connect(
     host="172.16.0.176",  # 数据库主机地址
@@ -12,44 +13,63 @@
     database="bigdata"
 )
 cursor = mydb.cursor()
-sql = "SELECT value FROM tag"
+sql = "SELECT id, value FROM tag"
 cursor.execute(sql)
 tags = cursor.fetchall()
-tags = [tag[0] for tag in tags]
-tag_set = []
+tag_map = {tag[0]: tag[1] for tag in tags}
+
+ids = [tag[0] for tag in tags]
+
+tags = [tag[1] for tag in tags]
+
+tag_lst = []
 tag_dict = {}
 
-max_step = 10
-threshold = 0.6
+max_step = 10000
+threshold = 0.8
 
+with open('../minidata/data/tag_set.pkl', 'rb') as f:
+    tag_set = pickle.load(f)
+tags = list(tag_set)
+tag_map = {i: tag_map[i] for i in tag_set}
+with open('../minidata/data/tag_map.pkl', 'wb') as f:
+    pickle.dump(tag_map, f)
+print(len(tags))
 # 27w 双重循环复杂度太高
 # 优化后从 3h+ -> 24s
 # 首先对value预处理。去掉”空格、-、'等“
 # tags = list(map(lambda x: x.replace(' ', '').replace('-', '').replace('\'', ''), tags))
 # 然后进行排序
-# tags = list(map(lambda x: x.lower(), tags))
+# tags holds tag ids (the keys of tag_map) here, so only the values are lower-cased, at comparison time.
 # 循环时比较最近的max_step个元素即可
 
+cnt = 0
 for i in tqdm(range(len(tags))):
     flag = True
+    x = tag_map[tags[i]]
+    x_key = x.lower()
+    for tag in tag_lst[-max_step:]:
+        # quick_ratio() is only an upper bound on ratio(): run the cheap bounds first, then confirm.
+        matcher = difflib.SequenceMatcher(None, x_key, tag.lower())
+        checks = (matcher.real_quick_ratio, matcher.quick_ratio, matcher.ratio)
 
-    for tag in tag_set[-max_step:]:
-        if difflib.SequenceMatcher(None, tags[i], tag).quick_ratio() >= threshold:
+        if all(check() >= threshold for check in checks):
             flag = False
-            tag_dict[tags[i]] = tag
+            tag_dict[x] = tag
             break
 
     if flag:
-        tag_set.append(tags[i])
-        tag_dict[tags[i]] = tags[i]
+        tag_lst.append(x)
+        tag_dict[x] = x
 
-print(len(tag_set))
+print(len(tag_lst))
+print(len(set(tag_dict.values())))
 import pickle
-with open('tag_set.pkl', 'wb') as f:
-    pickle.dump(tag_set, f)
+
 with open('tag_dict.pkl', 'wb') as f:
     pickle.dump(tag_dict, f)
 
 
 # print(len(tags))
 # print('\n'.join(tags))
+print(cnt)
\ No newline at end of file
diff --git a/data/minidata/test.py b/data/minidata/test.py
index 20e5137..1dfdf68 100644
--- a/data/minidata/test.py
+++ b/data/minidata/test.py
@@ -1,7 +1,33 @@
-str = """INSERT INTO `preference` VALUES ('100', '42600', '1210711');"""
+import pickle
+from tqdm import tqdm
+import json
+import mysql.connector
 
-# 分别提取出'100', '42600', '1210711'
-import re
-pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
-result = pattern.findall(str)
-print(result[0])  # [('100', '42600', '1210711')]
\ No newline at end of file
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # 数据库主机地址
+    user="root",  # 数据库用户名
+    passwd="admin",  # 数据库密码
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+tag_set = set()
+primary_key = set()
+filepath = "C:/Users/HP/Desktop/音乐推荐/entities/mini_tracks.idomaar"
+sql = "INSERT INTO track_tag (track_id, tag_id) VALUES (%s, %s)"
+# Tab-separated input: field 1 is the track id, the last field is JSON with a "tags" list.
+with open(filepath, "r") as file:
+    for line in tqdm(file.readlines()):
+        fields = line.split("\t")
+        tags = json.loads(fields[-1])["tags"]
+        track_id = fields[1]
+        for tag in tags:
+            key = f"{track_id}-{tag['id']}"
+            if key not in primary_key:
+                # First occurrence of this (track, tag) pair: remember it and insert the row.
+                primary_key.add(key)
+                try:
+                    cursor.execute(sql, (track_id, tag["id"]))
+                except Exception as e:
+                    print(e)
+    mydb.commit()
\ No newline at end of file
diff --git a/data/minidata/write.py b/data/minidata/write.py
index 0756d80..b5df4c3 100644
--- a/data/minidata/write.py
+++ b/data/minidata/write.py
@@ -12,25 +12,13 @@
 )
 cursor = mydb.cursor()
 
-file = "data/track_ids.pkl"
-with open(file, "rb") as f:
-    track_ids = pickle.load(f)
-file = "data/user_ids.pkl"
-with open(file, "rb") as f:
-    user_ids = pickle.load(f)
+with open('./data/tag_map.pkl', 'rb') as pkl_file:
+    tag_map = pickle.load(pkl_file)
 
-track_ids = set(track_ids)
-user_ids = set(user_ids)
+# Insert every (id, value) pair of the pickled tag map as one row of the tag table.
+sql = "INSERT INTO tag (id, value) VALUES (%s, %s)"
+for row in tqdm(tag_map.items()):
+    cursor.execute(sql, row)
 
-filepath = "C:/Users/HP/Desktop/sql/preference.sql"
-pattern = re.compile(r"VALUES \('(\d+)', '(\d+)', '(\d+)'\);")
-with open(filepath, "r") as f:
-    lines = f.readlines()
-    for line in tqdm(lines):
-        _, user_id, track_id = pattern.findall(line)[0]
-        if track_id not in track_ids or user_id not in user_ids: continue
-
-        sql = line
-        cursor.execute(sql)
 
 mydb.commit()
\ No newline at end of file