diff --git a/data/.gitignore b/data/.gitignore
new file mode 100644
index 0000000..ef64266
--- /dev/null
+++ b/data/.gitignore
@@ -0,0 +1 @@
+/minidata/data/*.pkl
\ No newline at end of file
diff --git a/data/clean/tag.py b/data/clean/tag.py
new file mode 100644
index 0000000..5c37e49
--- /dev/null
+++ b/data/clean/tag.py
@@ -0,0 +1,75 @@
+# The tag table is very noisy: roughly 270k distinct tags to start with
+
+import json
+import mysql.connector
+from tqdm import tqdm
+import difflib
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+sql = "SELECT id, value FROM tag"
+cursor.execute(sql)
+tags = cursor.fetchall()
+tag_map = {tag[0]: tag[1] for tag in tags}
+
+ids = [tag[0] for tag in tags]
+
+tags = [tag[1] for tag in tags]
+
+tag_lst = []
+tag_dict = {}
+
+max_step = 10000
+threshold = 0.8
+
+with open('../minidata/data/tag_set.pkl', 'rb') as f:
+    tag_set = pickle.load(f)
+tags = list(tag_set)  # restrict to the tag ids actually used in the mini dataset
+tag_map = {i: tag_map[i] for i in tag_set}
+with open('../minidata/data/tag_map.pkl', 'wb') as f:
+    pickle.dump(tag_map, f)
+print(len(tags))
+# 270k tags: a naive pairwise double loop is far too slow
+# After optimization the runtime dropped from 3h+ to 24s
+# First preprocess the values: strip spaces, '-', apostrophes, etc.
+# tags = list(map(lambda x: x.replace(' ', '').replace('-', '').replace('\'', ''), tags))
+# Then sort so that similar values end up next to each other
+tags.sort(key=lambda i: tag_map[i].lower())
+# In the loop it is then enough to compare against the most recent max_step kept values
+
+cnt = 0
+for i in tqdm(range(len(tags))):
+    flag = True
+    x = tag_map[tags[i]]
+    for tag in tag_lst[-max_step:]:
+        # if ids[i] not in tag_set:
+        #     continue
+        # else:
+        #     cnt += 1
+
+        if difflib.SequenceMatcher(None, x.lower(), tag.lower()).quick_ratio() >= threshold:
+            flag = False
+            tag_dict[x] = tag
+            break
+
+    if flag:
+        tag_lst.append(x)
+        tag_dict[x] = x
+
+print(len(tag_lst))
+print(len(set(tag_dict.values())))
+import pickle
+
+with open('tag_dict.pkl', 'wb') as f:
+    pickle.dump(tag_dict, f)
+
+
+# print(len(tags))
+# print('\n'.join(tags))
+print(cnt)
\ No newline at end of file
diff --git a/data/minidata/load.py b/data/minidata/load.py
new file mode 100644
index 0000000..36dd92e
--- /dev/null
+++ b/data/minidata/load.py
@@ -0,0 +1,28 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+file = "data/user_ids.pkl"
+with open(file, "rb") as f:
+    user_ids = pickle.load(f)
+
+user_ids = set(user_ids)
+
+sql = "select distinct creator_id from playlist"
+cursor.execute(sql)
+creators = cursor.fetchall()
+creators = [creator[0] for creator in creators]
+
+users = set(creators) & user_ids
+print(len(users))
+print(len(creators))
+print(len(user_ids))
\ No newline at end of file
diff --git a/data/minidata/read.py b/data/minidata/read.py
new file mode 100644
index 0000000..799b70e
--- /dev/null
+++ b/data/minidata/read.py
@@ -0,0 +1,24 @@
+import json
+import mysql.connector
+from tqdm import tqdm
+import pickle
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="bigdata"
+)
+cursor = mydb.cursor()
+
+track_ids = pickle.load(open("data/track_ids.pkl", "rb"))
+
+sql = f'SELECT DISTINCT user_id FROM preference WHERE track_id IN ({",".join(map(str, track_ids))})'
+# print(sql)
+cursor.execute(sql)
+user_ids = cursor.fetchall()
+user_ids = [row[0] for row in user_ids]
+
+
+with open('data/user_ids.pkl', 'wb') as f:
+    pickle.dump(user_ids, f)
\ No newline at end of file
diff --git a/data/minidata/test.py b/data/minidata/test.py
new file mode 100644
index 0000000..1dfdf68
--- /dev/null
+++ b/data/minidata/test.py
@@ -0,0 +1,33 @@
+import pickle
+from tqdm import tqdm
+import json
+import mysql.connector
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+tag_set = set()
+filepath = "C:/Users/HP/Desktop/音乐推荐/entities/mini_tracks.idomaar"
+primary_key = set()  # (track_id, tag_id) pairs already inserted
+with open(filepath, "r") as file:
+    lines = file.readlines()
+    for line in tqdm(lines):
+        tags = json.loads(line.split("\t")[-1])["tags"]
+        track_id = line.split("\t")[1]
+        for tag in tags:
+            key = f"{track_id}-{tag['id']}"
+            if key in primary_key: continue
+            primary_key.add(key)
+
+            sql = "INSERT INTO track_tag (track_id, tag_id) VALUES (%s, %s)"
+            val = (track_id, tag["id"])
+            try:
+                cursor.execute(sql, val)
+            except Exception as e:
+                print(e)
+mydb.commit()
\ No newline at end of file
diff --git a/data/minidata/write.py b/data/minidata/write.py
new file mode 100644
index 0000000..b5df4c3
--- /dev/null
+++ b/data/minidata/write.py
@@ -0,0 +1,24 @@
+import mysql.connector
+import re
+import pickle
+from tqdm import tqdm
+
+
+mydb = mysql.connector.connect(
+    host="172.16.0.176",  # database host address
+    user="root",  # database user name
+    passwd="admin",  # database password
+    database="minidata"
+)
+cursor = mydb.cursor()
+
+with open('./data/tag_map.pkl', 'rb') as f:
+    tag_map = pickle.load(f)
+
+for k, v in tqdm(tag_map.items()):
+    sql = "INSERT INTO tag (id, value) VALUES (%s, %s)"
+    val = (k, v)
+    cursor.execute(sql, val)
+
+
+mydb.commit()
\ No newline at end of file
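
For reference, below is a minimal, self-contained sketch of the windowed near-duplicate merge that data/clean/tag.py applies to the ~270k tag values. The sample values and the normalize() helper are illustrative only (in tag.py the strip-spaces/hyphens step is commented out and the values come from the tag table); the sorted input, the bounded window, the 0.8 threshold and the difflib quick_ratio() check mirror the script.

# Standalone sketch of the windowed tag dedup pass (sample data only).
import difflib

values = ["Hip-Hop", "hip hop", "hiphop", "Rock", "rock music", "jazz"]

def normalize(v):
    # Illustrative preprocessing: lowercase and strip spaces, '-', apostrophes.
    return v.lower().replace(" ", "").replace("-", "").replace("'", "")

max_step = 10000   # only compare against the most recent kept values
threshold = 0.8    # similarity at or above this merges the tag

kept = []          # canonical representatives, in insertion order
canonical = {}     # every value -> the representative it was merged into

# Sorting by the normalized form keeps similar values adjacent,
# which is what makes the bounded window safe to use.
for v in sorted(values, key=normalize):
    x = normalize(v)
    for rep in kept[-max_step:]:
        if difflib.SequenceMatcher(None, x, normalize(rep)).quick_ratio() >= threshold:
            canonical[v] = rep
            break
    else:
        kept.append(v)
        canonical[v] = v

print(canonical)
# 'hip hop' and 'hiphop' collapse onto 'Hip-Hop'; at this threshold
# 'Rock', 'rock music' and 'jazz' each remain their own representative.

quick_ratio() only compares character counts, so it is a cheap upper bound on ratio() and can over-merge slightly; that, together with checking each value only against a bounded window of recently kept neighbours, is what brought the pass from 3h+ down to about 24s in the script's own measurement.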