From 727b82d5323d4a11bcc1456a2f994730d7850354 Mon Sep 17 00:00:00 2001 From: 2314 <131743576+gxy2314@users.noreply.github.com> Date: Thu, 26 Dec 2024 14:37:16 +0800 Subject: [PATCH 1/2] fix: resolve training issues with errors - Fixed `ModuleNotFoundError: No module named 'fastai.utils'` during training. - Resolved `RuntimeError: indices should be either on CPU or on the same device as the indexed tensor`. - Addressed output errors during result display. --- README.md | 10 +- build_graph.py | 388 ++++++++++++++++---------------- data_processor.py | 324 +++++++++++++-------------- layer.py | 59 +---- trainer.py | 557 ++++++++++++++++++++++++---------------------- utils.py | 477 +++++++++++++++++++-------------------- 6 files changed, 899 insertions(+), 916 deletions(-) diff --git a/README.md b/README.md index 186c603..3aa6e79 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,15 @@ -# Graph Convolutional Networks for Text Classification in PyTorch +# Simplifying Graph Convolutional Networks in PyTorch (TextSGC) -PyTorch 1.6 and Python 3.7 implementation of Graph Convolutional Networks for Text Classification [1]. +PyTorch 1.6 and Python 3.7 implementation of Simplifying Graph Convolutional Networks [1]. Tested on the 20NG/R8/R52/Ohsumed/MR data set, the code on this repository can achieve the effect of the paper. ## Benchmark -| dataset | 20NG | R8 | R52 | Ohsumed | MR | +| dataset | 20NG | R8 | R52 | Ohsumed | MR | |---------------|----------|------|--------|--------|--------| | TextGCN(official) | 0.8634 | 0.9707 | 0.9356 | 0.6836 | 0.7674 | -| This repo. | 0.8618 | 0.9704 | 0.9354 | 0.6827 | 0.7643 | +| This repo. | 0.8605 | 0.9743 | 0.9384 | 0.6828 | 0.7728 | NOTE: The result of the experiment is to repeat the run 10 times, and then take the average of accuracy. @@ -32,4 +32,4 @@ NOTE: The result of the experiment is to repeat the run 10 times, and then take 3. Training model, run `trainer.py` ## References -[1] [Yao, L. , Mao, C. , & Luo, Y. . (2018). Graph convolutional networks for text classification.](https://arxiv.org/abs/1809.05679) +[1] [Wu, F. , Zhang, T. , Souza, A. H. D. , Fifty, C. , Yu, T. , & Weinberger, K. Q. . (2019). 
Simplifying graph convolutional networks.](https://arxiv.org/abs/1902.07153) diff --git a/build_graph.py b/build_graph.py index 2e0b13c..f15133c 100644 --- a/build_graph.py +++ b/build_graph.py @@ -1,194 +1,194 @@ -import os -from collections import Counter - -import networkx as nx - -import itertools -import math -from collections import defaultdict -from time import time - -from sklearn.feature_extraction.text import CountVectorizer -from sklearn.feature_extraction.text import TfidfTransformer -from sklearn.pipeline import Pipeline -from tqdm import tqdm - -from utils import print_graph_detail - - -def get_window(content_lst, window_size): - """ - 找出窗口 - :param content_lst: - :param window_size: - :return: - """ - word_window_freq = defaultdict(int) # w(i) 单词在窗口单位内出现的次数 - word_pair_count = defaultdict(int) # w(i, j) - windows_len = 0 - for words in tqdm(content_lst, desc="Split by window"): - windows = list() - - if isinstance(words, str): - words = words.split() - length = len(words) - - if length <= window_size: - windows.append(words) - else: - for j in range(length - window_size + 1): - window = words[j: j + window_size] - windows.append(list(set(window))) - - for window in windows: - for word in window: - word_window_freq[word] += 1 - - for word_pair in itertools.combinations(window, 2): - word_pair_count[word_pair] += 1 - - windows_len += len(windows) - return word_window_freq, word_pair_count, windows_len - - -def cal_pmi(W_ij, W, word_freq_1, word_freq_2): - p_i = word_freq_1 / W - p_j = word_freq_2 / W - p_i_j = W_ij / W - pmi = math.log(p_i_j / (p_i * p_j)) - - return pmi - - -def count_pmi(windows_len, word_pair_count, word_window_freq, threshold): - word_pmi_lst = list() - for word_pair, W_i_j in tqdm(word_pair_count.items(), desc="Calculate pmi between words"): - word_freq_1 = word_window_freq[word_pair[0]] - word_freq_2 = word_window_freq[word_pair[1]] - - pmi = cal_pmi(W_i_j, windows_len, word_freq_1, word_freq_2) - if pmi <= threshold: - continue - word_pmi_lst.append([word_pair[0], word_pair[1], pmi]) - return word_pmi_lst - - -def get_pmi_edge(content_lst, window_size=20, threshold=0.): - if isinstance(content_lst, str): - content_lst = list(open(content_lst, "r")) - print("pmi read file len:", len(content_lst)) - - pmi_start = time() - word_window_freq, word_pair_count, windows_len = get_window(content_lst, - window_size=window_size) - - pmi_edge_lst = count_pmi(windows_len, word_pair_count, word_window_freq, threshold) - print("Total number of edges between word:", len(pmi_edge_lst)) - pmi_time = time() - pmi_start - return pmi_edge_lst, pmi_time - - -class BuildGraph: - def __init__(self, dataset): - clean_corpus_path = "data/text_dataset/clean_corpus" - self.graph_path = "data/graph" - if not os.path.exists(self.graph_path): - os.makedirs(self.graph_path) - - self.word2id = dict() # 单词映射 - self.dataset = dataset - print(f"\n==> 现在的数据集是:{dataset}<==") - - self.g = nx.Graph() - - self.content = f"{clean_corpus_path}/{dataset}.txt" - - self.get_tfidf_edge() - self.get_pmi_edge() - self.save() - - def get_pmi_edge(self): - pmi_edge_lst, self.pmi_time = get_pmi_edge(self.content, window_size=20, threshold=0.0) - print("pmi time:", self.pmi_time) - - for edge_item in pmi_edge_lst: - word_indx1 = self.node_num + self.word2id[edge_item[0]] - word_indx2 = self.node_num + self.word2id[edge_item[1]] - if word_indx1 == word_indx2: - continue - self.g.add_edge(word_indx1, word_indx2, weight=edge_item[2]) - - print_graph_detail(self.g) - - def get_tfidf_edge(self): - # 
获得tfidf权重矩阵(sparse)和单词列表 - tfidf_vec = self.get_tfidf_vec() - - count_lst = list() # 统计每个句子的长度 - for ind, row in tqdm(enumerate(tfidf_vec), - desc="generate tfidf edge"): - count = 0 - for col_ind, value in zip(row.indices, row.data): - word_ind = self.node_num + col_ind - self.g.add_edge(ind, word_ind, weight=value) - count += 1 - count_lst.append(count) - - print_graph_detail(self.g) - - def get_tfidf_vec(self): - """ - 学习获得tfidf矩阵,及其对应的单词序列 - :param content_lst: - :return: - """ - start = time() - text_tfidf = Pipeline([ - ("vect", CountVectorizer(min_df=1, - max_df=1.0, - token_pattern=r"\S+", - )), - ("tfidf", TfidfTransformer(norm=None, - use_idf=True, - smooth_idf=False, - sublinear_tf=False - )) - ]) - - tfidf_vec = text_tfidf.fit_transform(open(self.content, "r")) - - self.tfidf_time = time() - start - print("tfidf time:", self.tfidf_time) - print("tfidf_vec shape:", tfidf_vec.shape) - print("tfidf_vec type:", type(tfidf_vec)) - - self.node_num = tfidf_vec.shape[0] - - # 映射单词 - vocab_lst = text_tfidf["vect"].get_feature_names() - print("vocab_lst len:", len(vocab_lst)) - for ind, word in enumerate(vocab_lst): - self.word2id[word] = ind - - self.vocab_lst = vocab_lst - - return tfidf_vec - - def save(self): - print("total time:", self.pmi_time + self.tfidf_time) - nx.write_weighted_edgelist(self.g, - f"{self.graph_path}/{self.dataset}.txt") - - print("\n") - - -def main(): - BuildGraph("mr") - BuildGraph("ohsumed") - BuildGraph("R52") - BuildGraph("R8") - BuildGraph("20ng") - - -if __name__ == '__main__': - main() +import os +from collections import Counter + +import networkx as nx + +import itertools +import math +from collections import defaultdict +from time import time + +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.feature_extraction.text import TfidfTransformer +from sklearn.pipeline import Pipeline +from tqdm import tqdm + +from utils import print_graph_detail + + +def get_window(content_lst, window_size): + """ + 找出窗口 + :param content_lst: + :param window_size: + :return: + """ + word_window_freq = defaultdict(int) # w(i) 单词在窗口单位内出现的次数 + word_pair_count = defaultdict(int) # w(i, j) + windows_len = 0 + for words in tqdm(content_lst, desc="Split by window"): + windows = list() + + if isinstance(words, str): + words = words.split() + length = len(words) + + if length <= window_size: + windows.append(words) + else: + for j in range(length - window_size + 1): + window = words[j: j + window_size] + windows.append(list(set(window))) + + for window in windows: + for word in window: + word_window_freq[word] += 1 + + for word_pair in itertools.combinations(window, 2): + word_pair_count[word_pair] += 1 + + windows_len += len(windows) + return word_window_freq, word_pair_count, windows_len + + +def cal_pmi(W_ij, W, word_freq_1, word_freq_2): + p_i = word_freq_1 / W + p_j = word_freq_2 / W + p_i_j = W_ij / W + pmi = math.log(p_i_j / (p_i * p_j)) + + return pmi + + +def count_pmi(windows_len, word_pair_count, word_window_freq, threshold): + word_pmi_lst = list() + for word_pair, W_i_j in tqdm(word_pair_count.items(), desc="Calculate pmi between words"): + word_freq_1 = word_window_freq[word_pair[0]] + word_freq_2 = word_window_freq[word_pair[1]] + + pmi = cal_pmi(W_i_j, windows_len, word_freq_1, word_freq_2) + if pmi <= threshold: + continue + word_pmi_lst.append([word_pair[0], word_pair[1], pmi]) + return word_pmi_lst + + +def get_pmi_edge(content_lst, window_size=20, threshold=0.): + if isinstance(content_lst, str): + content_lst = list(open(content_lst, 
"r")) + print("pmi read file len:", len(content_lst)) + + pmi_start = time() + word_window_freq, word_pair_count, windows_len = get_window(content_lst, + window_size=window_size) + + pmi_edge_lst = count_pmi(windows_len, word_pair_count, word_window_freq, threshold) + print("Total number of edges between word:", len(pmi_edge_lst)) + pmi_time = time() - pmi_start + return pmi_edge_lst, pmi_time + + +class BuildGraph: + def __init__(self, dataset): + clean_corpus_path = "PyTorch_TextSGC/data/text_dataset/clean_corpus" + self.graph_path = "PyTorch_TextSGC/data/graph" + if not os.path.exists(self.graph_path): + os.makedirs(self.graph_path) + + self.word2id = dict() # 单词映射 + self.dataset = dataset + print(f"\n==> 现在的数据集是:{dataset}<==") + + self.g = nx.Graph() + + self.content = f"{clean_corpus_path}/{dataset}.txt" + + self.get_tfidf_edge() + self.get_pmi_edge() + self.save() + + def get_pmi_edge(self): + pmi_edge_lst, self.pmi_time = get_pmi_edge(self.content, window_size=20, threshold=0.0) + print("pmi time:", self.pmi_time) + + for edge_item in pmi_edge_lst: + word_indx1 = self.node_num + self.word2id[edge_item[0]] + word_indx2 = self.node_num + self.word2id[edge_item[1]] + if word_indx1 == word_indx2: + continue + self.g.add_edge(word_indx1, word_indx2, weight=edge_item[2]) + + print_graph_detail(self.g) + + def get_tfidf_edge(self): + # 获得tfidf权重矩阵(sparse)和单词列表 + tfidf_vec = self.get_tfidf_vec() + + count_lst = list() # 统计每个句子的长度 + for ind, row in tqdm(enumerate(tfidf_vec), + desc="generate tfidf edge"): + count = 0 + for col_ind, value in zip(row.indices, row.data): + word_ind = self.node_num + col_ind + self.g.add_edge(ind, word_ind, weight=value) + count += 1 + count_lst.append(count) + + print_graph_detail(self.g) + + def get_tfidf_vec(self): + """ + 学习获得tfidf矩阵,及其对应的单词序列 + :param content_lst: + :return: + """ + start = time() + text_tfidf = Pipeline([ + ("vect", CountVectorizer(min_df=1, + max_df=1.0, + token_pattern=r"\S+", + )), + ("tfidf", TfidfTransformer(norm=None, + use_idf=True, + smooth_idf=False, + sublinear_tf=False + )) + ]) + + tfidf_vec = text_tfidf.fit_transform(open(self.content, "r")) + + self.tfidf_time = time() - start + print("tfidf time:", self.tfidf_time) + print("tfidf_vec shape:", tfidf_vec.shape) + print("tfidf_vec type:", type(tfidf_vec)) + + self.node_num = tfidf_vec.shape[0] + + # 映射单词 + vocab_lst = text_tfidf["vect"].get_feature_names() + print("vocab_lst len:", len(vocab_lst)) + for ind, word in enumerate(vocab_lst): + self.word2id[word] = ind + + self.vocab_lst = vocab_lst + + return tfidf_vec + + def save(self): + print("total time:", self.pmi_time + self.tfidf_time) + nx.write_weighted_edgelist(self.g, + f"{self.graph_path}/{self.dataset}.txt") + + print("\n") + + +def main(): + BuildGraph("mr") + BuildGraph("ohsumed") + BuildGraph("R52") + BuildGraph("R8") + BuildGraph("20ng") + + +if __name__ == '__main__': + main() diff --git a/data_processor.py b/data_processor.py index e3c806e..a613f83 100644 --- a/data_processor.py +++ b/data_processor.py @@ -1,162 +1,162 @@ -import os -import re -from collections import Counter -from collections import defaultdict -import numpy as np - -from tqdm import tqdm - - -class StringProcess(object): - def __init__(self): - self.other_char = re.compile(r"[^A-Za-z0-9(),!?\'\`]", flags=0) - self.num = re.compile(r"[+-]?\d+\.?\d*", flags=0) - # self.url = re.compile(r"[a-z]*[:.]+\S+|\n|\s+", flags=0) - self.url = re.compile( - r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", flags=0) - 
self.stop_words = None - self.nlp = None - - def clean_str(self, string): - string = re.sub(self.other_char, " ", string) - string = re.sub(r"\'s", " \'s", string) - string = re.sub(r"\'ve", " \'ve", string) - string = re.sub(r"n\'t", " n\'t", string) - string = re.sub(r"\'re", " \'re", string) - string = re.sub(r"\'d", " \'d", string) - string = re.sub(r"\'ll", " \'ll", string) - string = re.sub(r",", " , ", string) - string = re.sub(r"!", " ! ", string) - string = re.sub(r"\(", " \( ", string) - string = re.sub(r"\)", " \) ", string) - string = re.sub(r"\?", " \? ", string) - string = re.sub(r"\s{2,}", " ", string) - - return string.strip().lower() - - def norm_str(self, string): - string = re.sub(self.other_char, " ", string) - - if self.nlp is None: - from spacy.lang.en import English - self.nlp = English() - - new_doc = list() - doc = self.nlp(string) - for token in doc: - if token.is_space or token.is_punct: - continue - if token.is_digit: - token = "[num]" - else: - token = token.text - - new_doc.append(token) - - return " ".join(new_doc).lower() - - def lean_str_sst(self, string): - """ - Tokenization/string cleaning for the SST yelp_dataset - Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py - """ - string = re.sub(self.other_char, " ", string) - string = re.sub(r"\s{2,}", " ", string) - return string.strip().lower() - - def remove_stopword(self, string): - if self.stop_words is None: - from nltk.corpus import stopwords - self.stop_words = set(stopwords.words('english')) - - if type(string) is str: - string = string.split() - - new_string = list() - for word in string: - if word in self.stop_words: - continue - new_string.append(word) - - return " ".join(new_string) - - def replace_num(self, string): - result = re.sub(self.num, '', string) - return result - - def replace_urls(self, string): - result = re.sub(self.url, '', string) - result = ' '.join(re.split(' +|\n+', result)).strip() - return result - - -def remove_less_word(lines_str, word_st): - return " ".join([word for word in lines_str.split() if word in word_st]) - - -class CorpusProcess: - def __init__(self, dataset, encoding=None): - corpus_path = "data/text_dataset/corpus" - clean_corpus_path = "data/text_dataset/clean_corpus" - if not os.path.exists(clean_corpus_path): - os.makedirs(clean_corpus_path) - - self.dataset = dataset - self.corpus_name = f"{corpus_path}/{dataset}.txt" - self.save_name = f"{clean_corpus_path}/{dataset}.txt" - self.context_dct = defaultdict(dict) - - self.encoding = encoding - self.clean_text() - - def clean_text(self): - sp = StringProcess() - word_lst = list() - with open(self.corpus_name, mode="rb", encoding=self.encoding) as fin: - for indx, item in tqdm(enumerate(fin), desc="clean the text"): - data = item.strip().decode('latin1') - data = sp.clean_str(data) - if self.dataset not in {"mr"}: - data = sp.remove_stopword(data) - word_lst.extend(data.split()) - - word_st = set() - if self.dataset not in {"mr"}: - for word, value in Counter(word_lst).items(): - if value < 5: - continue - word_st.add(word) - else: - word_st = set(word_lst) - - doc_len_lst = list() - with open(self.save_name, mode='w') as fout: - with open(self.corpus_name, mode="rb", encoding=self.encoding) as fin: - for line in tqdm(fin): - lines_str = line.strip().decode('latin1') - lines_str = sp.clean_str(lines_str) - if self.dataset not in {"mr"}: - lines_str = sp.remove_stopword(lines_str) - lines_str = remove_less_word(lines_str, word_st) - - fout.write(lines_str) - fout.write(" \n") 
- - doc_len_lst.append(len(lines_str.split())) - - print("Average length:", np.mean(doc_len_lst)) - print("doc count:", len(doc_len_lst)) - print("Total number of words:", len(word_st)) - - -def main(): - CorpusProcess("R52") - # CorpusProcess("20ng") - # CorpusProcess("mr") - # CorpusProcess("ohsumed") - # CorpusProcess("R8") - # pass - - -if __name__ == '__main__': - main() +import os +import re +from collections import Counter +from collections import defaultdict +import numpy as np + +from tqdm import tqdm + + +class StringProcess(object): + def __init__(self): + self.other_char = re.compile(r"[^A-Za-z0-9(),!?\'\`]", flags=0) + self.num = re.compile(r"[+-]?\d+\.?\d*", flags=0) + # self.url = re.compile(r"[a-z]*[:.]+\S+|\n|\s+", flags=0) + self.url = re.compile( + r"(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]", flags=0) + self.stop_words = None + self.nlp = None + + def clean_str(self, string): + string = re.sub(self.other_char, " ", string) + string = re.sub(r"\'s", " \'s", string) + string = re.sub(r"\'ve", " \'ve", string) + string = re.sub(r"n\'t", " n\'t", string) + string = re.sub(r"\'re", " \'re", string) + string = re.sub(r"\'d", " \'d", string) + string = re.sub(r"\'ll", " \'ll", string) + string = re.sub(r",", " , ", string) + string = re.sub(r"!", " ! ", string) + string = re.sub(r"\(", " \( ", string) + string = re.sub(r"\)", " \) ", string) + string = re.sub(r"\?", " \? ", string) + string = re.sub(r"\s{2,}", " ", string) + + return string.strip().lower() + + def norm_str(self, string): + string = re.sub(self.other_char, " ", string) + + if self.nlp is None: + from spacy.lang.en import English + self.nlp = English() + + new_doc = list() + doc = self.nlp(string) + for token in doc: + if token.is_space or token.is_punct: + continue + if token.is_digit: + token = "[num]" + else: + token = token.text + + new_doc.append(token) + + return " ".join(new_doc).lower() + + def lean_str_sst(self, string): + """ + Tokenization/string cleaning for the SST yelp_dataset + Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py + """ + string = re.sub(self.other_char, " ", string) + string = re.sub(r"\s{2,}", " ", string) + return string.strip().lower() + + def remove_stopword(self, string): + if self.stop_words is None: + from nltk.corpus import stopwords + self.stop_words = set(stopwords.words('english')) + + if type(string) is str: + string = string.split() + + new_string = list() + for word in string: + if word in self.stop_words: + continue + new_string.append(word) + + return " ".join(new_string) + + def replace_num(self, string): + result = re.sub(self.num, '', string) + return result + + def replace_urls(self, string): + result = re.sub(self.url, '', string) + result = ' '.join(re.split(' +|\n+', result)).strip() + return result + + +def remove_less_word(lines_str, word_st): + return " ".join([word for word in lines_str.split() if word in word_st]) + + +class CorpusProcess: + def __init__(self, dataset, encoding=None): + corpus_path = "PyTorch_TextSGC/data/text_dataset/corpus" + clean_corpus_path = "PyTorch_TextSGC/data/text_dataset/clean_corpus" + if not os.path.exists(clean_corpus_path): + os.makedirs(clean_corpus_path) + + self.dataset = dataset + self.corpus_name = f"{corpus_path}/{dataset}.txt" + self.save_name = f"{clean_corpus_path}/{dataset}.txt" + self.context_dct = defaultdict(dict) + + self.encoding = encoding + self.clean_text() + + def clean_text(self): + sp = StringProcess() + word_lst = list() + with 
open(self.corpus_name, mode="rb", encoding=self.encoding) as fin: + for indx, item in tqdm(enumerate(fin), desc="clean the text"): + data = item.strip().decode('latin1') + data = sp.clean_str(data) + if self.dataset not in {"mr"}: + data = sp.remove_stopword(data) + word_lst.extend(data.split()) + + word_st = set() + if self.dataset not in {"mr"}: + for word, value in Counter(word_lst).items(): + if value < 5: + continue + word_st.add(word) + else: + word_st = set(word_lst) + + doc_len_lst = list() + with open(self.save_name, mode='w') as fout: + with open(self.corpus_name, mode="rb", encoding=self.encoding) as fin: + for line in tqdm(fin): + lines_str = line.strip().decode('latin1') + lines_str = sp.clean_str(lines_str) + if self.dataset not in {"mr"}: + lines_str = sp.remove_stopword(lines_str) + lines_str = remove_less_word(lines_str, word_st) + + fout.write(lines_str) + fout.write(" \n") + + doc_len_lst.append(len(lines_str.split())) + + print("Average length:", np.mean(doc_len_lst)) + print("doc count:", len(doc_len_lst)) + print("Total number of words:", len(word_st)) + + +def main(): + # CorpusProcess("R52") + # CorpusProcess("20ng") + CorpusProcess("mr") + CorpusProcess("ohsumed") + # CorpusProcess("R8") + # pass + + +if __name__ == '__main__': + main() diff --git a/layer.py b/layer.py index 85a9ba3..a7e662f 100644 --- a/layer.py +++ b/layer.py @@ -1,56 +1,13 @@ -import math +import torch.nn as nn import torch as th - -from torch.nn.parameter import Parameter from torch.nn.modules.module import Module -class GraphConvolution(Module): - """ - Simple pygGCN layer, similar to https://arxiv.org/abs/1609.02907 - """ - - def __init__(self, in_features, out_features, bias=True): - super(GraphConvolution, self).__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = Parameter(th.FloatTensor(in_features, out_features)) - if bias: - self.bias = Parameter(th.FloatTensor(out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1. 
/ math.sqrt(self.weight.size(1)) - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.uniform_(-stdv, stdv) - - def forward(self, infeatn, adj): - support = th.spmm(infeatn, self.weight) - output = th.spmm(adj, support) - if self.bias is not None: - return output + self.bias - else: - return output - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.in_features) + ' -> ' \ - + str(self.out_features) + ')' - - -class GCN(Module): - def __init__(self, nfeat, nhid, nclass, dropout): - super(GCN, self).__init__() - self.gc1 = GraphConvolution(nfeat, nhid) - self.gc2 = GraphConvolution(nhid, nclass) - self.dropout = dropout +class SGC(Module): + def __init__(self, nfeat, nclass): + super(SGC, self).__init__() + self.W = nn.Linear(nfeat, nclass) + th.nn.init.xavier_normal_(self.W.weight) - def forward(self, x, adj): - x = self.gc1(x, adj) - x = th.relu(x) - x = th.dropout(x, self.dropout, train=self.training) - x = self.gc2(x, adj) - return x + def forward(self, x): + return self.W(x) diff --git a/trainer.py b/trainer.py index 32d289e..c62442a 100644 --- a/trainer.py +++ b/trainer.py @@ -1,266 +1,291 @@ -import gc -import warnings -from time import time - -import networkx as nx -import numpy as np -import pandas as pd -import torch as th -from sklearn.model_selection import train_test_split - -from layer import GCN -from utils import accuracy -from utils import macro_f1 -from utils import CudaUse -from utils import EarlyStopping -from utils import LogResult -from utils import parameter_parser -from utils import preprocess_adj -from utils import print_graph_detail -from utils import read_file -from utils import return_seed - -th.backends.cudnn.deterministic = True -th.backends.cudnn.benchmark = True -warnings.filterwarnings("ignore") - - -def get_train_test(target_fn): - train_lst = list() - test_lst = list() - with read_file(target_fn, mode="r") as fin: - for indx, item in enumerate(fin): - if item.split("\t")[1] in {"train", "training", "20news-bydate-train"}: - train_lst.append(indx) - else: - test_lst.append(indx) - - return train_lst, test_lst - - -class PrepareData: - def __init__(self, args): - print("prepare data") - self.graph_path = "data/graph" - self.args = args - - # graph - graph = nx.read_weighted_edgelist(f"{self.graph_path}/{args.dataset}.txt" - , nodetype=int) - print_graph_detail(graph) - adj = nx.to_scipy_sparse_matrix(graph, - nodelist=list(range(graph.number_of_nodes())), - weight='weight', - dtype=np.float) - - adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) - - self.adj = preprocess_adj(adj, is_sparse=True) - - # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - - # features - self.nfeat_dim = graph.number_of_nodes() - row = list(range(self.nfeat_dim)) - col = list(range(self.nfeat_dim)) - value = [1.] 
* self.nfeat_dim - shape = (self.nfeat_dim, self.nfeat_dim) - indices = th.from_numpy( - np.vstack((row, col)).astype(np.int64)) - values = th.FloatTensor(value) - shape = th.Size(shape) - - self.features = th.sparse.FloatTensor(indices, values, shape) - - # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - # target - - target_fn = f"data/text_dataset/{self.args.dataset}.txt" - target = np.array(pd.read_csv(target_fn, - sep="\t", - header=None)[2]) - target2id = {label: indx for indx, label in enumerate(set(target))} - self.target = [target2id[label] for label in target] - self.nclass = len(target2id) - - # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - # train val test split - - self.train_lst, self.test_lst = get_train_test(target_fn) - - -class TextGCNTrainer: - def __init__(self, args, model, pre_data): - self.args = args - self.model = model - self.device = args.device - - self.max_epoch = self.args.max_epoch - self.set_seed() - - self.dataset = args.dataset - self.predata = pre_data - self.earlystopping = EarlyStopping(args.early_stopping) - - def set_seed(self): - th.manual_seed(self.args.seed) - np.random.seed(self.args.seed) - - def fit(self): - self.prepare_data() - self.model = self.model(nfeat=self.nfeat_dim, - nhid=self.args.nhid, - nclass=self.nclass, - dropout=self.args.dropout) - print(self.model.parameters) - self.model = self.model.to(self.device) - - self.optimizer = th.optim.Adam(self.model.parameters(), lr=self.args.lr) - self.criterion = th.nn.CrossEntropyLoss() - - self.model_param = sum(param.numel() for param in self.model.parameters()) - print('# model parameters:', self.model_param) - self.convert_tensor() - - start = time() - self.train() - self.train_time = time() - start - - @classmethod - def set_description(cls, desc): - string = "" - for key, value in desc.items(): - if isinstance(value, int): - string += f"{key}:{value} " - else: - string += f"{key}:{value:.4f} " - print(string) - - def prepare_data(self): - self.adj = self.predata.adj - self.nfeat_dim = self.predata.nfeat_dim - self.features = self.predata.features - self.target = self.predata.target - self.nclass = self.predata.nclass - - self.train_lst, self.val_lst = train_test_split(self.predata.train_lst, - test_size=self.args.val_ratio, - shuffle=True, - random_state=self.args.seed) - self.test_lst = self.predata.test_lst - - def convert_tensor(self): - self.model = self.model.to(self.device) - self.adj = self.adj.to(self.device) - self.features = self.features.to(self.device) - self.target = th.tensor(self.target).long().to(self.device) - self.train_lst = th.tensor(self.train_lst).long().to(self.device) - self.val_lst = th.tensor(self.val_lst).long().to(self.device) - - def train(self): - for epoch in range(self.max_epoch): - self.model.train() - self.optimizer.zero_grad() - - logits = self.model.forward(self.features, self.adj) - loss = self.criterion(logits[self.train_lst], - self.target[self.train_lst]) - - loss.backward() - self.optimizer.step() - - val_desc = self.val(self.val_lst) - - desc = dict(**{"epoch" : epoch, - "train_loss": loss.item(), - }, **val_desc) - - self.set_description(desc) - - if self.earlystopping(val_desc["val_loss"]): - break - - @th.no_grad() - def val(self, x, prefix="val"): - self.model.eval() - with th.no_grad(): - logits = self.model.forward(self.features, self.adj) - loss = self.criterion(logits[x], - self.target[x]) - acc = accuracy(logits[x], - self.target[x]) - f1, precision, recall = macro_f1(logits[x], - self.target[x], - num_classes=self.nclass) - - desc = { - f"{prefix}_loss": 
loss.item(), - "acc" : acc, - "macro_f1" : f1, - "precision" : precision, - "recall" : recall, - } - return desc - - @th.no_grad() - def test(self): - self.test_lst = th.tensor(self.test_lst).long().to(self.device) - test_desc = self.val(self.test_lst, prefix="test") - test_desc["train_time"] = self.train_time - test_desc["model_param"] = self.model_param - return test_desc - - -def main(dataset, times): - args = parameter_parser() - args.dataset = dataset - - args.device = th.device('cuda') if th.cuda.is_available() else th.device('cpu') - args.nhid = 200 - args.max_epoch = 200 - args.dropout = 0.5 - args.val_ratio = 0.1 - args.early_stopping = 10 - args.lr = 0.02 - model = GCN - - print(args) - - predata = PrepareData(args) - cudause = CudaUse() - - record = LogResult() - seed_lst = list() - for ind, seed in enumerate(return_seed(times)): - print(f"\n\n==> {ind}, seed:{seed}") - args.seed = seed - seed_lst.append(seed) - - framework = TextGCNTrainer(model=model, args=args, pre_data=predata) - framework.fit() - - if th.cuda.is_available(): - gpu_mem = cudause.gpu_mem_get(_id=0) - record.log_single(key="gpu_mem", value=gpu_mem) - - record.log(framework.test()) - - del framework - gc.collect() - - if th.cuda.is_available(): - th.cuda.empty_cache() - - print("==> seed set:") - print(seed_lst) - record.show_str() - - -if __name__ == '__main__': - # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< - # for d in ["mr", "ohsumed", "R52", "R8", "20ng"]: - # main(d) - main("mr", 1) - # main("ohsumed") - # main("R8", 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> +import gc +import warnings +from time import time + +import networkx as nx +import numpy as np +import pandas as pd +import torch as th +from sklearn.model_selection import train_test_split + +from layer import SGC +from utils import accuracy +from utils import macro_f1 +from utils import CudaUse +from utils import EarlyStopping +from utils import LogResult +from utils import parameter_parser +from utils import preprocess_adj +from utils import print_graph_detail +from utils import read_file +from utils import return_seed + +th.backends.cudnn.deterministic = True +th.backends.cudnn.benchmark = True +warnings.filterwarnings("ignore") + + +def get_train_test(target_fn): + train_lst = list() + test_lst = list() + with read_file(target_fn, mode="r") as fin: + for indx, item in enumerate(fin): + if item.split("\t")[1] in {"train", "training", "20news-bydate-train"}: + train_lst.append(indx) + else: + test_lst.append(indx) + + return train_lst, test_lst + + +class PrepareData: + def __init__(self, args): + print("prepare data") + self.graph_path = "PyTorch_TextSGC/data/graph" + self.args = args + + # graph + graph = nx.read_weighted_edgelist(f"{self.graph_path}/{args.dataset}.txt" + , nodetype=int) + print_graph_detail(graph) + adj = nx.to_scipy_sparse_matrix(graph, + nodelist=list(range(graph.number_of_nodes())), + weight='weight', + dtype=np.float) + + adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj) + + self.adj = preprocess_adj(adj, is_sparse=True) + + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + # target + + target_fn = f"PyTorch_TextSGC/data/text_dataset/{self.args.dataset}.txt" + target = np.array(pd.read_csv(target_fn, + sep="\t", + header=None)[2]) + target2id = {label: indx for indx, label in enumerate(set(target))} + self.target = [target2id[label] for label in target] + self.nclass = len(target2id) + + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + # train val test split + + self.train_lst, self.test_lst = 
get_train_test(target_fn) + + +class TextSCGTrainer: + def __init__(self, args, model, pre_data): + self.args = args + self.model = model + self.device = args.device + + self.max_epoch = self.args.max_epoch + self.set_seed() + + self.dataset = args.dataset + self.predata = pre_data + self.earlystopping = EarlyStopping(args.early_stopping) + + def set_seed(self): + th.manual_seed(self.args.seed) + np.random.seed(self.args.seed) + + def fit(self): + self.prepare_data() + self.convert_tensor() + + self.model = self.model(nfeat=self.nfeat_dim, + nclass=self.nclass) + self.model = self.model.to(self.device) + + self.optimizer = th.optim.Adam(self.model.parameters(), lr=self.args.lr) + self.criterion = th.nn.CrossEntropyLoss() + + self.model_param = sum(param.numel() for param in self.model.parameters()) + print('# model parameters:', self.model_param) + + start = time() + self.train() + self.train_time = time() - start + self.pre_time + + @classmethod + def set_description(cls, desc): + string = "" + for key, value in desc.items(): + if isinstance(value, int): + string += f"{key}:{value} " + else: + string += f"{key}:{value:.4f} " + print(string) + + def prepare_data(self): + self.adj = self.predata.adj + self.target = self.predata.target + self.nclass = self.predata.nclass + + self.train_lst, self.val_lst = train_test_split(self.predata.train_lst, + test_size=self.args.val_ratio, + shuffle=True, + random_state=self.args.seed) + self.test_lst = self.predata.test_lst + + @th.no_grad() + def sgc_precompute(self, sp_adj, adj_dense, train_lst, val_lst, test_lst): + start = time() + + # train + feats = adj_dense[:, train_lst].to(self.device) + feats = th.spmm(sp_adj, feats).t() + + train_feats_max, _ = feats.max(dim=0, keepdim=True) + train_feats_min, _ = feats.min(dim=0, keepdim=True) + + train_feats_range = train_feats_max - train_feats_min + useful_features_dim = train_feats_range.squeeze().gt(0).nonzero().squeeze() + feats = feats[:, useful_features_dim] + train_feats_range = train_feats_range[:, useful_features_dim] + train_feats_min = train_feats_min[:, useful_features_dim] + train_vec = ((feats - train_feats_min) / train_feats_range) + + # val + feats = adj_dense[:, val_lst].to(self.device) + feats = th.spmm(sp_adj, feats).t() + feats = feats[:, useful_features_dim] + val_vec = ((feats - train_feats_min) / train_feats_range) + + # test + feats = adj_dense[:, test_lst].to(self.device) + feats = th.spmm(sp_adj, feats).t() + feats = feats[:, useful_features_dim] + test_vec = ((feats - train_feats_min) / train_feats_range).cpu() + + print(train_vec.size()) + print(val_vec.size()) + print(test_vec.size()) + return train_vec, val_vec, test_vec, time() - start + + def convert_tensor(self): + self.target = th.tensor(self.target).long().to(self.device) + + self.train_lst = th.tensor(self.train_lst).long().to(self.device) + self.val_lst = th.tensor(self.val_lst).long().to(self.device) + self.test_lst = th.tensor(self.test_lst).long().to(self.device) + + adj_dense = self.adj.to_dense().to(self.device) + self.adj = self.adj.to(self.device) + self.train_vec, self.val_vec, self.test_vec, self.pre_time = self.sgc_precompute(self.adj, + adj_dense, + self.train_lst, + self.val_lst, + self.test_lst) + self.nfeat_dim = self.train_vec.size(1) + + def train(self): + for epoch in range(self.max_epoch): + self.model.train() + self.optimizer.zero_grad() + + logits = self.model.forward(self.train_vec) + loss = self.criterion(logits, + self.target[self.train_lst]) + + loss.backward() + self.optimizer.step() + + 
val_desc = self.val(self.val_vec, self.val_lst) + + desc = dict(**{"epoch" : epoch, + "train_loss": loss.item(), + }, **val_desc) + + self.set_description(desc) + + if self.earlystopping(val_desc["val_loss"]): + break + + @th.no_grad() + def val(self, feats, ind, prefix="val"): + self.model.eval() + with th.no_grad(): + logits = self.model.forward(feats) + loss = self.criterion(logits, + self.target[ind]) + acc = accuracy(logits, + self.target[ind]) + f1, precision, recall = macro_f1(logits, + self.target[ind], + num_classes=self.nclass) + + desc = { + f"{prefix}_loss": loss.item(), + "acc" : acc, + "macro_f1" : f1, + "precision" : precision, + "recall" : recall, + } + return desc + + @th.no_grad() + def test(self): + test_vec = self.test_vec.to(self.device) + test_lst = th.tensor(self.test_lst).long().to(self.device) + test_desc = self.val(test_vec, test_lst, prefix="test") + test_desc["train_time"] = self.train_time + test_desc["model_param"] = self.model_param + return test_desc + + +def main(dataset, times): + args = parameter_parser() + args.dataset = dataset + + args.device = th.device('cuda') if th.cuda.is_available() else th.device('cpu') + args.nhid = 200 + args.max_epoch = 200 + args.dropout = 0.5 + args.val_ratio = 0.1 + args.early_stopping = 10 + args.lr = 0.01 + model = SGC + + print(args) + + predata = PrepareData(args) + cudause = CudaUse() + + record = LogResult() + seed_lst = list() + for ind, seed in enumerate(return_seed(times)): + print(f"\n\n==> {ind}, seed:{seed}") + args.seed = seed + seed_lst.append(seed) + + framework = TextSCGTrainer(model=model, args=args, pre_data=predata) + framework.fit() + + if th.cuda.is_available(): + gpu_mem = cudause.gpu_mem_get(_id=0) + record.log_single(key="gpu_mem", value=gpu_mem) + + record.log(framework.test()) + + del framework + gc.collect() + + if th.cuda.is_available(): + th.cuda.empty_cache() + + print("==> seed set:") + print(seed_lst) + record.show_str() + + +if __name__ == '__main__': + # <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<< + # for d in ["mr", "ohsumed", "R52", "R8", "20ng"]: + # main(d) + main("mr", 1) + # main("ohsumed") + # main("R8", 1) + # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> diff --git a/utils.py b/utils.py index 702680d..6a67ef5 100644 --- a/utils.py +++ b/utils.py @@ -1,238 +1,239 @@ -import argparse -import random -from collections import defaultdict - -import numpy as np -import torch as th -import scipy.sparse as sp - - -def macro_f1(pred, targ, num_classes=None): - pred = th.max(pred, 1)[1] - tp_out = [] - fp_out = [] - fn_out = [] - if num_classes is None: - num_classes = sorted(set(targ.cpu().numpy().tolist())) - else: - num_classes = range(num_classes) - for i in num_classes: - tp = ((pred == i) & (targ == i)).sum().item() # 预测为i,且标签的确为i的 - fp = ((pred == i) & (targ != i)).sum().item() # 预测为i,但标签不是为i的 - fn = ((pred != i) & (targ == i)).sum().item() # 预测不是i,但标签是i的 - tp_out.append(tp) - fp_out.append(fp) - fn_out.append(fn) - - eval_tp = np.array(tp_out) - eval_fp = np.array(fp_out) - eval_fn = np.array(fn_out) - - precision = eval_tp / (eval_tp + eval_fp) - precision[np.isnan(precision)] = 0 - precision = np.mean(precision) - - recall = eval_tp / (eval_tp + eval_fn) - recall[np.isnan(recall)] = 0 - recall = np.mean(recall) - - f1 = 2 * (precision * recall) / (precision + recall) - return f1, precision, recall - - -def accuracy(pred, targ): - pred = th.max(pred, 1)[1] - acc = ((pred == targ).float()).sum().item() / targ.size()[0] - - return acc - - -class CudaUse(object): - def 
__init__(self): - self.cuda_available = th.cuda.is_available() - if self.cuda_available: - from fastai.utils.pynvml_gate import load_pynvml_env - self.pynvml = load_pynvml_env() - - def get_cuda_id(self): - if self.cuda_available: - gpu_mem = sorted(self.gpu_mem_get_all(), key=lambda item: item.free, reverse=True) - low_use_id = gpu_mem[0].id - return th.device(f'cuda:{low_use_id}') - else: - return th.device('cpu') - - def gpu_mem_get_all(self): - "get total, used and free memory (in MBs) for each available gpu" - return list(map(self.gpu_mem_get, range(self.pynvml.nvmlDeviceGetCount()))) - - def gpu_mem_get(self, _id=None): - """get total, used and free memory (in MBs) for gpu `id`. if `id` is not passed, - currently selected torch device is used""" - from collections import namedtuple - GPUMemory = namedtuple('GPUMemory', ['total', 'free', 'used', 'id']) - - if _id is None: - _id = th.cuda.current_device() - try: - handle = self.pynvml.nvmlDeviceGetHandleByIndex(_id) - info = self.pynvml.nvmlDeviceGetMemoryInfo(handle) - # return GPUMemory(*(map(b2mb, [info.total, info.free, info.used])), id=_id) - return b2mb(info.used) - except: - return GPUMemory(0, 0, 0, -1) - - -def read_file(path, mode='r', encoding=None): - if mode not in {"r", "rb"}: - raise ValueError("only read") - return open(path, mode=mode, encoding=encoding) - - -def print_graph_detail(graph): - """ - 格式化显示Graph参数 - :param graph: - :return: - """ - import networkx as nx - dst = {"nodes" : nx.number_of_nodes(graph), - "edges" : nx.number_of_edges(graph), - "selfloops": nx.number_of_selfloops(graph), - "isolates" : nx.number_of_isolates(graph), - "覆盖度" : 1 - nx.number_of_isolates(graph) / nx.number_of_nodes(graph), } - print_table(dst) - - -def print_table(dst): - table_title = list(dst.keys()) - from prettytable import PrettyTable - table = PrettyTable(field_names=table_title, header_style="title", header=True, border=True, - hrules=1, padding_width=2, align="c") - table.float_format = "0.4" - table.add_row([dst[i] for i in table_title]) - print(table) - - -def return_seed(nums=10): - # seed = [47, 17, 1, 3, 87, 300, 77, 23, 13] - seed = random.sample(range(0, 100000), nums) - return seed - - -def preprocess_adj(adj, is_sparse=False): - """Preprocessing of adjacency matrix for simple pygGCN model and conversion to - tuple representation.""" - adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) - if is_sparse: - adj_normalized = sparse_mx_to_torch_sparse_tensor(adj_normalized) - return adj_normalized - else: - return th.from_numpy(adj_normalized.A).float() - - -def sparse_mx_to_torch_sparse_tensor(sparse_mx): - """Convert a scipy sparse matrix to a torch sparse tensor.""" - sparse_mx = sparse_mx.tocoo().astype(np.float32) - indices = th.from_numpy( - np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) - values = th.from_numpy(sparse_mx.data) - shape = th.Size(sparse_mx.shape) - return th.sparse.FloatTensor(indices, values, shape) - - -def normalize_adj(adj): - """Symmetrically normalize adjacency matrix.""" - adj = sp.coo_matrix(adj) - rowsum = np.array(adj.sum(1)) - d_inv_sqrt = np.power(rowsum, -0.5).flatten() - d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. 
- d_mat_inv_sqrt = sp.diags(d_inv_sqrt) - return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() - - -class EarlyStopping: - """Early stops the training if validation loss doesn't improve after a given patience.""" - - def __init__(self, patience=7, verbose=False, delta=0): - """ - Args: - patience (int): How long to wait after last time validation loss improved. - Default: 7 - verbose (bool): If True, prints a message for each validation loss improvement. - Default: False - delta (float): Minimum change in the monitored quantity to qualify as an improvement. - Default: 0 - """ - self.patience = patience - self.verbose = verbose - self.counter = 0 - self.best_score = None - self.early_stop = False - self.val_loss_min = np.Inf - self.delta = delta - self.model_path = "hdd_data/prepare_dataset/model/model.pt" - - def __call__(self, val_loss, model=None): - - score = -val_loss - - if self.best_score is None: - self.best_score = score - # self.save_checkpoint(val_loss, model) - elif score < self.best_score + self.delta: - self.counter += 1 - if self.verbose: - print(f'EarlyStopping counter: {self.counter} out of {self.patience}') - if self.counter >= self.patience: - self.early_stop = True - return True - else: - self.best_score = score - # self.save_checkpoint(val_loss, model) - self.counter = 0 - - def save_checkpoint(self, val_loss, model): - '''Saves model when validation loss decrease.''' - if self.verbose: - print( - f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') - th.save(model.state_dict(), self.model_path) - self.val_loss_min = val_loss - - def load_model(self): - return th.load(self.model_path) - - -def parameter_parser(): - """ - A method to parse up command line parameters. By default it trains on the PubMed yelp_dataset. - The default hyperparameters give a good quality representation without grid search. 
- """ - parser = argparse.ArgumentParser(description="Run .") - - return parser.parse_args() - - -class LogResult: - def __init__(self): - self.result = defaultdict(list) - pass - - def log(self, result: dict): - for key, value in result.items(): - self.result[key].append(value) - - def log_single(self, key, value): - self.result[key].append(value) - - def show_str(self): - print() - string = "" - for key, value_lst in self.result.items(): - value = np.mean(value_lst) - if isinstance(value, int): - string += f"{key}:\n{value}\n{max(value_lst)}\n{min(value_lst)}\n" - else: - string += f"{key}:\n{value:.4f}\n{max(value_lst):.4f}\n{min(value_lst):.4f} \n" - print(string) +import argparse +import random +from collections import defaultdict + +import numpy as np +import torch as th +import scipy.sparse as sp + + +def macro_f1(pred, targ, num_classes=None): + pred = th.max(pred, 1)[1] + tp_out = [] + fp_out = [] + fn_out = [] + if num_classes is None: + num_classes = sorted(set(targ.cpu().numpy().tolist())) + else: + num_classes = range(num_classes) + for i in num_classes: + tp = ((pred == i) & (targ == i)).sum().item() # 预测为i,且标签的确为i的 + fp = ((pred == i) & (targ != i)).sum().item() # 预测为i,但标签不是为i的 + fn = ((pred != i) & (targ == i)).sum().item() # 预测不是i,但标签是i的 + tp_out.append(tp) + fp_out.append(fp) + fn_out.append(fn) + + eval_tp = np.array(tp_out) + eval_fp = np.array(fp_out) + eval_fn = np.array(fn_out) + + precision = eval_tp / (eval_tp + eval_fp) + precision[np.isnan(precision)] = 0 + precision = np.mean(precision) + + recall = eval_tp / (eval_tp + eval_fn) + recall[np.isnan(recall)] = 0 + recall = np.mean(recall) + + f1 = 2 * (precision * recall) / (precision + recall) + return f1, precision, recall + + +def accuracy(pred, targ): + pred = th.max(pred, 1)[1] + acc = ((pred == targ).float()).sum().item() / targ.size()[0] + + return acc + + +class CudaUse(object): + def __init__(self): + self.cuda_available = th.cuda.is_available() + if self.cuda_available: + from fastai.utils.pynvml_gate import load_pynvml_env + self.pynvml = load_pynvml_env() + + def get_cuda_id(self): + if self.cuda_available: + gpu_mem = sorted(self.gpu_mem_get_all(), key=lambda item: item.free, reverse=True) + low_use_id = gpu_mem[0].id + return th.device(f'cuda:{low_use_id}') + else: + return th.device('cpu') + + def gpu_mem_get_all(self): + "get total, used and free memory (in MBs) for each available gpu" + return list(map(self.gpu_mem_get, range(self.pynvml.nvmlDeviceGetCount()))) + + def gpu_mem_get(self, _id=None): + """get total, used and free memory (in MBs) for gpu `id`. 
if `id` is not passed, + currently selected torch device is used""" + from collections import namedtuple + GPUMemory = namedtuple('GPUMemory', ['total', 'free', 'used', 'id']) + + if _id is None: + _id = th.cuda.current_device() + try: + handle = self.pynvml.nvmlDeviceGetHandleByIndex(_id) + info = self.pynvml.nvmlDeviceGetMemoryInfo(handle) + # return GPUMemory(*(map(b2mb, [info.total, info.free, info.used])), id=_id) + return b2mb(info.used) + except: + return GPUMemory(0, 0, 0, -1) + + +def read_file(path, mode='r', encoding=None): + if mode not in {"r", "rb"}: + raise ValueError("only read") + return open(path, mode=mode, encoding=encoding) + + +def print_graph_detail(graph): + """ + 格式化显示Graph参数 + :param graph: + :return: + """ + import networkx as nx + dst = {"nodes" : nx.number_of_nodes(graph), + "edges" : nx.number_of_edges(graph), + "selfloops": nx.number_of_selfloops(graph), + "isolates" : nx.number_of_isolates(graph), + "覆盖度" : 1 - nx.number_of_isolates(graph) / nx.number_of_nodes(graph), } + print_table(dst) + + +def print_table(dst): + table_title = list(dst.keys()) + from prettytable import PrettyTable + table = PrettyTable(field_names=table_title, header_style="title", header=True, border=True, + hrules=1, padding_width=2, align="c") + table.float_format = "0.4" + table.add_row([dst[i] for i in table_title]) + print(table) + + +def return_seed(nums=10): + # seed = [47, 17, 1, 3, 87, 300, 77, 23, 13] + seed = random.sample(range(0, 100000), nums) + return seed + + +def preprocess_adj(adj, is_sparse=False): + """Preprocessing of adjacency matrix for simple pygGCN model and conversion to + tuple representation.""" + adj_normalized = normalize_adj(adj + sp.eye(adj.shape[0])) + if is_sparse: + adj_normalized = sparse_mx_to_torch_sparse_tensor(adj_normalized) + return adj_normalized + else: + return th.from_numpy(adj_normalized.A).float() + + +def sparse_mx_to_torch_sparse_tensor(sparse_mx): + """Convert a scipy sparse matrix to a torch sparse tensor.""" + sparse_mx = sparse_mx.tocoo().astype(np.float32) + indices = th.from_numpy( + np.vstack((sparse_mx.row, sparse_mx.col)).astype(np.int64)) + values = th.from_numpy(sparse_mx.data) + shape = th.Size(sparse_mx.shape) + return th.sparse.FloatTensor(indices, values, shape) + + +def normalize_adj(adj): + """Symmetrically normalize adjacency matrix.""" + adj = sp.coo_matrix(adj) + rowsum = np.array(adj.sum(1)) + d_inv_sqrt = np.power(rowsum, -0.5).flatten() + d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0. + d_mat_inv_sqrt = sp.diags(d_inv_sqrt) + return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo() + + +class EarlyStopping: + """Early stops the training if validation loss doesn't improve after a given patience.""" + + def __init__(self, patience=7, verbose=False, delta=0): + """ + Args: + patience (int): How long to wait after last time validation loss improved. + Default: 7 + verbose (bool): If True, prints a message for each validation loss improvement. + Default: False + delta (float): Minimum change in the monitored quantity to qualify as an improvement. 
+ Default: 0 + """ + self.patience = patience + self.verbose = verbose + self.counter = 0 + self.best_score = None + self.early_stop = False + self.val_loss_min = np.Inf + self.delta = delta + self.model_path = "hdd_data/prepare_dataset/model/model.pt" + + def __call__(self, val_loss, model=None): + + score = -val_loss + + if self.best_score is None: + self.best_score = score + # self.save_checkpoint(val_loss, model) + elif score < self.best_score + self.delta: + self.counter += 1 + if self.verbose: + print(f'EarlyStopping counter: {self.counter} out of {self.patience}') + if self.counter >= self.patience: + self.early_stop = True + return True + else: + self.best_score = score + # self.save_checkpoint(val_loss, model) + self.counter = 0 + + def save_checkpoint(self, val_loss, model): + '''Saves model when validation loss decrease.''' + if self.verbose: + print( + f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}). Saving model ...') + th.save(model.state_dict(), self.model_path) + self.val_loss_min = val_loss + + def load_model(self): + return th.load(self.model_path) + + +def parameter_parser(): + """ + A method to parse up command line parameters. By default it trains on the PubMed yelp_dataset. + The default hyperparameters give a good quality representation without grid search. + """ + parser = argparse.ArgumentParser(description="Run .") + + return parser.parse_args() + + +class LogResult: + def __init__(self): + self.result = defaultdict(list) + pass + + def log(self, result: dict): + for key, value in result.items(): + self.result[key].append(value) + + def log_single(self, key, value): + self.result[key].append(value) + + def show_str(self): + print() + string = "" + for key, value_lst in self.result.items(): + value = np.mean(value_lst) + if isinstance(value, int): + string += f"{key}:\n{value}\n{max(value_lst)}\n{min(value_lst)}\n" + else: + # string +=f"{key}:\n{value:.4f}\n{max(value_lst):.4f}\n{min(value_lst):.4f} \n" + print(key," ",":",value,) + print(string) From a249870494b140d339874016454e7216cddf3252 Mon Sep 17 00:00:00 2001 From: 2314 <131743576+gxy2314@users.noreply.github.com> Date: Thu, 26 Dec 2024 14:40:27 +0800 Subject: [PATCH 2/2] resolve training issues with mistakes - Fixed `ModuleNotFoundError: No module named 'fastai.utils'` during training. - Resolved `RuntimeError: indices should be either on CPU or on the same device as the indexed tensor`. - Addressed output errors during result display.
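
Note on the device error listed in both commit messages: "RuntimeError: indices should be either on CPU or on the same device as the indexed tensor" is raised when a tensor on the GPU is indexed with index tensors that still live on the CPU. The patch handles this in trainer.py by moving the index tensors with .to(self.device) before they are used for indexing (see convert_tensor and test). Below is a minimal standalone sketch of that pattern, assuming only PyTorch; logits, target and train_idx are placeholder names for illustration, not identifiers taken from this patch.

    import torch as th

    device = th.device("cuda" if th.cuda.is_available() else "cpu")

    # Model outputs and labels already placed on the target device.
    logits = th.randn(6, 3, device=device)
    target = th.tensor([0, 2, 1, 0, 1, 2]).long().to(device)

    # The index tensor must be moved to the same device before indexing;
    # indexing a CUDA tensor with a CPU LongTensor triggers the RuntimeError
    # quoted in the commit message.
    train_idx = th.tensor([0, 1, 3]).long().to(device)

    loss = th.nn.functional.cross_entropy(logits[train_idx], target[train_idx])
    print(loss.item())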