From a9e0b9e85ca7fbef3071e6d312997277a3f37fd3 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 00:48:48 -0500 Subject: [PATCH 01/15] changes for optional raw text support --- .gitignore | 3 + datasets/cora/cora.ipynb | 171 ++++++++-- datasets/cora/metadata.json | 9 + gli/__init__.py | 1 + gli/dataloading.py | 14 +- gli/graph.py | 40 ++- gli/io/graph.py | 29 +- gli/raw_text_utils.py | 600 ++++++++++++++++++++++++++++++++++++ gli/utils.py | 40 ++- 9 files changed, 856 insertions(+), 51 deletions(-) create mode 100644 gli/raw_text_utils.py diff --git a/.gitignore b/.gitignore index 183f686f..60c4f8f2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,9 @@ # MacOS .DS_Store files .DS_Store +# Raw dataset folders +cora_raw/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/datasets/cora/cora.ipynb b/datasets/cora/cora.ipynb index d59f8693..509771b1 100644 --- a/datasets/cora/cora.ipynb +++ b/datasets/cora/cora.ipynb @@ -7,6 +7,16 @@ "# Cora Example" ] }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -16,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -54,7 +64,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -75,11 +85,64 @@ "print(edge.shape)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Load raw text for cora dataset" + ] + }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/jinhuang/opt/miniconda3/envs/arxiv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "dict_keys(['title', 'abs', 'label'])\n", + "title ['Title: The megaprior heuristic for discovering protein sequence patterns ', 'Title: Applications of machine learning: a medical follow up study ', 'Title: Submitted to NIPS96, Section: Applications. Preference: Oral presentation Reinforcement Learning for Dynamic Channel Allocation in']\n", + "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. 
This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ', 'Abstract: This paper describes preliminary work that aims to apply some learning strategies to a medical follow-up study. An investigation of the application of three machine learning algorithms-1R, FOIL and InductH to identify risk factors that govern the colposuspension cure rate has been made. The goal of this study is to induce a generalised description or explanation of the classification attribute, colposuspension cure rate (completely cured, improved, unchanged and worse) from the 767 examples in the questionnaires. We looked for a set of rules that described which risk factors result in differences of cure rate. The results were encouraging, and indicate that machine learning can play a useful role in large scale medical problem solving. ', 'Abstract: In cellular telephone systems, an important problem is to dynamically allocate the communication resource (channels) so as to maximize service in a stochastic caller environment. This problem is naturally formulated as a dynamic programming problem and we use a reinforcement learning (RL) method to find dynamic channel allocation policies that are better than previous heuristic solutions. The policies obtained perform well for a broad variety of call traffic patterns. We present results on a large cellular system In cellular communication systems, an important problem is to allocate the communication resource (bandwidth) so as to maximize the service provided to a set of mobile callers whose demand for service changes stochastically. A given geographical area is divided into mutually disjoint cells, and each cell serves the calls that are within its boundaries (see Figure 1a). The total system bandwidth is divided into channels, with each channel centered around a frequency. Each channel can be used simultaneously at different cells, provided these cells are sufficiently separated spatially, so that there is no interference between them. The minimum separation distance between simultaneous reuse of the same channel is called the channel reuse constraint . When a call requests service in a given cell either a free channel (one that does not violate the channel reuse constraint) may be assigned to the call, or else the call is blocked from the system; this will happen if no free channel can be found. Also, when a mobile caller crosses from one cell to another, the call is \"handed off\" to the cell of entry; that is, a new free channel is provided to the call at the new cell. If no such channel is available, the call must be dropped/disconnected from the system. One objective of a channel allocation policy is to allocate the available channels to calls so that the number of blocked calls is minimized. An additional objective is to minimize the number of calls that are dropped when they are handed off to a busy cell. These two objectives must be weighted appropriately to reflect their relative importance, since dropping existing calls is generally more undesirable than blocking new calls. 
with approximately 70 49 states.']\n", + "label ['Neural Networks', 'Rule Learning', 'Reinforcement Learning']\n" + ] + } + ], + "source": [ + "import sys\n", + "sys.path.append(\"../../\")\n", + "from gli.raw_text_utils import load_data\n", + "\n", + "_, raw_text_dict = load_data(dataset=\"cora\", use_text=True)\n", + "\n", + "print(raw_text_dict.keys())\n", + "\n", + "for key, item in raw_text_dict.items():\n", + " print(key, item[:3])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "key_to_loc = {'RawText_NodeRawText': {'optional file': 'cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz', 'key': 'RawText_NodeRawText'}, 'Edge_Edge': {'file': 'cora__graph__6c912909fa18eff10797210ea5e485fe.npz', 'key': 'Edge_Edge'}, 'Node_NodeLabel': {'file': 'cora__graph__6c912909fa18eff10797210ea5e485fe.npz', 'key': 'Node_NodeLabel'}, 'Node_NodeFeature': {'file': 'cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz'}, 'Graph_NodeList': {'file': 'cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz'}}\n", + "r.name = 'NodeRawText'\n" + ] + } + ], "source": [ "from gli.io import save_graph, Attribute\n", "\n", @@ -99,12 +162,22 @@ " \"Tensor\",\n", " )\n", "]\n", + "raw_text_attrs = [\n", + " Attribute(\n", + " \"NodeRawText\",\n", + " raw_text_dict,\n", + " \"Raw text of title, abstract and label of each node in Cora dataset, string.\",\n", + " \"Dict\",\n", + " 'Dict[str, list[str]]'\n", + " )\n", + "]\n", "\n", "metadata = save_graph(\n", " name=\"cora\",\n", " edge=edge,\n", " num_nodes=graph.num_nodes(),\n", " node_attrs=node_attrs,\n", + " raw_text_attrs=raw_text_attrs,\n", " description=\"CORA dataset.\",\n", " cite=\n", " \"@inproceedings{yang2016revisiting,\\ntitle={Revisiting semi-supervised learning with graph embeddings},\\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\\nbooktitle={International conference on machine learning},\\npages={40--48},\\nyear={2016},\\norganization={PMLR}\\n}\",\n", @@ -120,7 +193,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -155,6 +228,15 @@ " \"_NodeList\": {\n", " \"file\": \"cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz\"\n", " }\n", + " },\n", + " \"RawText\": {\n", + " \"NodeRawText\": {\n", + " \"description\": \"Raw text of title, abstract and label of each node in Cora dataset, string.\",\n", + " \"type\": \"Dict\",\n", + " \"format\": \"Dict[str, list[str]]\",\n", + " \"optional file\": \"cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz\",\n", + " \"key\": \"RawText_NodeRawText\"\n", + " }\n", " }\n", " },\n", " \"citation\": \"@inproceedings{yang2016revisiting,\\ntitle={Revisiting semi-supervised learning with graph embeddings},\\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\\nbooktitle={International conference on machine learning},\\npages={40--48},\\nyear={2016},\\norganization={PMLR}\\n}\",\n", @@ -177,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -188,7 +270,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -216,7 +298,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -260,7 +342,7 @@ }, { "cell_type": "code", - 
"execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -275,7 +357,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/jimmy/Projects/Private/gli/gli/utils.py:254: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n", + "/Users/jinhuang/Documents/research/gli/datasets/cora/../../gli/utils.py:263: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1682343673238/work/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n", " return torch.sparse_csr_tensor(crow_indices,\n" ] }, @@ -287,7 +369,7 @@ " edata_schemes={})" ] }, - "execution_count": 8, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -301,37 +383,74 @@ "data[0]" ] }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "All data files already exist. Skip downloading.\n", + "CORA dataset.\n", + "All data files already exist. Skip downloading.\n", + "Node classification on CORA dataset. Planetoid split.\n", + "Graph(num_nodes=2708, num_edges=10556,\n", + " ndata_schemes={'NodeFeature': Scheme(shape=(1433,), dtype=torch.float32), 'NodeLabel': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool)}\n", + " edata_schemes={})\n" + ] + } + ], + "source": [ + "from gli.dataloading import get_gli_dataset\n", + "\n", + "dataset = get_gli_dataset(\"cora\", \"NodeClassification\", load_raw_text=True, verbose=True)\n", + "\n", + "data = dataset[0]\n", + "\n", + "print(data)\n" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "After adding LICENSE and README.md, the dataset directory will be the following." + "The raw text are saved in:" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "\u001b[1;36m.\u001b[00m\n", - "├── LICENSE\n", - "├── README.md\n", - "├── cora.ipynb\n", - "├── cora__graph__6c912909fa18eff10797210ea5e485fe.npz\n", - "├── cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz\n", - "├── cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz\n", - "├── cora__task_node_classification_1__41e167258678b585872679839ce9c40f.npz\n", - "├── metadata.json\n", - "└── task_node_classification_1.json\n", - "\n", - "0 directories, 9 files\n" + "title ['Title: The megaprior heuristic for discovering protein sequence patterns ', 'Title: Applications of machine learning: a medical follow up study ', 'Title: Submitted to NIPS96, Section: Applications. Preference: Oral presentation Reinforcement Learning for Dynamic Channel Allocation in']\n", + "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. 
These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ', 'Abstract: This paper describes preliminary work that aims to apply some learning strategies to a medical follow-up study. An investigation of the application of three machine learning algorithms-1R, FOIL and InductH to identify risk factors that govern the colposuspension cure rate has been made. The goal of this study is to induce a generalised description or explanation of the classification attribute, colposuspension cure rate (completely cured, improved, unchanged and worse) from the 767 examples in the questionnaires. We looked for a set of rules that described which risk factors result in differences of cure rate. The results were encouraging, and indicate that machine learning can play a useful role in large scale medical problem solving. ', 'Abstract: In cellular telephone systems, an important problem is to dynamically allocate the communication resource (channels) so as to maximize service in a stochastic caller environment. This problem is naturally formulated as a dynamic programming problem and we use a reinforcement learning (RL) method to find dynamic channel allocation policies that are better than previous heuristic solutions. The policies obtained perform well for a broad variety of call traffic patterns. We present results on a large cellular system In cellular communication systems, an important problem is to allocate the communication resource (bandwidth) so as to maximize the service provided to a set of mobile callers whose demand for service changes stochastically. A given geographical area is divided into mutually disjoint cells, and each cell serves the calls that are within its boundaries (see Figure 1a). The total system bandwidth is divided into channels, with each channel centered around a frequency. Each channel can be used simultaneously at different cells, provided these cells are sufficiently separated spatially, so that there is no interference between them. The minimum separation distance between simultaneous reuse of the same channel is called the channel reuse constraint . When a call requests service in a given cell either a free channel (one that does not violate the channel reuse constraint) may be assigned to the call, or else the call is blocked from the system; this will happen if no free channel can be found. 
Also, when a mobile caller crosses from one cell to another, the call is \"handed off\" to the cell of entry; that is, a new free channel is provided to the call at the new cell. If no such channel is available, the call must be dropped/disconnected from the system. One objective of a channel allocation policy is to allocate the available channels to calls so that the number of blocked calls is minimized. An additional objective is to minimize the number of calls that are dropped when they are handed off to a busy cell. These two objectives must be weighted appropriately to reflect their relative importance, since dropping existing calls is generally more undesirable than blocking new calls. with approximately 70 49 states.']\n", + "label ['Neural Networks', 'Rule Learning', 'Reinforcement Learning']\n" ] } ], + "source": [ + "for key, item in data.NodeRawText['RawText_NodeRawText'].items():\n", + " print(key, item[:3])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After adding LICENSE and README.md, the dataset directory will be the following." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "!tree ." ] @@ -353,7 +472,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.13" + "version": "3.9.17" }, "orig_nbformat": 4, "vscode": { diff --git a/datasets/cora/metadata.json b/datasets/cora/metadata.json index f4895f41..8a058581 100644 --- a/datasets/cora/metadata.json +++ b/datasets/cora/metadata.json @@ -26,6 +26,15 @@ "_NodeList": { "file": "cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz" } + }, + "RawText": { + "NodeRawText": { + "description": "Raw text of title, abstract and label of each node in Cora dataset, string.", + "type": "Dict", + "format": "Dict[str, list[str]]", + "optional file": "cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz", + "key": "RawText_NodeRawText" + } } }, "citation": "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning with graph embeddings},\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\nbooktitle={International conference on machine learning},\npages={40--48},\nyear={2016},\norganization={PMLR}\n}", diff --git a/gli/__init__.py b/gli/__init__.py index d2cb5229..7bdc67a5 100644 --- a/gli/__init__.py +++ b/gli/__init__.py @@ -5,5 +5,6 @@ from . import graph from . import task from . import utils +from . import raw_text_utils from .dataloading import get_gli_graph, get_gli_task, \ get_gli_dataset, combine_graph_and_task diff --git a/gli/dataloading.py b/gli/dataloading.py index 63ec88bf..bc18e3c4 100644 --- a/gli/dataloading.py +++ b/gli/dataloading.py @@ -50,6 +50,7 @@ def get_gli_dataset(dataset: str, task: str, task_id: int = 1, device: str = "cpu", + load_raw_text: bool = False, verbose: bool = False) -> DGLDataset: """Get a graph dataset given dataset name and task config. @@ -61,6 +62,8 @@ def get_gli_dataset(dataset: str, :type task_id: int, optional. :param device: device name, defaults to "cpu". :type device: str, optional + :param load_raw_text: whether to load raw text data, defaults to False. + :type load_raw_text: bool, optional :param verbose: verbose level, defaults to False. :type verbose: bool, optional @@ -86,13 +89,15 @@ def get_gli_dataset(dataset: str, >>> d.name 'CORA dataset. 
NodeClassification' """ - g = get_gli_graph(dataset, device=device, verbose=verbose) + g = get_gli_graph(dataset, device=device, load_raw_text=load_raw_text, + verbose=verbose) t = get_gli_task(dataset, task, task_id=task_id, verbose=verbose) return combine_graph_and_task(g, t) def get_gli_graph(dataset: str, device: str = "cpu", + load_raw_text: bool = False, verbose: bool = False) -> Union[DGLGraph, List[DGLGraph]]: """Get one (or a list of) :class:`dgl.DGLGraph` object(s) from GLI repo. @@ -105,6 +110,8 @@ def get_gli_graph(dataset: str, :type dataset: str :param device: device name, defaults to "cpu". :type device: str, optional + :param load_raw_text: whether to load raw text data, defaults to False. + :type load_raw_text: bool, optional :param verbose: verbose level, defaults to False. :type verbose: bool, optional @@ -136,9 +143,10 @@ def get_gli_graph(dataset: str, raise FileNotFoundError(f"{data_dir} not found.") if not os.path.exists(metadata_path): raise FileNotFoundError(f"{metadata_path} not found.") - download_data(dataset, verbose=verbose) + download_data(dataset, load_raw_text=load_raw_text, verbose=verbose) - return read_gli_graph(metadata_path, device=device, verbose=verbose) + return read_gli_graph(metadata_path, device=device, + load_raw_text=load_raw_text, verbose=verbose) def get_gli_task(dataset: str, diff --git a/gli/graph.py b/gli/graph.py index 1b0cb8d2..6f3c23e4 100644 --- a/gli/graph.py +++ b/gli/graph.py @@ -16,7 +16,8 @@ from .utils import sparse_to_torch, load_data -def read_gli_graph(metadata_path: os.PathLike, device="cpu", verbose=True): +def read_gli_graph(metadata_path: os.PathLike, device="cpu", + load_raw_text=False, verbose=True): """Read a local `metadata.json` file and return a (or a list of) graph(s). :func:`gli.graph.read_gli_graph` reads a graph or a list of graphs @@ -64,10 +65,12 @@ def read_gli_graph(metadata_path: os.PathLike, device="cpu", verbose=True): "data"], f"attribute `{neg}` not in metadata.json" data = copy(metadata["data"]) - data = _dfs_read_file(pwd, data, device="cpu") + data = _dfs_read_file(pwd, data, device="cpu", + load_raw_text=load_raw_text) if _is_single_graph(data): - return _get_single_graph(data, device, hetero=hetero, name=name) + return _get_single_graph(data, device, hetero=hetero, + name=name, load_raw_text=load_raw_text) else: return _get_multi_graph(data, device, name=name) @@ -102,12 +105,13 @@ def _to_tensor(x, device="cpu"): return x -def _get_single_graph(data, device="cpu", hetero=False, name=None): +def _get_single_graph(data, device="cpu", hetero=False, + name=None, load_raw_text=False): """Initialize and return a single Graph instance given data.""" if hetero: g = _get_heterograph(data) else: - g = _get_homograph(data) + g = _get_homograph(data, load_raw_text=load_raw_text) setattr(g, "name", name) return g.to(device=device) @@ -154,7 +158,7 @@ def _get_multi_graph(data, device="cpu", name=None): return graphs -def _get_homograph(data): +def _get_homograph(data, load_raw_text=False): """Get a homogeneous graph from data.""" edges = data["Edge"].pop("_Edge") # (num_edges, 2) src_nodes, dst_nodes = edges.T[0], edges.T[1] @@ -171,6 +175,11 @@ def _get_homograph(data): for attr, array in data["Edge"].items(): g.edata[attr] = _to_tensor(array) + + if load_raw_text: + for attr, raw_text_dict in data["RawText"].items(): + setattr(g, attr, raw_text_dict) + return g @@ -245,20 +254,25 @@ def _dict_depth(d): return 0 -def _dfs_read_file(pwd, d, device="cpu"): +def _dfs_read_file(pwd, d, device="cpu", 
load_raw_text=False): """Read file efficiently.""" - return _dfs_read_file_helper(pwd, d, device) + return _dfs_read_file_helper(pwd, d, device, load_raw_text) -def _dfs_read_file_helper(pwd, d, device="cpu"): +def _dfs_read_file_helper(pwd, d, device="cpu", load_raw_text=False): """Read file recursively (helper of `_dfs_read_file`).""" - if "file" in d: - path = os.path.join(pwd, d["file"]) - return load_data(path, d.get("key"), device) + if "file" in d or "optional file" in d: + if "file" in d: + path = os.path.join(pwd, d["file"]) + else: + path = os.path.join(pwd, d["optional file"]) + return load_data(path, d.get("key"), device, load_raw_text) empty_keys = [] for k in d: - entry = _dfs_read_file_helper(pwd, d[k], device=device) + entry = _dfs_read_file_helper(pwd, d[k], + load_raw_text=load_raw_text, + device=device) if entry is None: empty_keys.append(k) else: diff --git a/gli/io/graph.py b/gli/io/graph.py index bece9da9..5ab40054 100644 --- a/gli/io/graph.py +++ b/gli/io/graph.py @@ -55,8 +55,12 @@ def __init__(self, else: raise TypeError("The input data must be a scipy sparse array " "or numpy array.") - - self.num_data = len(data) if self.format == "Tensor" else data.shape[0] + if self.format == "Tensor": + self.num_data = len(data) + elif self.format == "Dict[str, list[str]]": + self.num_data = None + else: + self.num_data = data.shape[0] def get_metadata_dict(self): """Return the metadata dictionary of the attribute.""" @@ -102,6 +106,7 @@ def save_graph( graph_node_list: Optional[spmatrix] = None, graph_edge_list: Optional[spmatrix] = None, graph_attrs: Optional[List[Attribute]] = None, + raw_text_attrs: Optional[List[Attribute]] = None, is_hetero: bool = False, description: str = "", cite: str = "", @@ -117,7 +122,7 @@ def save_graph( if not is_hetero: return save_homograph(name, edge, num_nodes, node_attrs, edge_attrs, graph_node_list, graph_edge_list, graph_attrs, - description, cite, save_dir) + raw_text_attrs, description, cite, save_dir) # verify the inputs are dict for heterograph if not isinstance(edge, dict): raise TypeError("The input edge must be a dictionary for heterograph.") @@ -144,6 +149,7 @@ def save_homograph( graph_node_list: Optional[spmatrix] = None, graph_edge_list: Optional[spmatrix] = None, graph_attrs: Optional[List[Attribute]] = None, + raw_text_attrs: Optional[List[Attribute]] = None, description: str = "", citation: str = "", save_dir: str = ".", @@ -178,6 +184,8 @@ def save_homograph( :type graph_edge_list: (sparse) array, optional :param graph_attrs: A list of attributes of the graphs, defaults to None. :type graph_attrs: list of Attribute, optional + :param raw_text: A list of raw text data, defaults to None.8 + :type raw_text: dict, optional :param description: The description of the dataset, defaults to "". :type description: str, optional :param citation: The citation of the dataset, defaults to "". @@ -190,6 +198,7 @@ def save_homograph( :rtype: dict Example + TODO: update this code example for raw_text ------- .. code-block:: python @@ -270,6 +279,8 @@ def save_homograph( edge_attrs = [] if graph_attrs is None: graph_attrs = [] + if raw_text_attrs is None: + raw_text_attrs = [] # Check the length of node/edge/graph attrs. _verify_attrs(node_attrs, "node") @@ -316,10 +327,15 @@ def save_homograph( assert g.name not in ("NodeList", "EdgeList"), \ "The name of a graph attribute cannot be 'NodeList' or 'EdgeList'." 
data[f"Graph_{g.name}"] = g.data + if raw_text_attrs is not None: + for r in raw_text_attrs: + data[f"RawText_{r.name}"] = r.data # Call save_data(). key_to_loc = save_data(f"{name}__graph", save_dir=save_dir, **data) + print(f"{key_to_loc = }") + # Create the metadata dict. metadata = {"description": description, "data": {}} @@ -343,6 +359,13 @@ def save_homograph( graph_dict[g.name] = _attr_to_metadata_dict(key_to_loc, "Graph", g) metadata["data"]["Graph"] = graph_dict + # Add the metadata of the raw text + raw_text_dict = {} + for r in raw_text_attrs: + print(f"{r.name = }") + raw_text_dict[r.name] = _attr_to_metadata_dict(key_to_loc, "RawText", r) + metadata["data"]["RawText"] = raw_text_dict + metadata["citation"] = citation metadata["is_heterogeneous"] = False diff --git a/gli/raw_text_utils.py b/gli/raw_text_utils.py new file mode 100644 index 00000000..db1f1023 --- /dev/null +++ b/gli/raw_text_utils.py @@ -0,0 +1,600 @@ +"""The ``gli.raw_text_utils`` module provides functions to process raw text.""" +import json +import numpy as np +import os +import pandas as pd +import random +import sys +import torch +from datasets import load_dataset +from ogb.nodeproppred import PygNodePropPredDataset +from sklearn.preprocessing import normalize +import torch_geometric.transforms as T +from torch_geometric.data import Data +from torch_geometric.datasets import Planetoid + +sys.path.append("../") + + + +DATASET_W_RAW_TEXT = ["cora", "pubmed", "ogbn-arxiv", + "arxiv-2023", "ogbn-products"] + + +def load_data(dataset, use_text=False, seed=0): + """ + Load data based on the dataset name. + + Parameters: + dataset (str): Name of the dataset to be loaded. + Options are "cora", "pubmed", "arxiv", "arxiv_2023", and "product". + use_text (bool, optional): Whether to use text data. Default is False. + seed (int, optional): Random seed for data loading. Default is 0. + + Returns: + Tuple: Loaded data and text information. + + Raises: + ValueError: If the dataset name is not recognized. 
+ """ + + if dataset == "cora": + data, text = get_raw_text_cora(use_text, seed) + elif dataset == "pubmed": + data, text = get_raw_text_pubmed(use_text, seed) + elif dataset == "arxiv": + data, text = get_raw_text_arxiv(use_text) + elif dataset == "arxiv_2023": + data, text = get_raw_text_arxiv_2023(use_text) + elif dataset == "product": + data, text = get_raw_text_products(use_text) + else: + raise ValueError("Dataset must be one of: cora, pubmed, arxiv") + return data, text + + +################# Ogbn-arxiv ################# + + +def get_raw_text_arxiv(use_text=False): + """ + Reference: https://github.com/XiaoxinHe/TAPE/blob/ + main/core/data_utils/load_arxiv.py + """ + + dataset = PygNodePropPredDataset(name="ogbn-arxiv") + data = dataset[0] + + idx_splits = dataset.get_idx_split() + train_mask = torch.zeros(data.num_nodes).bool() + val_mask = torch.zeros(data.num_nodes).bool() + test_mask = torch.zeros(data.num_nodes).bool() + train_mask[idx_splits["train"]] = True + val_mask[idx_splits["valid"]] = True + test_mask[idx_splits["test"]] = True + data.train_mask = train_mask + data.val_mask = val_mask + data.test_mask = test_mask + + # data.edge_index = data.adj_t.to_symmetric() + if not use_text: + return data, None + + nodeidx2paperid_path = "dataset/ogbn_arxiv/mapping/nodeidx2paperid.csv.gz" + nodeidx2paperid = pd.read_csv(nodeidx2paperid_path, compression="gzip") + + raw_text = pd.read_csv("dataset/ogbn_arxiv/titleabs.tsv", sep="\t") + raw_text.columns = ["paper id", "title", "abs"] + + df = pd.merge(nodeidx2paperid, raw_text, on="paper id") + + text = {"title": [], "abs": [], "label": []} + + for ti, ab in zip(df["title"], df["abs"]): + text["title"].append(ti) + text["abs"].append(ab) + + # Load the label index to arXiv category mapping data + label_mapping_path = "dataset/ogbn_arxiv/mapping/"\ + "labelidx2arxivcategeory.csv.gz" + label_mapping_data = pd.read_csv(label_mapping_path) + label_mapping_data.columns = ["label_idx", "arxiv_category"] + + for i in range(len(data.y)): + row = label_mapping_data.loc[ + label_mapping_data["label_idx"].isin(data.y[i].numpy())] + # If the row doesn"t exist, return a message indicating this + if len(row) == 0: + raise ValueError("No matching arXiv category found for this label.") + + # Parse the arXiv category string to be in the desired format "cs.XX" + arxiv_category = "cs." + row["arxiv_category"]\ + .values[0].split()[-1].upper() + text["label"].append(arxiv_category) + + return data, text + + +def generate_arxiv_keys_list(): + label_mapping_path = "dataset/ogbn_arxiv/mapping/"\ + "labelidx2arxivcategeory.csv.gz" + label_mapping_data = pd.read_csv(label_mapping_path, compression="gzip") + label_mapping_data.columns = ["label_idx", "arxiv_category"] + arxiv_categories = label_mapping_data["arxiv_category"].unique() + return ["cs." 
+ category.split()[-1].upper() + for category in arxiv_categories] + + + +################# Arxiv-2023 ################# + +def get_raw_text_arxiv_2023(use_text=True, base_path="dataset/arxiv_2023"): + # Load processed data + edge_index = torch.load(os.path.join(base_path, + "processed", "edge_index.pt")) + # Load raw data + titles_df = pd.read_csv(os.path.join(base_path, + "raw", "titles.csv.gz"), compression="gzip") + abstracts_df = pd.read_csv(os.path.join(base_path, + "raw", "abstracts.csv.gz"), compression="gzip") + ids_df = pd.read_csv(os.path.join(base_path, "raw", "ids.csv.gz"), + compression="gzip") + labels_df = pd.read_csv(os.path.join(base_path, "raw", "labels.csv.gz"), + compression="gzip") + + # Load split data + train_id_df = pd.read_csv(os.path.join(base_path, "split", "train.csv.gz"), + compression="gzip") + val_id_df = pd.read_csv(os.path.join(base_path, "split", "valid.csv.gz"), + compression="gzip") + test_id_df = pd.read_csv(os.path.join(base_path, "split", "test.csv.gz"), + compression="gzip") + + num_nodes = len(ids_df) + titles = titles_df["titles"].tolist() + abstracts = abstracts_df["abstracts"].tolist() + ids = ids_df["ids"].tolist() + labels = labels_df["labels"].tolist() + train_id = train_id_df["train_id"].tolist() + val_id = val_id_df["val_id"].tolist() + test_id = test_id_df["test_id"].tolist() + + features = torch.load(os.path.join(base_path, "processed", "features.pt")) + + y = torch.load(os.path.join(base_path, "processed", "labels.pt")) + + train_mask = torch.tensor([x in train_id for x in range(num_nodes)]) + val_mask = torch.tensor([x in val_id for x in range(num_nodes)]) + test_mask = torch.tensor([x in test_id for x in range(num_nodes)]) + + data = Data( + x=features, + y=y, + paper_id=ids, + edge_index=edge_index, + train_mask=train_mask, + val_mask=val_mask, + test_mask=test_mask, + num_nodes=num_nodes, + ) + + data.train_id = train_id + data.val_id = val_id + data.test_id = test_id + + if not use_text: + return data, None + + text = {"title": titles, "abs": abstracts, "label": labels, "id": ids} + + return data, text + +################# Cora ################# + +cora_mapping = { + 0: "Case Based", + 1: "Genetic Algorithms", + 2: "Neural Networks", + 3: "Probabilistic Methods", + 4: "Reinforcement Learning", + 5: "Rule Learning", + 6: "Theory" +} + +def get_cora_casestudy(seed=0): + """ + Reference: https://github.com/XiaoxinHe/TAPE/blob/main/ + core/data_utils/load_cora.py + """ + data_x, data_y, data_citeid, data_edges = parse_cora() + # data_x = sklearn.preprocessing.normalize(data_x, norm="l1") + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + np.random.seed(seed) # Numpy module. + random.seed(seed) # Python random module. 
+ + # load data + data_name = "cora" + # path = osp.join(osp.dirname(osp.realpath(__file__)), "dataset") + dataset = Planetoid("dataset", data_name, + transform=T.NormalizeFeatures()) + data = dataset[0] + + data.x = torch.tensor(data_x).float() + data.edge_index = torch.tensor(data_edges).long() + data.y = torch.tensor(data_y).long() + data.num_nodes = len(data_y) + + # split data + node_id = np.arange(data.num_nodes) + np.random.shuffle(node_id) + + data.train_id = np.sort(node_id[:int(data.num_nodes * 0.1)]) + data.val_id = np.sort( + node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)]) + data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):]) + + data.train_mask = torch.tensor( + [x in data.train_id for x in range(data.num_nodes)]) + data.val_mask = torch.tensor( + [x in data.val_id for x in range(data.num_nodes)]) + data.test_mask = torch.tensor( + [x in data.test_id for x in range(data.num_nodes)]) + + return data, data_citeid + +# credit: https://github.com/tkipf/pygcn/issues/27, xuhaiyun +def parse_cora(): + path = "cora_raw/cora" + idx_features_labels = np.genfromtxt( + f"{path}.content", dtype=np.dtype(str)) + data_x = idx_features_labels[:, 1:-1].astype(np.float32) + labels = idx_features_labels[:, -1] + class_map = {x: i for i, x in enumerate(["Case_Based", + "Genetic_Algorithms", + "Neural_Networks", + "Probabilistic_Methods", + "Reinforcement_Learning", + "Rule_Learning", + "Theory"])} + data_y = np.array([class_map[l] for l in labels]) + data_citeid = idx_features_labels[:, 0] + idx = np.array(data_citeid, dtype=np.dtype(str)) + idx_map = {j: i for i, j in enumerate(idx)} + edges_unordered = np.genfromtxt( + f"{path}.cites", dtype=np.dtype(str)) + edges = np.array(list(map(idx_map.get, edges_unordered.flatten()))).reshape( + edges_unordered.shape) + data_edges = np.array(edges[~(edges is None).max(1)], dtype="int") + data_edges = np.vstack((data_edges, np.fliplr(data_edges))) + return data_x, data_y, data_citeid, \ + np.unique(data_edges, axis=0).transpose() + +def get_raw_text_cora(use_text=False, seed=0): + data, data_citeid = get_cora_casestudy(seed) + if not use_text: + return data, None + + with open("cora_raw/mccallum/cora/papers", encoding="UTF-8")as f: + lines = f.readlines() + pid_filename = {} + for line in lines: + pid = line.split("\t")[0] + fn = line.split("\t")[1].replace(":", "_") + pid_filename[pid] = fn + + path = "cora_raw/mccallum/cora/extractions/" + + text = {"title": [], "abs": [], "label": []} + + # Assuming path is given + all_files = {f.lower(): f for f in os.listdir(path)} + + for pid in data_citeid: + expected_fn = pid_filename[pid].lower() + # fn = pid_filename[pid] + if expected_fn in all_files: + real_fn = all_files[expected_fn] + with open(path+real_fn, encoding="UTF-8") as f: + lines = f.read().splitlines() + + if expected_fn in all_files: + real_fn = all_files[expected_fn] + + for line in lines: + if "Title:" in line: + ti = line + if "Abstract:" in line: + ab = line + text["title"].append(ti) + text["abs"].append(ab) + + for i in range(len(data.y)): + text["label"].append(cora_mapping[data.y[i].item()]) + + return data, text + + +################# Ogbn-product ################# + + + +def get_raw_dataset(raw_train="dataset/ogbn_products/Amazon-3M.raw/trn.json.gz", + raw_test="dataset/ogbn_products/"\ + "Amazon-3M.raw/tst.json.gz", + label2cat="dataset/ogbn_products/mapping/"\ + "labelidx2productcategory.csv.gz", + idx2asin="dataset/ogbn_products/mapping/"\ + "nodeidx2asin.csv.gz"): + """ + mapping references: + 
https://github.com/CurryTang/Graph-LLM/blob/master/utils.py + """ + + train_part = load_dataset("json", data_files=raw_train) + test_part = load_dataset("json", data_files=raw_test) + train_df = train_part["train"].to_pandas() + test_df = test_part["train"].to_pandas() + combine_df = pd.concat([train_df, test_df], ignore_index=True) + + label2cat_df = pd.read_csv(label2cat, compression="gzip") + idx2asin_df = pd.read_csv(idx2asin, compression="gzip") + + idx_mapping = {row[0]: row[1] for row in idx2asin_df.values} + label_mapping = {row["label idx"]: row["product category"] + for _, row in label2cat_df.iterrows()} + content_mapping = {row[0]: (row[1], row[2]) for row in combine_df.values} + + return idx_mapping, content_mapping, label_mapping + +def get_raw_text_products(use_text=False): + dataset = PygNodePropPredDataset(name="ogbn-products") + data = dataset[0] + + idx_splits = dataset.get_idx_split() + train_mask = torch.zeros(data.num_nodes).bool() + val_mask = torch.zeros(data.num_nodes).bool() + test_mask = torch.zeros(data.num_nodes).bool() + train_mask[idx_splits["train"]] = True + val_mask[idx_splits["valid"]] = True + test_mask[idx_splits["test"]] = True + data.train_mask = train_mask + data.val_mask = val_mask + data.test_mask = test_mask + + if not use_text: + return data, None + + idx_mapping, content_mapping, label_mapping = get_raw_dataset() + + text = {"title": [], "content": [], "label": []} + + for i in range(len(data.y)): + uid = idx_mapping.get(i, None) + if uid: + title, content = content_mapping.get(uid, (None, None)) + label = label_mapping.get(data.y[i].item(), None) + + text["title"].append(title) + text["content"].append(content) + + mapped_label = products_mapping.get(label, None) + if mapped_label is None: + text["label"].append("label 25") + else: + text["label"].append(mapped_label) + + return data, text + + +products_mapping = {"Home & Kitchen": "Home & Kitchen", + "Health & Personal Care": "Health & Personal Care", + "Beauty": "Beauty", + "Sports & Outdoors": "Sports & Outdoors", + "Books": "Books", + "Patio, Lawn & Garden": "Patio, Lawn & Garden", + "Toys & Games": "Toys & Games", + "CDs & Vinyl": "CDs & Vinyl", + "Cell Phones & Accessories": "Cell Phones & Accessories", + "Grocery & Gourmet Food": "Grocery & Gourmet Food", + "Arts, Crafts & Sewing": "Arts, Crafts & Sewing", + "Clothing, Shoes & Jewelry": "Clothing, Shoes & Jewelry", + "Electronics": "Electronics", + "Movies & TV": "Movies & TV", + "Software": "Software", + "Video Games": "Video Games", + "Automotive": "Automotive", + "Pet Supplies": "Pet Supplies", + "Office Products": "Office Products", + "Industrial & Scientific": "Industrial & Scientific", + "Musical Instruments": "Musical Instruments", + "Tools & Home Improvement": "Tools & Home Improvement", + "Magazine Subscriptions": "Magazine Subscriptions", + "Baby Products": "Baby Products", + "label 25": "label 25", + "Appliances": "Appliances", + "Kitchen & Dining": "Kitchen & Dining", + "Collectibles & Fine Art": "Collectibles & Fine Art", + "All Beauty": "All Beauty", + "Luxury Beauty": "Luxury Beauty", + "Amazon Fashion": "Amazon Fashion", + "Computers": "Computers", + "All Electronics": "All Electronics", + "Purchase Circles": "Purchase Circles", + "MP3 Players & Accessories": "MP3 Players & Accessories", + "Gift Cards": "Gift Cards", + "Office & School Supplies": "Office & School Supplies", + "Home Improvement": "Home Improvement", + "Camera & Photo": "Camera & Photo", + "GPS & Navigation": "GPS & Navigation", + "Digital Music": "Digital 
Music", + "Car Electronics": "Car Electronics", + "Baby": "Baby", + "Kindle Store": "Kindle Store", + "Buy a Kindle": "Buy a Kindle", + "Furniture & Décor": "Furniture & Decor", + "#508510": "#508510"} + +products_keys_list = list(products_mapping.keys()) + + + +################# Pubmed ################# + +""" +Reference: https://github.com/XiaoxinHe/TAPE/blob/main/core/ +data_utils/load_pubmed.py +""" + +pubmed_mapping = { + 0: "Experimentally induced diabetes", + 1: "Type 1 diabetes", + 2: "Type 2 diabetes", +} + +def get_pubmed_casestudy(corrected=False, seed=0): + _, data_x, data_y, data_pubid, data_edges = parse_pubmed() + data_x = normalize(data_x, norm="l1") + + torch.manual_seed(seed) + if torch.cuda.is_available(): + torch.cuda.manual_seed(seed) + np.random.seed(seed) + random.seed(seed) + + # load data + data_name = "PubMed" + # path = osp.join(osp.dirname(osp.realpath(__file__)), "dataset") + dataset = Planetoid("dataset", data_name, transform=T.NormalizeFeatures()) + data = dataset[0] + + # replace dataset matrices with the PubMed-Diabetes data, + # for which we have the original pubmed IDs + data.x = torch.tensor(data_x) + data.edge_index = torch.tensor(data_edges) + data.y = torch.tensor(data_y) + + # split data + node_id = np.arange(data.num_nodes) + np.random.shuffle(node_id) + + data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)]) + data.val_id = np.sort( + node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)]) + data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):]) + + if corrected: + is_mistake = np.loadtxt( + "pubmed_casestudy/pubmed_mistake.txt", dtype="bool") + data.train_id = [i for i in data.train_id if not is_mistake[i]] + data.val_id = [i for i in data.val_id if not is_mistake[i]] + data.test_id = [i for i in data.test_id if not is_mistake[i]] + + data.train_mask = torch.tensor( + [x in data.train_id for x in range(data.num_nodes)]) + data.val_mask = torch.tensor( + [x in data.val_id for x in range(data.num_nodes)]) + data.test_mask = torch.tensor( + [x in data.test_id for x in range(data.num_nodes)]) + + return data, data_pubid + + +def parse_pubmed(): + path = "dataset/PubMed/data/" + + n_nodes = 19717 + n_features = 500 + + data_x = np.zeros((n_nodes, n_features), dtype="float32") + data_y = [None] * n_nodes + data_pubid = [None] * n_nodes + data_edges = [] + + paper_to_index = {} + feature_to_index = {} + + # parse nodes + with open(path + "Pubmed-Diabetes.NODE.paper.tab", "r", encoding="UTF-8")\ + as node_file: + # first two lines are headers + node_file.readline() + node_file.readline() + + k = 0 + + for i, line in enumerate(node_file.readlines()): + items = line.strip().split("\t") + + paper_id = items[0] + data_pubid[i] = paper_id + paper_to_index[paper_id] = i + + # label=[1,2,3] + label = int(items[1].split("=")[-1]) - \ + 1 # subtract 1 to zero-count + data_y[i] = label + + # f1=val1 \t f2=val2 \t ... \t fn=valn summary=... 
+ features = items[2:-1] + for feature in features: + parts = feature.split("=") + fname = parts[0] + fvalue = float(parts[1]) + + if fname not in feature_to_index: + feature_to_index[fname] = k + k += 1 + + data_x[i, feature_to_index[fname]] = fvalue + + # parse graph + data_a = np.zeros((n_nodes, n_nodes), dtype="float32") + + with open(path + "Pubmed-Diabetes.DIRECTED.cites.tab", + "r", encoding="UTF-8") as edge_file: + # first two lines are headers + edge_file.readline() + edge_file.readline() + + for i, line in enumerate(edge_file.readlines()): + + # edge_id \t paper:tail \t | \t paper:head + items = line.strip().split("\t") + + tail = items[1].split(":")[-1] + head = items[3].split(":")[-1] + + data_a[paper_to_index[tail], paper_to_index[head]] = 1.0 + data_a[paper_to_index[head], paper_to_index[tail]] = 1.0 + if head != tail: + data_edges.append( + (paper_to_index[head], paper_to_index[tail])) + data_edges.append( + (paper_to_index[tail], paper_to_index[head])) + + return data_a, data_x, data_y, data_pubid,\ + np.unique(data_edges, axis=0).transpose() + + +def get_raw_text_pubmed(use_text=False, seed=0): + data, _ = get_pubmed_casestudy(seed=seed) + if not use_text: + return data, None + with open("dataset/PubMed/pubmed.json", encoding="UTF-8") as f: + pubmed = json.load(f) + df_pubmed = pd.DataFrame.from_dict(pubmed) + + ab = df_pubmed["AB"].fillna("") + ti = df_pubmed["TI"].fillna("") + text = {"title": [], "abs": [], "label": []} + for ti, ab in zip(ti, ab): + text["title"].append(ti) + text["abs"].append(ab) + + for i in range(len(data.y)): + text["label"].append(pubmed_mapping[data.y[i].item()]) + + return data, text diff --git a/gli/utils.py b/gli/utils.py index d7c489e7..7231c43f 100644 --- a/gli/utils.py +++ b/gli/utils.py @@ -182,7 +182,7 @@ def download_file_from_google_drive(g_url: str, print(f"Successfully downloaded {filename} to {root} from {g_url}.") -def load_data(path, key=None, device="cpu"): +def load_data(path, key=None, device="cpu", load_raw_text=False): """Load data from npy or npz file, return sparse array or torch tensor. Parameters @@ -212,6 +212,15 @@ def load_data(path, key=None, device="cpu"): assert key is None, "Sparse format cannot contain key." return sp.load_npz(path) + if path.endswith(".optional.npz"): + # For raw text, which is saved as a dict + if load_raw_text: + a = np.load(path, allow_pickle=True) + d = {k: a[k].item() for k in a.files} + return d + else: + return None + # Dense arrays file with a key raw = np.load(path, allow_pickle=False) assert key is not None @@ -261,7 +270,7 @@ def sparse_to_torch(sparse_array: sp.spmatrix, raise TypeError(f"Unsupported sparse type {sparse_type}") -def _find_data_files_from_json_files(data_dir): +def _find_data_files_from_json_files(data_dir, load_raw_text): """Traverse json files under dataset path and find dependent data files.""" json_files = [] for file in os.listdir(data_dir): @@ -273,7 +282,7 @@ def _find_data_files_helper(data): data_files = [] if isinstance(data, dict): for key, value in data.items(): - if key == "file": + if key == "file" or (key == "optional file" and load_raw_text): data_files.append(value) else: data_files.extend(_find_data_files_helper(value)) @@ -310,11 +319,12 @@ def _get_url_from_server(data_file: str): return None -def download_data(dataset: str, verbose=False): +def download_data(dataset: str, load_raw_text=False, verbose=False): """Download dependent data of a configuration (metadata/task) file. Args: dataset (str): Name of dataset. 
+ load_raw_text (bool, optional): Defaults to False. verbose (bool, optional): Defaults to False. """ data_dir = os.path.join(get_local_data_dir(), dataset) @@ -322,7 +332,7 @@ def download_data(dataset: str, verbose=False): raise FileNotFoundError(f"cannot find dataset {dataset}.") # Get all required dependent data files from json files. - data_files = _find_data_files_from_json_files(data_dir) + data_files = _find_data_files_from_json_files(data_dir, load_raw_text) exist_all_files = True for data_file_name in data_files: data_file_path = os.path.join(data_dir, data_file_name) @@ -531,8 +541,11 @@ def save_data(prefix, save_dir=".", **kwargs): """ dense_arrays = {} sparse_arrays = {} + dict_array = {} for key, matrix in kwargs.items(): - if sp.issparse(matrix): + if isinstance(matrix, dict): + dict_array[key] = matrix + elif sp.issparse(matrix): sparse_arrays[key] = matrix elif isinstance(matrix, np.ndarray): dense_arrays[key] = matrix @@ -552,6 +565,21 @@ def _dir(filename): """Prepend save_dir to the file.""" return os.path.join(save_dir, filename) + # Save dict with raw text to "optional file" + if dict_array: + np.savez_compressed(_dir(f"{prefix}.optional.npz"), **dict_array) + with open(_dir(f"{prefix}.optional.npz"), "rb") as f: + md5 = hashlib.md5(f.read()).hexdigest() + os.rename(_dir(f"{prefix}.optional.npz"), + _dir(f"{prefix}__{md5}.optional.npz")) + key_to_loc.update({ + key: { + "optional file": f"{prefix}__{md5}.optional.npz", + "key": key + } + for key in dict_array + }) + # Save numpy arrays into a single file np.savez_compressed(_dir(f"{prefix}.npz"), **dense_arrays) with open(_dir(f"{prefix}.npz"), "rb") as f: From 9bb6c079f032619c9920b11399c7e380bc520297 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 15:28:06 -0500 Subject: [PATCH 02/15] fix style issue --- gli/io/graph.py | 3 +- gli/raw_text_utils.py | 188 +++++++++++++++++++++++------------------- pyproject.toml | 2 +- 3 files changed, 106 insertions(+), 87 deletions(-) diff --git a/gli/io/graph.py b/gli/io/graph.py index 5ab40054..04c37d82 100644 --- a/gli/io/graph.py +++ b/gli/io/graph.py @@ -363,7 +363,8 @@ def save_homograph( raw_text_dict = {} for r in raw_text_attrs: print(f"{r.name = }") - raw_text_dict[r.name] = _attr_to_metadata_dict(key_to_loc, "RawText", r) + raw_text_dict[r.name] = _attr_to_metadata_dict(key_to_loc, + "RawText", r) metadata["data"]["RawText"] = raw_text_dict metadata["citation"] = citation diff --git a/gli/raw_text_utils.py b/gli/raw_text_utils.py index db1f1023..011d787b 100644 --- a/gli/raw_text_utils.py +++ b/gli/raw_text_utils.py @@ -15,8 +15,6 @@ sys.path.append("../") - - DATASET_W_RAW_TEXT = ["cora", "pubmed", "ogbn-arxiv", "arxiv-2023", "ogbn-products"] @@ -26,7 +24,7 @@ def load_data(dataset, use_text=False, seed=0): Load data based on the dataset name. Parameters: - dataset (str): Name of the dataset to be loaded. + dataset (str): Name of the dataset to be loaded. Options are "cora", "pubmed", "arxiv", "arxiv_2023", and "product". use_text (bool, optional): Whether to use text data. Default is False. seed (int, optional): Random seed for data loading. Default is 0. @@ -37,7 +35,6 @@ def load_data(dataset, use_text=False, seed=0): Raises: ValueError: If the dataset name is not recognized. 
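
    Example (illustrative sketch; assumes the raw Cora text files are
    available locally under ``cora_raw/``):

    >>> data, text = load_data(dataset="cora", use_text=True)
    >>> text.keys()
    dict_keys(['title', 'abs', 'label'])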
""" - if dataset == "cora": data, text = get_raw_text_cora(use_text, seed) elif dataset == "pubmed": @@ -53,15 +50,16 @@ def load_data(dataset, use_text=False, seed=0): return data, text -################# Ogbn-arxiv ################# +# Ogbn-arxiv def get_raw_text_arxiv(use_text=False): """ + Get raw text data for the ogbn-arxiv dataset. + Reference: https://github.com/XiaoxinHe/TAPE/blob/ main/core/data_utils/load_arxiv.py """ - dataset = PygNodePropPredDataset(name="ogbn-arxiv") data = dataset[0] @@ -96,7 +94,7 @@ def get_raw_text_arxiv(use_text=False): # Load the label index to arXiv category mapping data label_mapping_path = "dataset/ogbn_arxiv/mapping/"\ - "labelidx2arxivcategeory.csv.gz" + "labelidx2arxivcategeory.csv.gz" label_mapping_data = pd.read_csv(label_mapping_path) label_mapping_data.columns = ["label_idx", "arxiv_category"] @@ -105,19 +103,20 @@ def get_raw_text_arxiv(use_text=False): label_mapping_data["label_idx"].isin(data.y[i].numpy())] # If the row doesn"t exist, return a message indicating this if len(row) == 0: - raise ValueError("No matching arXiv category found for this label.") + raise ValueError("No matching arXiv category for this label.") # Parse the arXiv category string to be in the desired format "cs.XX" arxiv_category = "cs." + row["arxiv_category"]\ - .values[0].split()[-1].upper() + .values[0].split()[-1].upper() text["label"].append(arxiv_category) return data, text def generate_arxiv_keys_list(): + """Return a list of arXiv categories.""" label_mapping_path = "dataset/ogbn_arxiv/mapping/"\ - "labelidx2arxivcategeory.csv.gz" + "labelidx2arxivcategeory.csv.gz" label_mapping_data = pd.read_csv(label_mapping_path, compression="gzip") label_mapping_data.columns = ["label_idx", "arxiv_category"] arxiv_categories = label_mapping_data["arxiv_category"].unique() @@ -125,10 +124,11 @@ def generate_arxiv_keys_list(): for category in arxiv_categories] +# Arxiv-2023 -################# Arxiv-2023 ################# def get_raw_text_arxiv_2023(use_text=True, base_path="dataset/arxiv_2023"): + """Return data and text for arxiv_2023 dataset.""" # Load processed data edge_index = torch.load(os.path.join(base_path, "processed", "edge_index.pt")) @@ -136,7 +136,7 @@ def get_raw_text_arxiv_2023(use_text=True, base_path="dataset/arxiv_2023"): titles_df = pd.read_csv(os.path.join(base_path, "raw", "titles.csv.gz"), compression="gzip") abstracts_df = pd.read_csv(os.path.join(base_path, - "raw", "abstracts.csv.gz"), compression="gzip") + "raw", "abstracts.csv.gz"), compression="gzip") ids_df = pd.read_csv(os.path.join(base_path, "raw", "ids.csv.gz"), compression="gzip") labels_df = pd.read_csv(os.path.join(base_path, "raw", "labels.csv.gz"), @@ -189,7 +189,8 @@ def get_raw_text_arxiv_2023(use_text=True, base_path="dataset/arxiv_2023"): return data, text -################# Cora ################# +# Cora + cora_mapping = { 0: "Case Based", @@ -201,12 +202,15 @@ def get_raw_text_arxiv_2023(use_text=True, base_path="dataset/arxiv_2023"): 6: "Theory" } + def get_cora_casestudy(seed=0): """ + Get raw text data for the cora dataset. 
+ Reference: https://github.com/XiaoxinHe/TAPE/blob/main/ core/data_utils/load_cora.py """ - data_x, data_y, data_citeid, data_edges = parse_cora() + (data_x, data_y, data_citeid, data_edges) = parse_cora() # data_x = sklearn.preprocessing.normalize(data_x, norm="l1") torch.manual_seed(seed) @@ -245,8 +249,10 @@ def get_cora_casestudy(seed=0): return data, data_citeid + # credit: https://github.com/tkipf/pygcn/issues/27, xuhaiyun def parse_cora(): + """Parse the cora dataset.""" path = "cora_raw/cora" idx_features_labels = np.genfromtxt( f"{path}.content", dtype=np.dtype(str)) @@ -259,20 +265,24 @@ def parse_cora(): "Reinforcement_Learning", "Rule_Learning", "Theory"])} - data_y = np.array([class_map[l] for l in labels]) + data_y = np.array([class_map[lb] for lb in labels]) data_citeid = idx_features_labels[:, 0] idx = np.array(data_citeid, dtype=np.dtype(str)) idx_map = {j: i for i, j in enumerate(idx)} edges_unordered = np.genfromtxt( f"{path}.cites", dtype=np.dtype(str)) - edges = np.array(list(map(idx_map.get, edges_unordered.flatten()))).reshape( - edges_unordered.shape) + edges = np.array(list(map(idx_map.get, edges_unordered.flatten())))\ + .reshape(edges_unordered.shape) data_edges = np.array(edges[~(edges is None).max(1)], dtype="int") data_edges = np.vstack((data_edges, np.fliplr(data_edges))) - return data_x, data_y, data_citeid, \ - np.unique(data_edges, axis=0).transpose() + return (data_x, + data_y, + data_citeid, + np.unique(data_edges, axis=0).transpose()) + def get_raw_text_cora(use_text=False, seed=0): + """Return data and text for cora dataset.""" data, data_citeid = get_cora_casestudy(seed) if not use_text: return data, None @@ -317,22 +327,22 @@ def get_raw_text_cora(use_text=False, seed=0): return data, text -################# Ogbn-product ################# - +# Ogbn-product - -def get_raw_dataset(raw_train="dataset/ogbn_products/Amazon-3M.raw/trn.json.gz", - raw_test="dataset/ogbn_products/"\ +def get_raw_dataset(raw_train="dataset/ogbn_products/Amazon-3M.raw/" + "trn.json.gz", + raw_test="dataset/ogbn_products/" "Amazon-3M.raw/tst.json.gz", - label2cat="dataset/ogbn_products/mapping/"\ + label2cat="dataset/ogbn_products/mapping/" "labelidx2productcategory.csv.gz", - idx2asin="dataset/ogbn_products/mapping/"\ + idx2asin="dataset/ogbn_products/mapping/" "nodeidx2asin.csv.gz"): """ + Get raw dataset for the ogbn-products dataset. 
+ mapping references: https://github.com/CurryTang/Graph-LLM/blob/master/utils.py """ - train_part = load_dataset("json", data_files=raw_train) test_part = load_dataset("json", data_files=raw_test) train_df = train_part["train"].to_pandas() @@ -349,7 +359,9 @@ def get_raw_dataset(raw_train="dataset/ogbn_products/Amazon-3M.raw/trn.json.gz", return idx_mapping, content_mapping, label_mapping + def get_raw_text_products(use_text=False): + """Return data and text for the ogbn-products dataset.""" dataset = PygNodePropPredDataset(name="ogbn-products") data = dataset[0] @@ -390,63 +402,57 @@ def get_raw_text_products(use_text=False): products_mapping = {"Home & Kitchen": "Home & Kitchen", - "Health & Personal Care": "Health & Personal Care", - "Beauty": "Beauty", - "Sports & Outdoors": "Sports & Outdoors", - "Books": "Books", - "Patio, Lawn & Garden": "Patio, Lawn & Garden", - "Toys & Games": "Toys & Games", - "CDs & Vinyl": "CDs & Vinyl", - "Cell Phones & Accessories": "Cell Phones & Accessories", - "Grocery & Gourmet Food": "Grocery & Gourmet Food", - "Arts, Crafts & Sewing": "Arts, Crafts & Sewing", - "Clothing, Shoes & Jewelry": "Clothing, Shoes & Jewelry", - "Electronics": "Electronics", - "Movies & TV": "Movies & TV", - "Software": "Software", - "Video Games": "Video Games", - "Automotive": "Automotive", - "Pet Supplies": "Pet Supplies", - "Office Products": "Office Products", - "Industrial & Scientific": "Industrial & Scientific", - "Musical Instruments": "Musical Instruments", - "Tools & Home Improvement": "Tools & Home Improvement", - "Magazine Subscriptions": "Magazine Subscriptions", - "Baby Products": "Baby Products", - "label 25": "label 25", - "Appliances": "Appliances", - "Kitchen & Dining": "Kitchen & Dining", - "Collectibles & Fine Art": "Collectibles & Fine Art", - "All Beauty": "All Beauty", - "Luxury Beauty": "Luxury Beauty", - "Amazon Fashion": "Amazon Fashion", - "Computers": "Computers", - "All Electronics": "All Electronics", - "Purchase Circles": "Purchase Circles", - "MP3 Players & Accessories": "MP3 Players & Accessories", - "Gift Cards": "Gift Cards", - "Office & School Supplies": "Office & School Supplies", - "Home Improvement": "Home Improvement", - "Camera & Photo": "Camera & Photo", - "GPS & Navigation": "GPS & Navigation", - "Digital Music": "Digital Music", - "Car Electronics": "Car Electronics", - "Baby": "Baby", - "Kindle Store": "Kindle Store", - "Buy a Kindle": "Buy a Kindle", - "Furniture & Décor": "Furniture & Decor", - "#508510": "#508510"} + "Health & Personal Care": "Health & Personal Care", + "Beauty": "Beauty", + "Sports & Outdoors": "Sports & Outdoors", + "Books": "Books", + "Patio, Lawn & Garden": "Patio, Lawn & Garden", + "Toys & Games": "Toys & Games", + "CDs & Vinyl": "CDs & Vinyl", + "Cell Phones & Accessories": "Cell Phones & Accessories", + "Grocery & Gourmet Food": "Grocery & Gourmet Food", + "Arts, Crafts & Sewing": "Arts, Crafts & Sewing", + "Clothing, Shoes & Jewelry": "Clothing, Shoes & Jewelry", + "Electronics": "Electronics", + "Movies & TV": "Movies & TV", + "Software": "Software", + "Video Games": "Video Games", + "Automotive": "Automotive", + "Pet Supplies": "Pet Supplies", + "Office Products": "Office Products", + "Industrial & Scientific": "Industrial & Scientific", + "Musical Instruments": "Musical Instruments", + "Tools & Home Improvement": "Tools & Home Improvement", + "Magazine Subscriptions": "Magazine Subscriptions", + "Baby Products": "Baby Products", + "label 25": "label 25", + "Appliances": "Appliances", + "Kitchen & 
Dining": "Kitchen & Dining", + "Collectibles & Fine Art": "Collectibles & Fine Art", + "All Beauty": "All Beauty", + "Luxury Beauty": "Luxury Beauty", + "Amazon Fashion": "Amazon Fashion", + "Computers": "Computers", + "All Electronics": "All Electronics", + "Purchase Circles": "Purchase Circles", + "MP3 Players & Accessories": "MP3 Players & Accessories", + "Gift Cards": "Gift Cards", + "Office & School Supplies": "Office & School Supplies", + "Home Improvement": "Home Improvement", + "Camera & Photo": "Camera & Photo", + "GPS & Navigation": "GPS & Navigation", + "Digital Music": "Digital Music", + "Car Electronics": "Car Electronics", + "Baby": "Baby", + "Kindle Store": "Kindle Store", + "Buy a Kindle": "Buy a Kindle", + "Furniture & Décor": "Furniture & Decor", + "#508510": "#508510"} products_keys_list = list(products_mapping.keys()) - -################# Pubmed ################# - -""" -Reference: https://github.com/XiaoxinHe/TAPE/blob/main/core/ -data_utils/load_pubmed.py -""" +# Pubmed pubmed_mapping = { 0: "Experimentally induced diabetes", @@ -454,8 +460,15 @@ def get_raw_text_products(use_text=False): 2: "Type 2 diabetes", } + def get_pubmed_casestudy(corrected=False, seed=0): - _, data_x, data_y, data_pubid, data_edges = parse_pubmed() + """ + Get raw text data for the pubmed dataset. + + Reference: https://github.com/XiaoxinHe/TAPE/blob/main/core/ + data_utils/load_pubmed.py + """ + (_, data_x, data_y, data_pubid, data_edges) = parse_pubmed() data_x = normalize(data_x, norm="l1") torch.manual_seed(seed) @@ -503,6 +516,7 @@ def get_pubmed_casestudy(corrected=False, seed=0): def parse_pubmed(): + """Parse the pubmed dataset.""" path = "dataset/PubMed/data/" n_nodes = 19717 @@ -517,8 +531,8 @@ def parse_pubmed(): feature_to_index = {} # parse nodes - with open(path + "Pubmed-Diabetes.NODE.paper.tab", "r", encoding="UTF-8")\ - as node_file: + with open(path + "Pubmed-Diabetes.NODE.paper.tab", + "r", encoding="UTF-8") as node_file: # first two lines are headers node_file.readline() node_file.readline() @@ -575,11 +589,15 @@ def parse_pubmed(): data_edges.append( (paper_to_index[tail], paper_to_index[head])) - return data_a, data_x, data_y, data_pubid,\ - np.unique(data_edges, axis=0).transpose() + return (data_a, + data_x, + data_y, + data_pubid, + np.unique(data_edges, axis=0).transpose()) def get_raw_text_pubmed(use_text=False, seed=0): + """Return the data and text for the pubmed dataset.""" data, _ = get_pubmed_casestudy(seed=seed) if not use_text: return data, None diff --git a/pyproject.toml b/pyproject.toml index b4e09738..643c7056 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6"] +dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas>=2.0.0"] optional-dependencies = { test = [ "pytest", "pydocstyle", From 9822f3223863c34e7e42c7408fd7e8b8b1b77667 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 15:29:53 -0500 Subject: [PATCH 03/15] fix panda version --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 643c7056..6611d44c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas>=2.0.0"] 
+dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas"] optional-dependencies = { test = [ "pytest", "pydocstyle", From 211560c2cae4f68dc376775c82c4c1711211e487 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 15:33:52 -0500 Subject: [PATCH 04/15] add datasets package --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 6611d44c..a4f1ce5a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas"] +dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas", "datasets"] optional-dependencies = { test = [ "pytest", "pydocstyle", From 7336d6be8e40c78d916d23d1eda9d179db1e0cd2 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 15:37:24 -0500 Subject: [PATCH 05/15] remove raw_text_utils in gli.init --- gli/__init__.py | 1 - pyproject.toml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/gli/__init__.py b/gli/__init__.py index 7bdc67a5..d2cb5229 100644 --- a/gli/__init__.py +++ b/gli/__init__.py @@ -5,6 +5,5 @@ from . import graph from . import task from . import utils -from . import raw_text_utils from .dataloading import get_gli_graph, get_gli_task, \ get_gli_dataset, combine_graph_and_task diff --git a/pyproject.toml b/pyproject.toml index a4f1ce5a..b4e09738 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas", "datasets"] +dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6"] optional-dependencies = { test = [ "pytest", "pydocstyle", From 8f11f22157cb1f231a5dfc2604aba7f5d041e1da Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 15:42:58 -0500 Subject: [PATCH 06/15] remove redundent prints --- gli/io/graph.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gli/io/graph.py b/gli/io/graph.py index 04c37d82..d7bfbb0d 100644 --- a/gli/io/graph.py +++ b/gli/io/graph.py @@ -334,8 +334,6 @@ def save_homograph( # Call save_data(). key_to_loc = save_data(f"{name}__graph", save_dir=save_dir, **data) - print(f"{key_to_loc = }") - # Create the metadata dict. 
metadata = {"description": description, "data": {}} @@ -362,7 +360,6 @@ def save_homograph( # Add the metadata of the raw text raw_text_dict = {} for r in raw_text_attrs: - print(f"{r.name = }") raw_text_dict[r.name] = _attr_to_metadata_dict(key_to_loc, "RawText", r) metadata["data"]["RawText"] = raw_text_dict From 8dda830f35c4281fa9de7c0b73b424814080247d Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Mon, 4 Mar 2024 18:17:48 -0500 Subject: [PATCH 07/15] fix error handling of raw test --- datasets/cora/cora.ipynb | 23 +++++++---------------- datasets/cora/metadata.json | 2 +- gli/graph.py | 3 +++ gli/raw_text_utils.py | 6 +++++- gli/utils.py | 1 - 5 files changed, 16 insertions(+), 19 deletions(-) diff --git a/datasets/cora/cora.ipynb b/datasets/cora/cora.ipynb index 509771b1..e6814e05 100644 --- a/datasets/cora/cora.ipynb +++ b/datasets/cora/cora.ipynb @@ -133,16 +133,7 @@ "cell_type": "code", "execution_count": 5, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "key_to_loc = {'RawText_NodeRawText': {'optional file': 'cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz', 'key': 'RawText_NodeRawText'}, 'Edge_Edge': {'file': 'cora__graph__6c912909fa18eff10797210ea5e485fe.npz', 'key': 'Edge_Edge'}, 'Node_NodeLabel': {'file': 'cora__graph__6c912909fa18eff10797210ea5e485fe.npz', 'key': 'Node_NodeLabel'}, 'Node_NodeFeature': {'file': 'cora__graph__Node_NodeFeature__7032c9c380d1889061dcbbcd76b8c427.sparse.npz'}, 'Graph_NodeList': {'file': 'cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz'}}\n", - "r.name = 'NodeRawText'\n" - ] - } - ], + "outputs": [], "source": [ "from gli.io import save_graph, Attribute\n", "\n", @@ -166,7 +157,7 @@ " Attribute(\n", " \"NodeRawText\",\n", " raw_text_dict,\n", - " \"Raw text of title, abstract and label of each node in Cora dataset, string.\",\n", + " \"Raw text of title, abstract and label of each node in Cora dataset, dict of list of strings.\",\n", " \"Dict\",\n", " 'Dict[str, list[str]]'\n", " )\n", @@ -231,7 +222,7 @@ " },\n", " \"RawText\": {\n", " \"NodeRawText\": {\n", - " \"description\": \"Raw text of title, abstract and label of each node in Cora dataset, string.\",\n", + " \"description\": \"Raw text of title, abstract and label of each node in Cora dataset, dict of list of strings.\",\n", " \"type\": \"Dict\",\n", " \"format\": \"Dict[str, list[str]]\",\n", " \"optional file\": \"cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz\",\n", @@ -428,15 +419,15 @@ "name": "stdout", "output_type": "stream", "text": [ - "title ['Title: The megaprior heuristic for discovering protein sequence patterns ', 'Title: Applications of machine learning: a medical follow up study ', 'Title: Submitted to NIPS96, Section: Applications. Preference: Oral presentation Reinforcement Learning for Dynamic Channel Allocation in']\n", - "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. 
This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ', 'Abstract: This paper describes preliminary work that aims to apply some learning strategies to a medical follow-up study. An investigation of the application of three machine learning algorithms-1R, FOIL and InductH to identify risk factors that govern the colposuspension cure rate has been made. The goal of this study is to induce a generalised description or explanation of the classification attribute, colposuspension cure rate (completely cured, improved, unchanged and worse) from the 767 examples in the questionnaires. We looked for a set of rules that described which risk factors result in differences of cure rate. The results were encouraging, and indicate that machine learning can play a useful role in large scale medical problem solving. ', 'Abstract: In cellular telephone systems, an important problem is to dynamically allocate the communication resource (channels) so as to maximize service in a stochastic caller environment. This problem is naturally formulated as a dynamic programming problem and we use a reinforcement learning (RL) method to find dynamic channel allocation policies that are better than previous heuristic solutions. The policies obtained perform well for a broad variety of call traffic patterns. We present results on a large cellular system In cellular communication systems, an important problem is to allocate the communication resource (bandwidth) so as to maximize the service provided to a set of mobile callers whose demand for service changes stochastically. A given geographical area is divided into mutually disjoint cells, and each cell serves the calls that are within its boundaries (see Figure 1a). The total system bandwidth is divided into channels, with each channel centered around a frequency. Each channel can be used simultaneously at different cells, provided these cells are sufficiently separated spatially, so that there is no interference between them. The minimum separation distance between simultaneous reuse of the same channel is called the channel reuse constraint . When a call requests service in a given cell either a free channel (one that does not violate the channel reuse constraint) may be assigned to the call, or else the call is blocked from the system; this will happen if no free channel can be found. Also, when a mobile caller crosses from one cell to another, the call is \"handed off\" to the cell of entry; that is, a new free channel is provided to the call at the new cell. If no such channel is available, the call must be dropped/disconnected from the system. One objective of a channel allocation policy is to allocate the available channels to calls so that the number of blocked calls is minimized. 
An additional objective is to minimize the number of calls that are dropped when they are handed off to a busy cell. These two objectives must be weighted appropriately to reflect their relative importance, since dropping existing calls is generally more undesirable than blocking new calls. with approximately 70 49 states.']\n", - "label ['Neural Networks', 'Rule Learning', 'Reinforcement Learning']\n" + "title ['Title: The megaprior heuristic for discovering protein sequence patterns ']\n", + "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ']\n", + "label ['Neural Networks']\n" ] } ], "source": [ "for key, item in data.NodeRawText['RawText_NodeRawText'].items():\n", - " print(key, item[:3])" + " print(key, item[:1])" ] }, { diff --git a/datasets/cora/metadata.json b/datasets/cora/metadata.json index 8a058581..25f82fc0 100644 --- a/datasets/cora/metadata.json +++ b/datasets/cora/metadata.json @@ -29,7 +29,7 @@ }, "RawText": { "NodeRawText": { - "description": "Raw text of title, abstract and label of each node in Cora dataset, string.", + "description": "Raw text of title, abstract and label of each node in Cora dataset, dict of list of strings.", "type": "Dict", "format": "Dict[str, list[str]]", "optional file": "cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz", diff --git a/gli/graph.py b/gli/graph.py index 6f3c23e4..56296f40 100644 --- a/gli/graph.py +++ b/gli/graph.py @@ -177,6 +177,9 @@ def _get_homograph(data, load_raw_text=False): g.edata[attr] = _to_tensor(array) if load_raw_text: + assert "RawText" in data, "RawText is not found in the data,"\ + " please verify that the dataset "\ + "contains raw text." 
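# --- Illustrative sketch (not part of the patch) ----------------------------
# The assertion added above turns a missing "RawText" section into an explicit
# failure when raw text is requested.  The call below assumes that
# read_gli_graph forwards the load_raw_text flag used in this patch and that
# the metadata path points at a dataset without raw text; both are example
# values, not taken from the repository.
from gli.graph import read_gli_graph

read_gli_graph("datasets/some_dataset/metadata.json", load_raw_text=True)
# AssertionError: RawText is not found in the data, please verify that the
# dataset contains raw text.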
for attr, raw_text_dict in data["RawText"].items(): setattr(g, attr, raw_text_dict) diff --git a/gli/raw_text_utils.py b/gli/raw_text_utils.py index 011d787b..a922d149 100644 --- a/gli/raw_text_utils.py +++ b/gli/raw_text_utils.py @@ -273,7 +273,11 @@ def parse_cora(): f"{path}.cites", dtype=np.dtype(str)) edges = np.array(list(map(idx_map.get, edges_unordered.flatten())))\ .reshape(edges_unordered.shape) - data_edges = np.array(edges[~(edges is None).max(1)], dtype="int") + # data_edges = np.array(edges[~(edges == None).max(1)], dtype="int") + # edges = np.where(edges == None, np.nan, edges) + edges = np.array([[np.nan if x is None else x for x in row] + for row in edges]) + data_edges = np.array(edges[~np.isnan(edges).any(1)], dtype="int") data_edges = np.vstack((data_edges, np.fliplr(data_edges))) return (data_x, data_y, diff --git a/gli/utils.py b/gli/utils.py index 7231c43f..c8267e05 100644 --- a/gli/utils.py +++ b/gli/utils.py @@ -309,7 +309,6 @@ def _get_url_from_server(data_file: str): resp = requests.request("GET", f"{SERVER_IP}/api/get-url/{data_file}", timeout=5) - print(resp.url) resp = resp.json() if resp["message_type"] == "error": return None From 5dfa9f91afb38d99c2697e002cba5922e00ebd22 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 20:34:32 -0500 Subject: [PATCH 08/15] save raw text as node attributes --- datasets/cora/cora.ipynb | 99 ++++++++++++++++++++++++------------- datasets/cora/metadata.json | 30 +++++++---- gli/graph.py | 23 ++++----- gli/io/graph.py | 21 ++------ gli/utils.py | 34 ++++++------- tests/test_metadata.py | 5 ++ 6 files changed, 121 insertions(+), 91 deletions(-) diff --git a/datasets/cora/cora.ipynb b/datasets/cora/cora.ipynb index e6814e05..10c2318f 100644 --- a/datasets/cora/cora.ipynb +++ b/datasets/cora/cora.ipynb @@ -110,9 +110,9 @@ "output_type": "stream", "text": [ "dict_keys(['title', 'abs', 'label'])\n", - "title ['Title: The megaprior heuristic for discovering protein sequence patterns ', 'Title: Applications of machine learning: a medical follow up study ', 'Title: Submitted to NIPS96, Section: Applications. Preference: Oral presentation Reinforcement Learning for Dynamic Channel Allocation in']\n", - "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. 
We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ', 'Abstract: This paper describes preliminary work that aims to apply some learning strategies to a medical follow-up study. An investigation of the application of three machine learning algorithms-1R, FOIL and InductH to identify risk factors that govern the colposuspension cure rate has been made. The goal of this study is to induce a generalised description or explanation of the classification attribute, colposuspension cure rate (completely cured, improved, unchanged and worse) from the 767 examples in the questionnaires. We looked for a set of rules that described which risk factors result in differences of cure rate. The results were encouraging, and indicate that machine learning can play a useful role in large scale medical problem solving. ', 'Abstract: In cellular telephone systems, an important problem is to dynamically allocate the communication resource (channels) so as to maximize service in a stochastic caller environment. This problem is naturally formulated as a dynamic programming problem and we use a reinforcement learning (RL) method to find dynamic channel allocation policies that are better than previous heuristic solutions. The policies obtained perform well for a broad variety of call traffic patterns. We present results on a large cellular system In cellular communication systems, an important problem is to allocate the communication resource (bandwidth) so as to maximize the service provided to a set of mobile callers whose demand for service changes stochastically. A given geographical area is divided into mutually disjoint cells, and each cell serves the calls that are within its boundaries (see Figure 1a). The total system bandwidth is divided into channels, with each channel centered around a frequency. Each channel can be used simultaneously at different cells, provided these cells are sufficiently separated spatially, so that there is no interference between them. The minimum separation distance between simultaneous reuse of the same channel is called the channel reuse constraint . When a call requests service in a given cell either a free channel (one that does not violate the channel reuse constraint) may be assigned to the call, or else the call is blocked from the system; this will happen if no free channel can be found. Also, when a mobile caller crosses from one cell to another, the call is \"handed off\" to the cell of entry; that is, a new free channel is provided to the call at the new cell. If no such channel is available, the call must be dropped/disconnected from the system. One objective of a channel allocation policy is to allocate the available channels to calls so that the number of blocked calls is minimized. An additional objective is to minimize the number of calls that are dropped when they are handed off to a busy cell. These two objectives must be weighted appropriately to reflect their relative importance, since dropping existing calls is generally more undesirable than blocking new calls. 
with approximately 70 49 states.']\n", - "label ['Neural Networks', 'Rule Learning', 'Reinforcement Learning']\n" + "title ['Title: The megaprior heuristic for discovering protein sequence patterns ']\n", + "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ']\n", + "label ['Neural Networks']\n" ] } ], @@ -126,7 +126,7 @@ "print(raw_text_dict.keys())\n", "\n", "for key, item in raw_text_dict.items():\n", - " print(key, item[:3])\n" + " print(key, item[:1])\n" ] }, { @@ -151,24 +151,36 @@ " \"Node labels of Cora dataset, int ranged from 1 to 7.\",\n", " \"int\",\n", " \"Tensor\",\n", - " )\n", - "]\n", - "raw_text_attrs = [\n", + " ),\n", + " Attribute(\n", + " \"NodeRawTextTitle\",\n", + " raw_text_dict[\"title\"],\n", + " \"Raw text of title of each node in Cora dataset, list of strings.\",\n", + " \"str\",\n", + " \"List[str]\"\n", + " ),\n", + " Attribute(\n", + " \"NodeRawTextAbstract\",\n", + " raw_text_dict[\"abs\"],\n", + " \"Raw text of abstract of each node in Cora dataset, list of strings.\",\n", + " \"str\",\n", + " \"List[str]\"\n", + " ),\n", " Attribute(\n", - " \"NodeRawText\",\n", - " raw_text_dict,\n", - " \"Raw text of title, abstract and label of each node in Cora dataset, dict of list of strings.\",\n", - " \"Dict\",\n", - " 'Dict[str, list[str]]'\n", + " \"NodeRawTextLabel\",\n", + " raw_text_dict[\"label\"],\n", + " \"Raw text of label of each node in Cora dataset, list of strings.\",\n", + " \"str\",\n", + " \"List[str]\"\n", " )\n", "]\n", "\n", + "\n", "metadata = save_graph(\n", " name=\"cora\",\n", " edge=edge,\n", " num_nodes=graph.num_nodes(),\n", " node_attrs=node_attrs,\n", - " raw_text_attrs=raw_text_attrs,\n", " description=\"CORA dataset.\",\n", " cite=\n", " \"@inproceedings{yang2016revisiting,\\ntitle={Revisiting semi-supervised learning with graph embeddings},\\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\\nbooktitle={International conference on machine learning},\\npages={40--48},\\nyear={2016},\\norganization={PMLR}\\n}\",\n", @@ -207,6 +219,27 @@ " \"format\": \"Tensor\",\n", " \"file\": \"cora__graph__6c912909fa18eff10797210ea5e485fe.npz\",\n", " \"key\": \"Node_NodeLabel\"\n", + " },\n", + " 
\"NodeRawTextTitle\": {\n", + " \"description\": \"Raw text of title of each node in Cora dataset, list of strings.\",\n", + " \"type\": \"str\",\n", + " \"format\": \"List[str]\",\n", + " \"optional file\": \"cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz\",\n", + " \"key\": \"Node_NodeRawTextTitle\"\n", + " },\n", + " \"NodeRawTextAbstract\": {\n", + " \"description\": \"Raw text of abstract of each node in Cora dataset, list of strings.\",\n", + " \"type\": \"str\",\n", + " \"format\": \"List[str]\",\n", + " \"optional file\": \"cora__graph__Node_NodeRawTextAbstract__d0e5436087314624c74a9f040d6f394f.optional.npz\",\n", + " \"key\": \"Node_NodeRawTextAbstract\"\n", + " },\n", + " \"NodeRawTextLabel\": {\n", + " \"description\": \"Raw text of label of each node in Cora dataset, list of strings.\",\n", + " \"type\": \"str\",\n", + " \"format\": \"List[str]\",\n", + " \"optional file\": \"cora__graph__Node_NodeRawTextLabel__06d184316789acc0902db2b8c1472f95.optional.npz\",\n", + " \"key\": \"Node_NodeRawTextLabel\"\n", " }\n", " },\n", " \"Edge\": {\n", @@ -219,15 +252,6 @@ " \"_NodeList\": {\n", " \"file\": \"cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz\"\n", " }\n", - " },\n", - " \"RawText\": {\n", - " \"NodeRawText\": {\n", - " \"description\": \"Raw text of title, abstract and label of each node in Cora dataset, dict of list of strings.\",\n", - " \"type\": \"Dict\",\n", - " \"format\": \"Dict[str, list[str]]\",\n", - " \"optional file\": \"cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz\",\n", - " \"key\": \"RawText_NodeRawText\"\n", - " }\n", " }\n", " },\n", " \"citation\": \"@inproceedings{yang2016revisiting,\\ntitle={Revisiting semi-supervised learning with graph embeddings},\\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\\nbooktitle={International conference on machine learning},\\npages={40--48},\\nyear={2016},\\norganization={PMLR}\\n}\",\n", @@ -348,7 +372,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/jinhuang/Documents/research/gli/datasets/cora/../../gli/utils.py:263: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1682343673238/work/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n", + "/Users/jinhuang/Documents/research/gli/datasets/cora/../../gli/utils.py:262: UserWarning: Sparse CSR tensor support is in beta state. If you miss a functionality in the sparse tensor support, please submit a feature request to https://github.com/pytorch/pytorch/issues. (Triggered internally at /Users/runner/work/_temp/anaconda/conda-bld/pytorch_1682343673238/work/aten/src/ATen/SparseCsrTensorImpl.cpp:56.)\n", " return torch.sparse_csr_tensor(crow_indices,\n" ] }, @@ -374,6 +398,13 @@ "data[0]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Loading data with raw text." 
+ ] + }, { "cell_type": "code", "execution_count": 11, @@ -412,22 +443,24 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "title ['Title: The megaprior heuristic for discovering protein sequence patterns ']\n", - "abs ['Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. ']\n", - "label ['Neural Networks']\n" - ] + "data": { + "text/plain": [ + "('Title: The megaprior heuristic for discovering protein sequence patterns ',\n", + " 'Abstract: Several computer algorithms for discovering patterns in groups of protein sequences are in use that are based on fitting the parameters of a statistical model to a group of related sequences. These include hidden Markov model (HMM) algorithms for multiple sequence alignment, and the MEME and Gibbs sampler algorithms for discovering motifs. These algorithms are sometimes prone to producing models that are incorrect because two or more patterns have been combined. The statistical model produced in this situation is a convex combination (weighted average) of two or more different models. This paper presents a solution to the problem of convex combinations in the form of a heuristic based on using extremely low variance Dirichlet mixture priors as part of the statistical model. This heuristic, which we call the megaprior heuristic, increases the strength (i.e., decreases the variance) of the prior in proportion to the size of the sequence dataset. This causes each column in the final model to strongly resemble the mean of a single component of the prior, regardless of the size of the dataset. We describe the cause of the convex combination problem, analyze it mathematically, motivate and describe the implementation of the megaprior heuristic, and show how it can effectively eliminate the problem of convex combinations in protein sequence pattern discovery. 
',\n", + " 'Neural Networks')" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "for key, item in data.NodeRawText['RawText_NodeRawText'].items():\n", - " print(key, item[:1])" + "data.NodeRawTextTitle[0], data.NodeRawTextAbstract[0], data.NodeRawTextLabel[0]" ] }, { diff --git a/datasets/cora/metadata.json b/datasets/cora/metadata.json index 25f82fc0..a928d8b5 100644 --- a/datasets/cora/metadata.json +++ b/datasets/cora/metadata.json @@ -14,6 +14,27 @@ "format": "Tensor", "file": "cora__graph__6c912909fa18eff10797210ea5e485fe.npz", "key": "Node_NodeLabel" + }, + "NodeRawTextTitle": { + "description": "Raw text of title of each node in Cora dataset, list of strings.", + "type": "str", + "format": "List[str]", + "optional file": "cora__graph__Node_NodeRawTextTitle__4a9ad6575f5acfe3b828fe66f072bd5c.optional.npz", + "key": "Node_NodeRawTextTitle" + }, + "NodeRawTextAbstract": { + "description": "Raw text of abstract of each node in Cora dataset, list of strings.", + "type": "str", + "format": "List[str]", + "optional file": "cora__graph__Node_NodeRawTextAbstract__d0e5436087314624c74a9f040d6f394f.optional.npz", + "key": "Node_NodeRawTextAbstract" + }, + "NodeRawTextLabel": { + "description": "Raw text of label of each node in Cora dataset, list of strings.", + "type": "str", + "format": "List[str]", + "optional file": "cora__graph__Node_NodeRawTextLabel__06d184316789acc0902db2b8c1472f95.optional.npz", + "key": "Node_NodeRawTextLabel" } }, "Edge": { @@ -26,15 +47,6 @@ "_NodeList": { "file": "cora__graph__Graph_NodeList__23bbef862fd6037395412eb03b4e1d9c.sparse.npz" } - }, - "RawText": { - "NodeRawText": { - "description": "Raw text of title, abstract and label of each node in Cora dataset, dict of list of strings.", - "type": "Dict", - "format": "Dict[str, list[str]]", - "optional file": "cora__graph__835178b65ba8cfdfb9c91f33c6260506.optional.npz", - "key": "RawText_NodeRawText" - } } }, "citation": "@inproceedings{yang2016revisiting,\ntitle={Revisiting semi-supervised learning with graph embeddings},\nauthor={Yang, Zhilin and Cohen, William and Salakhudinov, Ruslan},\nbooktitle={International conference on machine learning},\npages={40--48},\nyear={2016},\norganization={PMLR}\n}", diff --git a/gli/graph.py b/gli/graph.py index 56296f40..69c655ae 100644 --- a/gli/graph.py +++ b/gli/graph.py @@ -69,8 +69,7 @@ def read_gli_graph(metadata_path: os.PathLike, device="cpu", load_raw_text=load_raw_text) if _is_single_graph(data): - return _get_single_graph(data, device, hetero=hetero, - name=name, load_raw_text=load_raw_text) + return _get_single_graph(data, device, hetero=hetero, name=name) else: return _get_multi_graph(data, device, name=name) @@ -106,12 +105,12 @@ def _to_tensor(x, device="cpu"): def _get_single_graph(data, device="cpu", hetero=False, - name=None, load_raw_text=False): + name=None): """Initialize and return a single Graph instance given data.""" if hetero: g = _get_heterograph(data) else: - g = _get_homograph(data, load_raw_text=load_raw_text) + g = _get_homograph(data) setattr(g, "name", name) return g.to(device=device) @@ -158,7 +157,7 @@ def _get_multi_graph(data, device="cpu", name=None): return graphs -def _get_homograph(data, load_raw_text=False): +def _get_homograph(data): """Get a homogeneous graph from data.""" edges = data["Edge"].pop("_Edge") # (num_edges, 2) src_nodes, dst_nodes = edges.T[0], edges.T[1] @@ -171,18 +170,16 @@ def _get_homograph(data, load_raw_text=False): device="cpu") for attr, array in 
data["Node"].items(): - g.ndata[attr] = _to_tensor(array) + if "RawText" not in attr: + g.ndata[attr] = _to_tensor(array) + else: + # For any raw text attributes as list of strings, + # store them as a attribute of the graph object. + setattr(g, attr, array) for attr, array in data["Edge"].items(): g.edata[attr] = _to_tensor(array) - if load_raw_text: - assert "RawText" in data, "RawText is not found in the data,"\ - " please verify that the dataset "\ - "contains raw text." - for attr, raw_text_dict in data["RawText"].items(): - setattr(g, attr, raw_text_dict) - return g diff --git a/gli/io/graph.py b/gli/io/graph.py index d7bfbb0d..ab42a91c 100644 --- a/gli/io/graph.py +++ b/gli/io/graph.py @@ -57,8 +57,8 @@ def __init__(self, "or numpy array.") if self.format == "Tensor": self.num_data = len(data) - elif self.format == "Dict[str, list[str]]": - self.num_data = None + elif self.format == "List[str]": + self.num_data = len(data) else: self.num_data = data.shape[0] @@ -106,7 +106,6 @@ def save_graph( graph_node_list: Optional[spmatrix] = None, graph_edge_list: Optional[spmatrix] = None, graph_attrs: Optional[List[Attribute]] = None, - raw_text_attrs: Optional[List[Attribute]] = None, is_hetero: bool = False, description: str = "", cite: str = "", @@ -122,7 +121,7 @@ def save_graph( if not is_hetero: return save_homograph(name, edge, num_nodes, node_attrs, edge_attrs, graph_node_list, graph_edge_list, graph_attrs, - raw_text_attrs, description, cite, save_dir) + description, cite, save_dir) # verify the inputs are dict for heterograph if not isinstance(edge, dict): raise TypeError("The input edge must be a dictionary for heterograph.") @@ -149,7 +148,6 @@ def save_homograph( graph_node_list: Optional[spmatrix] = None, graph_edge_list: Optional[spmatrix] = None, graph_attrs: Optional[List[Attribute]] = None, - raw_text_attrs: Optional[List[Attribute]] = None, description: str = "", citation: str = "", save_dir: str = ".", @@ -198,7 +196,6 @@ def save_homograph( :rtype: dict Example - TODO: update this code example for raw_text ------- .. code-block:: python @@ -279,8 +276,6 @@ def save_homograph( edge_attrs = [] if graph_attrs is None: graph_attrs = [] - if raw_text_attrs is None: - raw_text_attrs = [] # Check the length of node/edge/graph attrs. _verify_attrs(node_attrs, "node") @@ -327,9 +322,6 @@ def save_homograph( assert g.name not in ("NodeList", "EdgeList"), \ "The name of a graph attribute cannot be 'NodeList' or 'EdgeList'." data[f"Graph_{g.name}"] = g.data - if raw_text_attrs is not None: - for r in raw_text_attrs: - data[f"RawText_{r.name}"] = r.data # Call save_data(). 
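# --- Illustrative sketch (not part of the patch) ----------------------------
# With raw_text_attrs removed, raw text is declared as an ordinary node
# attribute of format "List[str]" and passed through node_attrs, exactly like
# the numeric attributes.  The toy edge list and titles below are assumed
# example data, one title string per node.
import numpy as np
from gli.io import save_graph, Attribute

edge = np.array([[0, 1], [1, 2]])                                 # toy edges
titles = ["Title: paper A", "Title: paper B", "Title: paper C"]   # per node

node_attrs = [
    Attribute("NodeRawTextTitle", titles,
              "Raw text of title of each node, list of strings.",
              "str", "List[str]"),
]
metadata = save_graph(name="toy", edge=edge, num_nodes=3,
                      node_attrs=node_attrs,
                      description="Toy graph with raw text.")
# The list is written to its own "<name>__graph__Node_NodeRawTextTitle__<md5>
# .optional.npz" file and referenced in metadata.json under "optional file".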
key_to_loc = save_data(f"{name}__graph", save_dir=save_dir, **data) @@ -357,13 +349,6 @@ def save_homograph( graph_dict[g.name] = _attr_to_metadata_dict(key_to_loc, "Graph", g) metadata["data"]["Graph"] = graph_dict - # Add the metadata of the raw text - raw_text_dict = {} - for r in raw_text_attrs: - raw_text_dict[r.name] = _attr_to_metadata_dict(key_to_loc, - "RawText", r) - metadata["data"]["RawText"] = raw_text_dict - metadata["citation"] = citation metadata["is_heterogeneous"] = False diff --git a/gli/utils.py b/gli/utils.py index c8267e05..06e3c5cb 100644 --- a/gli/utils.py +++ b/gli/utils.py @@ -213,11 +213,10 @@ def load_data(path, key=None, device="cpu", load_raw_text=False): return sp.load_npz(path) if path.endswith(".optional.npz"): - # For raw text, which is saved as a dict if load_raw_text: - a = np.load(path, allow_pickle=True) - d = {k: a[k].item() for k in a.files} - return d + loaded_data = np.load(path, allow_pickle=True) + loaded_list = loaded_data["arr_0"] + return loaded_list else: return None @@ -540,10 +539,10 @@ def save_data(prefix, save_dir=".", **kwargs): """ dense_arrays = {} sparse_arrays = {} - dict_array = {} + list_arrays = {} for key, matrix in kwargs.items(): - if isinstance(matrix, dict): - dict_array[key] = matrix + if isinstance(matrix, list): + list_arrays[key] = matrix elif sp.issparse(matrix): sparse_arrays[key] = matrix elif isinstance(matrix, np.ndarray): @@ -565,19 +564,18 @@ def _dir(filename): return os.path.join(save_dir, filename) # Save dict with raw text to "optional file" - if dict_array: - np.savez_compressed(_dir(f"{prefix}.optional.npz"), **dict_array) - with open(_dir(f"{prefix}.optional.npz"), "rb") as f: - md5 = hashlib.md5(f.read()).hexdigest() - os.rename(_dir(f"{prefix}.optional.npz"), - _dir(f"{prefix}__{md5}.optional.npz")) - key_to_loc.update({ - key: { - "optional file": f"{prefix}__{md5}.optional.npz", + if list_arrays: + for key, list_ in list_arrays.items(): + array_ = np.array(list_, dtype=str) + np.savez_compressed(_dir(f"{prefix}__{key}.optional.npz"), array_) + with open(_dir(f"{prefix}__{key}.optional.npz"), "rb") as f: + md5 = hashlib.md5(f.read()).hexdigest() + os.rename(_dir(f"{prefix}__{key}.optional.npz"), + _dir(f"{prefix}__{key}__{md5}.optional.npz")) + key_to_loc[key] = { + "optional file": f"{prefix}__{key}__{md5}.optional.npz", "key": key } - for key in dict_array - }) # Save numpy arrays into a single file np.savez_compressed(_dir(f"{prefix}.npz"), **dense_arrays) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index ceeb88a9..01cb8dcc 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -49,6 +49,11 @@ def check_essential_keys_metadata_json_homogeneous(dic): # Scipy sparse file only stores one array # No `key` is needed. 
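# --- Illustrative sketch (not part of the patch) ----------------------------
# Round trip performed by the save_data/load_data changes to gli/utils.py
# above: each list-of-strings attribute is stored as a compressed numpy string
# array in a "*.optional.npz" file and read back from its "arr_0" entry when
# raw text is requested.  The file name below is an assumed example; the real
# files also carry an md5 digest of their contents.
import numpy as np

titles = ["Title: paper A", "Title: paper B"]

# what save_data does for a list-valued attribute
np.savez_compressed("toy__graph__Node_NodeRawTextTitle.optional.npz",
                    np.array(titles, dtype=str))

# what load_data does for a path ending in ".optional.npz"
loaded = np.load("toy__graph__Node_NodeRawTextTitle.optional.npz",
                 allow_pickle=True)["arr_0"]
print(loaded[0])  # -> "Title: paper A"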
continue + elif sub_key == "file": + alt_key = "optional file" + if dic["data"]["Node"][key].get(alt_key, None)\ + is not None: + continue missing_keys.append("data: Node: " + key + ": " + sub_key) for sup_key in ["Edge", "Graph"]: From b4c5ab2c4619ca8fcb0a8412ed32d91804451c2a Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 20:46:53 -0500 Subject: [PATCH 09/15] add pandas into env --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b4e09738..6611d44c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6"] +dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas"] optional-dependencies = { test = [ "pytest", "pydocstyle", From bd91a99c27f900d9d43ead868818134dbbfc7fb5 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 21:36:28 -0500 Subject: [PATCH 10/15] fix doc test --- pyproject.toml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6611d44c..1156cfde 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ classifiers = [ "License :: OSI Approved :: MIT License", "Operating System :: OS Independent", ] -dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6", "pandas"] +dependencies = ["numpy>=1.19", "scipy>=1.5", "torch>=1.10", "dgl>=0.6"] optional-dependencies = { test = [ "pytest", "pydocstyle", @@ -28,7 +28,9 @@ optional-dependencies = { test = [ ], doc = [ "sphinx", "sphinx-rtd-theme", - "sphinx_copybutton" + "sphinx_copybutton", + "pandas", + "yaml" ], tag = [ "powerlaw", ] } From 611cc57451117efd80271490e8406d744e364454 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 21:41:28 -0500 Subject: [PATCH 11/15] fix yaml for doc test --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1156cfde..424dea15 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -30,7 +30,7 @@ optional-dependencies = { test = [ "sphinx-rtd-theme", "sphinx_copybutton", "pandas", - "yaml" + "PyYAML" ], tag = [ "powerlaw", ] } From 05fedf9de92fbf0dc77fdfe5944fd907d7e4abb4 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 21:49:37 -0500 Subject: [PATCH 12/15] fix doc test env --- pyproject.toml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 424dea15..63d3f7a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,15 +22,16 @@ optional-dependencies = { test = [ "pytest", "pydocstyle", "pycodestyle", - "pylint", + "pylint",pydantic "pyyaml", "pre-commit", ], doc = [ "sphinx", "sphinx-rtd-theme", "sphinx_copybutton", + "pylint",pydantic "pandas", - "PyYAML" + "pyyaml" ], tag = [ "powerlaw", ] } From a218d9a4d5502091c52a1147c9b964c53343ef19 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 21:52:20 -0500 Subject: [PATCH 13/15] update doc env --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 63d3f7a5..7e514c28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,7 +29,7 @@ optional-dependencies = { test = [ "sphinx", "sphinx-rtd-theme", "sphinx_copybutton", - "pylint",pydantic + "pydantic", "pandas", "pyyaml" ], tag = [ From 
dfa1895e533342e3d74d3c37140b16411679f1e3 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Wed, 6 Mar 2024 21:55:16 -0500 Subject: [PATCH 14/15] fix test env --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7e514c28..640eb7a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,7 +22,7 @@ optional-dependencies = { test = [ "pytest", "pydocstyle", "pycodestyle", - "pylint",pydantic + "pylint", "pyyaml", "pre-commit", ], doc = [ From 7447d641f767c5335d7ea2f52a667299ccbacef3 Mon Sep 17 00:00:00 2001 From: StevenGolden1203 Date: Thu, 7 Mar 2024 00:42:12 -0500 Subject: [PATCH 15/15] finialize notebook --- datasets/cora/cora.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/datasets/cora/cora.ipynb b/datasets/cora/cora.ipynb index 10c2318f..35c4e4d0 100644 --- a/datasets/cora/cora.ipynb +++ b/datasets/cora/cora.ipynb @@ -443,7 +443,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -454,7 +454,7 @@ " 'Neural Networks')" ] }, - "execution_count": 14, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" }
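# --- Illustrative sketch (not part of the patch series) ---------------------
# End-to-end read-back after the raw-text support above: node attributes whose
# name contains "RawText" are attached to the loaded DGL graph as plain string
# arrays, while numeric attributes remain tensors in g.ndata.  This assumes
# read_gli_graph exposes the load_raw_text flag it uses internally in these
# patches and that the Cora metadata.json has been regenerated as in the
# notebook; adjust the path for your checkout.
from gli.graph import read_gli_graph

g = read_gli_graph("datasets/cora/metadata.json", load_raw_text=True)
print(g.ndata["NodeLabel"].shape)   # label tensor, one entry per node
print(g.NodeRawTextTitle[0])        # "Title: The megaprior heuristic ..."
print(g.NodeRawTextLabel[0])        # "Neural Networks"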