diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml index cdf1d94..34aa38e 100644 --- a/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_generator_module.yaml @@ -25,7 +25,7 @@ data: trainset_size: 10 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: generation # Type of data helper to use (quality, generation) - + seed: 42 # The seed for the random number generator prompt: prompt: base_doc_gen # Which prompt signature to use class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml index 41db88a..2be6156 100644 --- a/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_generator_module_eval.yaml @@ -25,6 +25,7 @@ data: trainset_size: 10 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: generation # Type of data helper to use (quality, generation) + seed: 42 # The seed for the random number generator prompt: prompt: base_doc_gen # Which prompt signature to use diff --git a/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml b/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml index 1693e9a..afa26c3 100644 --- a/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_generator_trainer.yaml @@ -25,7 +25,7 @@ data: trainset_size: 10 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: generation # Type of data helper to use (quality, generation) - + seed: 42 # The seed for the random number generator prompt: prompt: base_doc_gen # Which prompt signature to use class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml b/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml index 6a0a48c..582bafe 100644 --- a/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml +++ b/graphdoc/assets/configs/single_prompt_doc_quality_trainer.yaml @@ -25,7 +25,7 @@ data: trainset_size: 10 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: quality # Type of data helper to use (quality, generation) - + seed: 42 # The seed for the random number generator prompt: prompt: doc_quality # Which prompt signature to use class: DocQualityPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/graphdoc/config.py b/graphdoc/graphdoc/config.py index d12b2de..da1a463 100644 --- a/graphdoc/graphdoc/config.py +++ b/graphdoc/graphdoc/config.py @@ -29,10 +29,6 @@ # logging log = logging.getLogger(__name__) -# global variables -random.seed(42) - - ####################### # Resource Setup # ####################### @@ -160,7 +156,9 @@ def trainset_from_yaml(yaml_path: Union[str, Path]) -> List[dspy.Example]: def split_trainset( - trainset: List[dspy.Example], evalset_ratio: float + trainset: List[dspy.Example], + evalset_ratio: float, + seed: int = 42, ) -> tuple[List[dspy.Example], List[dspy.Example]]: """Split a trainset into a trainset and evalset. @@ -170,6 +168,7 @@ def split_trainset( tuple[List[dspy.Example], List[dspy.Example]] """ + random.seed(seed) split_idx = int(len(trainset) * (1 - evalset_ratio)) random.shuffle(trainset) evalset = trainset[split_idx:] @@ -201,6 +200,7 @@ def trainset_and_evalset_from_yaml( evalset_ratio: 0.1, # The proportionate size of evalset data_helper_type: quality # Type of data helper to use # (quality, generation) + seed: 42 # The seed for the random number generator :param yaml_path: Path to the YAML file. :type yaml_path: Union[str, Path] @@ -210,7 +210,9 @@ def trainset_and_evalset_from_yaml( """ config = load_yaml_config(yaml_path) trainset = trainset_from_dict(config["data"]) - return split_trainset(trainset, config["data"]["evalset_ratio"]) + return split_trainset( + trainset, config["data"]["evalset_ratio"], config["data"]["seed"] + ) ####################### diff --git a/graphdoc/graphdoc/main.py b/graphdoc/graphdoc/main.py index 588f7f9..478a0a4 100644 --- a/graphdoc/graphdoc/main.py +++ b/graphdoc/graphdoc/main.py @@ -3,7 +3,6 @@ import argparse import logging -import random # system packages import sys @@ -20,9 +19,6 @@ # logging log = logging.getLogger(__name__) -# global variables -random.seed(42) - ####################### # Main Entry Point # ####################### diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml index cdf1d94..34aa38e 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module.yaml @@ -25,7 +25,7 @@ data: trainset_size: 10 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: generation # Type of data helper to use (quality, generation) - + seed: 42 # The seed for the random number generator prompt: prompt: base_doc_gen # Which prompt signature to use class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml index 07edea4..e46a538 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_generator_module_eval.yaml @@ -25,7 +25,7 @@ data: trainset_size: 10 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: generation # Type of data helper to use (quality, generation) - + seed: 42 # The seed for the random number generator prompt: prompt: base_doc_gen # Which prompt signature to use class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml index 91fd89b..7e94e14 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_generator_trainer.yaml @@ -25,7 +25,7 @@ data: trainset_size: 1000 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: generation # Type of data helper to use (quality, generation) - + seed: 42 # The seed for the random number generator prompt: prompt: base_doc_gen # Which prompt signature to use class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml index 1cc63c6..463e19f 100644 --- a/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_doc_quality_trainer.yaml @@ -25,6 +25,7 @@ data: trainset_size: 1000 # The size of the trainset evalset_ratio: 0.1 # The proportionate size of the evalset data_helper_type: quality # Type of data helper to use (quality, generation) + seed: 42 # The seed for the random number generator prompt: prompt: doc_quality # Which prompt signature to use diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml index 841ebd7..942f1b1 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_trainer.yaml @@ -15,6 +15,7 @@ data: load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true) local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true) local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true) + seed: 42 # The seed for the random number generator prompt: prompt: zero_shot_doc_gen # Which prompt signature to use diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml index c0e264b..4e93614 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_generator_version.yaml @@ -15,6 +15,7 @@ data: load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true) local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true) local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true) + seed: 42 # The seed for the random number generator prompt: prompt: zero_shot_doc_gen # Which prompt signature to use diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml index 0c728d0..708344e 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_trainer.yaml @@ -15,7 +15,7 @@ data: load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true) local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true) local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true) - + seed: 42 # The seed for the random number generator prompt: prompt: doc_quality # Which prompt signature to use class: SchemaDocQualityPrompt # Must be a child of SinglePrompt (we will use an enum to map this) diff --git a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml index b239055..55ad936 100644 --- a/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_schema_doc_quality_version.yaml @@ -15,6 +15,7 @@ data: load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true) local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true) local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true) + seed: 42 # The seed for the random number generator prompt: prompt: doc_quality # Which prompt signature to use diff --git a/graphdoc/tests/assets/configs/single_prompt_trainer.yaml b/graphdoc/tests/assets/configs/single_prompt_trainer.yaml index 0c728d0..4b71afe 100644 --- a/graphdoc/tests/assets/configs/single_prompt_trainer.yaml +++ b/graphdoc/tests/assets/configs/single_prompt_trainer.yaml @@ -15,6 +15,7 @@ data: load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true) local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true) local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true) + seed: 42 # The seed for the random number generator prompt: prompt: doc_quality # Which prompt signature to use