Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
trainset_size: 10 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: generation # Type of data helper to use (quality, generation)

seed: 42 # The seed for the random number generator
prompt:
prompt: base_doc_gen # Which prompt signature to use
class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ data:
trainset_size: 10 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: generation # Type of data helper to use (quality, generation)
seed: 42 # The seed for the random number generator

prompt:
prompt: base_doc_gen # Which prompt signature to use
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
trainset_size: 10 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: generation # Type of data helper to use (quality, generation)

seed: 42 # The seed for the random number generator
prompt:
prompt: base_doc_gen # Which prompt signature to use
class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
trainset_size: 10 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: quality # Type of data helper to use (quality, generation)

seed: 42 # The seed for the random number generator
prompt:
prompt: doc_quality # Which prompt signature to use
class: DocQualityPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
14 changes: 8 additions & 6 deletions graphdoc/graphdoc/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,6 @@
# logging
log = logging.getLogger(__name__)

# global variables
random.seed(42)


#######################
# Resource Setup #
#######################
Expand Down Expand Up @@ -160,7 +156,9 @@ def trainset_from_yaml(yaml_path: Union[str, Path]) -> List[dspy.Example]:


def split_trainset(
trainset: List[dspy.Example], evalset_ratio: float
trainset: List[dspy.Example],
evalset_ratio: float,
seed: int = 42,
) -> tuple[List[dspy.Example], List[dspy.Example]]:
"""Split a trainset into a trainset and evalset.

Expand All @@ -170,6 +168,7 @@ def split_trainset(
tuple[List[dspy.Example], List[dspy.Example]]

"""
random.seed(seed)
split_idx = int(len(trainset) * (1 - evalset_ratio))
random.shuffle(trainset)
evalset = trainset[split_idx:]
Expand Down Expand Up @@ -201,6 +200,7 @@ def trainset_and_evalset_from_yaml(
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: quality # Type of data helper to use
# (quality, generation)
seed: 42 # The seed for the random number generator

:param yaml_path: Path to the YAML file.
:type yaml_path: Union[str, Path]
Expand All @@ -210,7 +210,9 @@ def trainset_and_evalset_from_yaml(
"""
config = load_yaml_config(yaml_path)
trainset = trainset_from_dict(config["data"])
return split_trainset(trainset, config["data"]["evalset_ratio"])
return split_trainset(
trainset, config["data"]["evalset_ratio"], config["data"]["seed"]
)


#######################
Expand Down
4 changes: 0 additions & 4 deletions graphdoc/graphdoc/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import argparse
import logging
import random

# system packages
import sys
Expand All @@ -20,9 +19,6 @@
# logging
log = logging.getLogger(__name__)

# global variables
random.seed(42)

#######################
# Main Entry Point #
#######################
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
trainset_size: 10 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: generation # Type of data helper to use (quality, generation)

seed: 42 # The seed for the random number generator
prompt:
prompt: base_doc_gen # Which prompt signature to use
class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
trainset_size: 10 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: generation # Type of data helper to use (quality, generation)

seed: 42 # The seed for the random number generator
prompt:
prompt: base_doc_gen # Which prompt signature to use
class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ data:
trainset_size: 1000 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: generation # Type of data helper to use (quality, generation)

seed: 42 # The seed for the random number generator
prompt:
prompt: base_doc_gen # Which prompt signature to use
class: DocGeneratorPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ data:
trainset_size: 1000 # The size of the trainset
evalset_ratio: 0.1 # The proportionate size of the evalset
data_helper_type: quality # Type of data helper to use (quality, generation)
seed: 42 # The seed for the random number generator

prompt:
prompt: doc_quality # Which prompt signature to use
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ data:
load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
seed: 42 # The seed for the random number generator

prompt:
prompt: zero_shot_doc_gen # Which prompt signature to use
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ data:
load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
seed: 42 # The seed for the random number generator

prompt:
prompt: zero_shot_doc_gen # Which prompt signature to use
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ data:
load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)

seed: 42 # The seed for the random number generator
prompt:
prompt: doc_quality # Which prompt signature to use
class: SchemaDocQualityPrompt # Must be a child of SinglePrompt (we will use an enum to map this)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ data:
load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
seed: 42 # The seed for the random number generator

prompt:
prompt: doc_quality # Which prompt signature to use
Expand Down
1 change: 1 addition & 0 deletions graphdoc/tests/assets/configs/single_prompt_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ data:
load_local_specific_category: false # Whether to load all categories or a specific category (if load_from_local is true)
local_specific_category: perfect # The specific category to load from the dataset (if load_from_local is true)
local_parse_objects: True # Whether to parse the objects in the dataset (if load_from_local is true)
seed: 42 # The seed for the random number generator

prompt:
prompt: doc_quality # Which prompt signature to use
Expand Down