diff --git a/.github/workflows/cpu-basic-install-prepare-train-inf-test.yml b/.github/workflows/cpu-basic-install-prepare-train-inf-test.yml
new file mode 100644
index 0000000000..39d2318941
--- /dev/null
+++ b/.github/workflows/cpu-basic-install-prepare-train-inf-test.yml
@@ -0,0 +1,34 @@
+name: Basic PyTorch Installation, Data Prep, CPU Training, CPU Inference
+on: [push, pull_request]
+jobs:
+  Install-Dependencies_Data-Prep_CPU-Training_CPU-Inference:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+      - run: echo "${{ github.repository }} repository has been cloned to the runner."
+      - run: echo "Currently on ${{ github.ref }} branch"
+      - name: List directory contents
+        run: |
+          ls ${{ github.workspace }}
+      # Caching pip dependencies
+      - name: Cache pip dependencies
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements_cpu.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+      - name: Install CPU Dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          python3 -m pip install -r requirements_cpu.txt
+      - name: Run Small Network on CPU
+        run: |
+          python3 data/shakespeare_char/prepare.py
+          python3 train.py --out_dir=out --device=cpu --eval_interval=2 --log_interval=1 --block_size=2 --batch_size=2 --n_layer=2 --n_head=2 --n_embd=16 --max_iters=3 --lr_decay_iters=2 --dropout=0.0
+      - name: Run CPU Inference
+        run: |
+          python3 sample.py --device=cpu --out_dir="out"
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..dbc4bfac00
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+# folders
+__pycache__/
+logs/
+
+# file extensions
+*.pkl
+*.bin
+*.txt
+
+# checkpoint directories
+out*/
diff --git a/HW/SA/Softer_max/LUT.sv b/HW/SA/Softer_max/LUT.sv
new file mode 100644
index 0000000000..97002e2408
--- /dev/null
+++ b/HW/SA/Softer_max/LUT.sv
@@ -0,0 +1,29 @@
+// a LUT used for the pow2 unit; this LUT will be stored in an on-chip memory in the future
+// this LUT contains values from 2^(0/16) to 2^(15/16)
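+// Each entry stores round(2^(n/16) * 2^4), i.e. the fractional power of two in
+// Q3.4 fixed point; e.g. n=1: 2^(1/16) ~= 1.0443, times 16 ~= 16.7, which rounds
+// to 17 = 8'b00010001.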
+module LUT(
+    input logic signed [`FRAC-1:0] index,
+    output logic signed [`DATA_SIZE-1:0] out
+);
+    always_comb begin
+        // [2^(n/16)]*2^4, rounded to the nearest integer
+        out = 8'b0;
+        case(index)
+            4'b0000 : out = 8'b00010000;
+            4'b0001 : out = 8'b00010001;
+            4'b0010 : out = 8'b00010001;
+            4'b0011 : out = 8'b00010010;
+            4'b0100 : out = 8'b00010011;
+            4'b0101 : out = 8'b00010100;
+            4'b0110 : out = 8'b00010101;
+            4'b0111 : out = 8'b00010110;
+            4'b1000 : out = 8'b00010111;
+            4'b1001 : out = 8'b00011000;
+            4'b1010 : out = 8'b00011001;
+            4'b1011 : out = 8'b00011010;
+            4'b1100 : out = 8'b00011011;
+            4'b1101 : out = 8'b00011100;
+            4'b1110 : out = 8'b00011101;
+            4'b1111 : out = 8'b00011111;
+        endcase
+    end
+endmodule
\ No newline at end of file
diff --git a/HW/SA/Softer_max/Pow2_tb.sv b/HW/SA/Softer_max/Pow2_tb.sv
new file mode 100644
index 0000000000..a038735b73
--- /dev/null
+++ b/HW/SA/Softer_max/Pow2_tb.sv
@@ -0,0 +1,69 @@
+//The testbench for the Pow2 unit
+module Pow_tb();
+    logic clk;
+    logic signed [`DATA_SIZE-1:0] current_max;
+    logic signed [`DATA_SIZE-1:0] input_vector;
+    logic signed [`LARGE_SIZE:0] uSoftmax;
+    integer write_file0, write_file1, write_file2;
+    initial begin
+
+        write_file0 = $fopen("Pow_data.txt", "w");
+        write_file1 = $fopen("true_pow_data.txt", "w");
+        write_file2 = $fopen("Pow_data2.txt", "w");
+    end
+
+    Pow2 Pow2 (
+        .current_max,
+        .input_vector,
+        .uSoftmax //UnnormedSoftmax
+    );
+    int counter;
+    real true_out, xj, uSoftmax_floating;
+    always #5 clk = ~clk;
+
+    //initialize "current_max", could also use rand()
+    always #10 begin
+        counter = counter + 1;
+        if(counter == 5) begin
+            current_max = 0;
+            counter = 0;
+        end
+        else begin
+            current_max = current_max+1;
+        end
+    end
+
+    //initialize input vectors
+    always #5 begin
+        input_vector[`DATA_SIZE-2:`FRAC] = input_vector[`DATA_SIZE-2:`FRAC] + 1;
+        input_vector[`FRAC-1:0] = input_vector[`FRAC-1:0] + 1;
+    end
+
+    //calculate the expected value
+    always #5 begin
+        xj = input_vector[`DATA_SIZE-2:`FRAC] + (input_vector[`FRAC-1:0]/16.0);
+        true_out = 2**(xj-current_max+0.0);
+    end
+
+    //convert the output from fixed-point 8 to a real number
+    always #5 begin
+        uSoftmax_floating = uSoftmax[`LARGE_SIZE-2:`FRAC] + uSoftmax[`FRAC-1:0]*1.0/16.0;
+    end
+
+    //write the outputs into output files
+    always #5 begin
+        $fdisplay(write_file0,"%0d\t",uSoftmax);
+        $fdisplay(write_file1,"%0f\t",true_out);
+        $fdisplay(write_file2,"%0f\t",uSoftmax_floating);
+    end
+
+    //test sequence
+    initial begin
+        clk = 0;
+        counter = 0;
+        current_max = 0;
+        input_vector = 5;
+        #200;
+        $finish;
+    end
+endmodule
\ No newline at end of file
diff --git a/HW/SA/Softer_max/Pow2_unit.sv b/HW/SA/Softer_max/Pow2_unit.sv
new file mode 100644
index 0000000000..d08e0f19ea
--- /dev/null
+++ b/HW/SA/Softer_max/Pow2_unit.sv
@@ -0,0 +1,30 @@
+//perform 2^(xj-localMax)
+//To do: modify output data size
+//To do: explore floating point 8 representation
+module Pow2 (
+    input logic signed [`DATA_SIZE-1:0] current_max,
+    input logic signed [`DATA_SIZE-1:0] input_vector,
+    output logic signed [`LARGE_SIZE:0] uSoftmax //UnnormedSoftmax
+);
+    logic signed [`DATA_SIZE-1:0] pow2_frac;
+    logic signed [`LARGE_SIZE-1:0] FP_1;
+
+    // FP_1 = 1.0 in fixed point (only the 2^0 bit of the Q3.4 value is set)
+    always_comb begin
+        FP_1 = 0;
+        FP_1[`FRAC] = 1;
+    end
+
+    //return [2^(n/16)]*2^4 and then round to integer
+    LUT LUT0 (
+        .index(input_vector[`FRAC-1:0]),
+        .out(pow2_frac)
+    );
+
+    //for debug
+    logic signed [`DATA_SIZE-1-`FRAC:0] temp;
+    assign temp = input_vector [`DATA_SIZE-1:`FRAC];
+    //2^[(int xj)-localMax]*2^(frac+4)/2^4
+    //8'b01000000, 2 in FixedP8
+    assign uSoftmax = ((FP_1 <<< (temp - current_max)) * pow2_frac) >>> `FRAC;
+endmodule
\ No newline at end of file
diff --git a/HW/SA/Softer_max/README.md b/HW/SA/Softer_max/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/HW/SA/Softer_max/help.py b/HW/SA/Softer_max/help.py
new file mode 100644
index 0000000000..9c66630a58
--- /dev/null
+++ b/HW/SA/Softer_max/help.py
@@ -0,0 +1,44 @@
+# a helper Python script to compare the expected outputs of the Pow2 unit with its real outputs
+# files are generated by Pow2_tb.sv
+def fp8_to_real(value):
+    # FP8(3,4) has:
+    # - 3 integer bits
+    # - 4 fractional bits
+    # - Signed format
+
+    # Extract sign bit (not handled here)
+
+    # Extract integer part
+    integer = value >> 4
+
+    # Extract fraction part
+    fraction = value & 0b00001111
+
+    # Compute real value
+    real_value = (integer + fraction / 16)
+
+    return real_value
+
+def compare_files(file1_path, file2_path):
+    differences = []
+    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
+        for line1, line2 in zip(file1, file2):
+            try:
+                num1 = float(line1.strip())
+                num2 = float(line2.strip())
+                #num1 = fp8_to_real(num1)
+                difference = abs(num1 - num2)
+                differences.append(difference)
+            except ValueError:
+                # Handle the case where a line does not contain a valid number
+                differences.append(None)
+
+    return differences
+
+# Example usage
+file1_path = 'Pow_data2.txt'
+file2_path = 'true_pow_data.txt'
+differences = compare_files(file1_path, file2_path)
+print(differences)
+# skip lines that failed to parse (recorded as None) when averaging
+valid = [d for d in differences if d is not None]
+print(f"average diff = {sum(valid)/len(valid)}")
+#print(f"average diff after scale down by 2^4 is: {(sum(valid)/len(valid))/2**4}")
diff --git a/data/csv_data/.gitignore b/data/csv_data/.gitignore
new file mode 100644
index 0000000000..73ff7cde94
--- /dev/null
+++ b/data/csv_data/.gitignore
@@ -0,0 +1,3 @@
+*.csv
+*.bin
+*.pkl
diff --git a/data/csv_data/README.md b/data/csv_data/README.md
new file mode 100644
index 0000000000..7104922e50
--- /dev/null
+++ b/data/csv_data/README.md
@@ -0,0 +1,79 @@
+# Data-Shuffler for ML Permutation Invariance
+
+These Python scripts process time-series data from a CSV file to add permutation
+invariance to the CSV's data fields.
+
+Each row in the CSV becomes a single line in the text file, with each cell
+represented by a unique lowercase letter (starting from 'a') followed by the
+value from the cell.
+
+A command-line flag optionally shuffles the letter-value pairs in each line.
+
+Training on data processed with the shuffle option creates a form of
+in-frame permutation invariance.
+
+During inference, this gives the data freedom to move around, unlocking
+capabilities otherwise unavailable to fixed-frame-trained networks.
+
+For example, one can run a beam search over the labels to determine which
+letter-value pair the model is most certain of in the current frame, then
+build up the next frame incrementally.
+
+## Getting Started
+
+### Prerequisites
+
+- Python (3.6 or above)
+
+## Usage
+
+1. Separate your file into one with timestamp columns and one with data columns.
+
+2. Navigate to the directory where the script is located and run process_csv.py
+   on your data-column file:
+
+```sh
+python3 process_csv.py <data_columns.csv> <labeled_output.csv> --shuffle --exclude e
+```
+
+3. Recombine the output file from process_csv.py with the time column data.
+
+```sh
+python3 combine_csvs.py <time_columns.csv> <labeled_output.csv> <combined_output.csv>
+```
+
+4. Prepare the processed data file for training:
+
+```sh
+python3 prepare.py -i <combined_output.csv>
+```
+
+5. `cd` to the `explorations` folder, and utilize the script to run training:
+
+```sh
+cd ../../explorations
+bash run_csv_data_training.sh
+```
+
+6. [Optional] Create an exploration script to test training and inference with
+   and without shuffling.
+
+### Arguments
+
+- `input_file`: The path to the input CSV file containing time-series data.
+- `output_file`: The path to the output text file.
+- `--shuffle`: Optional flag to shuffle the order of letter-value pairs in each line.
+- `--exclude`: Optional flag to exclude letters already used by the dataset
+  (e.g. `e` for scientific notation) from the labeling.
+
+### Example
+
+For a full example see the `main.sh` script on generated sine + noise data.
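+
+As an illustration (the numeric values below are invented), a three-column CSV
+row such as:
+
+```
+9.96e-01,2.62e-01,1.26e+00
+```
+
+becomes `a9.96e-01b2.62e-01c1.26e+00` without `--shuffle`, or one of its
+permutations, such as `b2.62e-01a9.96e-01c1.26e+00`, with `--shuffle`.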
+
+## License
+
+This project is licensed under the MIT License
+
diff --git a/data/csv_data/combine_csvs.py b/data/csv_data/combine_csvs.py
new file mode 100644
index 0000000000..9427a01aef
--- /dev/null
+++ b/data/csv_data/combine_csvs.py
@@ -0,0 +1,30 @@
+import csv
+import argparse
+
+def combine_csv_columns(file_path1, file_path2, output_file_path):
+    with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2, open(output_file_path, 'w', newline='') as outfile:
+        reader1 = csv.reader(file1)
+        reader2 = csv.reader(file2)
+
+        for row1, row2 in zip(reader1, reader2):
+            # Combine the rows from both CSVs
+            combined_row = row1 + row2
+
+            # Join the row with no delimiter and write it to the file
+            outfile.write(''.join(combined_row))
+
+            # Write a newline character after each row
+            outfile.write('\n')
+
+def main(args):
+    combine_csv_columns(args.file1, args.file2, args.output)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Combine columns of two CSV files.')
+    parser.add_argument('file1', type=str, help='Path to the first input CSV file.')
+    parser.add_argument('file2', type=str, help='Path to the second input CSV file.')
+    parser.add_argument('output', type=str, help='Path to the output CSV file.')
+
+    args = parser.parse_args()
+    main(args)
+
diff --git a/data/csv_data/main.sh b/data/csv_data/main.sh
new file mode 100644
index 0000000000..4d4ce676d8
--- /dev/null
+++ b/data/csv_data/main.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# This is an example using generated data to demonstrate:
+# 1. labelling of different time series data
+# 2. shuffling of different fields with their label
+
+set -x
+
+# CREATE SYNTHETIC DATA -- Skip if using your own data
+# This generator creates two csvs (if `--split` is specified):
+# 1. time data called: `time_filename.csv`
+# 2. signal data called: `data_filename.csv`
+python3 sine_noise_generator.py --noise_level 0.3 --filename sine_data.csv --scientific --precision 2 --modulo 1000 --points 1000000 --split
+
+set +x
+echo -e "\nPreview: Generated Times"
+head time_sine_data.csv
+
+echo -e "\n\nPreview: Generated Data"
+head data_sine_data.csv
+echo -e "\n\n"
+set -x
+
+# Use a data-only csv (no timestamps), in this case `data_sine_data.csv`
+# This script does two things:
+# 1. _prepend_ labels to the data
+# 2. (optionally) shuffle the data
+# Since 'e' is used for scientific notation, skip this letter when doing labelling
+python3 process_csv.py data_sine_data.csv sine_noise_sn_shuffled.csv --shuffle --exclude e
+
+# preview the result
+set +x
+echo -e "\nPreview: Shuffled Data"
+head sine_noise_sn_shuffled.csv
+echo -e "\n\n"
+set -x
+
+# recombine
+python3 combine_csvs.py time_sine_data.csv sine_noise_sn_shuffled.csv processed_sine_data.csv
+
+set +x
+echo -e "\nPreview: Timestamps with Shuffled Data"
+head processed_sine_data.csv
diff --git a/data/csv_data/prepare.py b/data/csv_data/prepare.py
new file mode 100644
index 0000000000..68d9831572
--- /dev/null
+++ b/data/csv_data/prepare.py
@@ -0,0 +1,69 @@
+import os
+import pickle
+import numpy as np
+import argparse
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_file", required=True)
+    parser.add_argument("-t", "--token_file", default=None)
+    return parser.parse_args()
+
+
+def process_data(input_file, token_file):
+    with open(input_file, "r") as f:
+        data = f.read()
+
+    if token_file is not None:
+        with open(token_file, "r") as f:
+            token_data = f.read()
+    else:
+        token_data = data
+
+    print(f"Length of dataset: {len(data):,}")
+
+    chars = sorted(list(set(token_data)))
+    vocab_size = len(chars)
+
+    print(f"Unique chars: {''.join(chars)}")
+    print(f"Vocab size: {vocab_size:,}")
+
+    stoi = {ch: i for i, ch in enumerate(chars)}
+    itos = {i: ch for i, ch in enumerate(chars)}
+
+    def encode(s):
+        return [stoi[c] for c in s]
+
+    def decode(l):
+        return "".join([itos[i] for i in l])
+
+    n = len(data)
+    train_data = data[: int(n * 0.9)]
+    val_data = data[int(n * 0.9) :]
+
+    train_ids = encode(train_data)
+    val_ids = encode(val_data)
+
+    print(f"Train tokens: {len(train_ids):,}")
+    print(f"Val tokens: {len(val_ids):,}")
+
+    return train_ids, val_ids, stoi, itos
+
+
+def save_data(train_ids, val_ids, stoi, itos, output_dir):
+    train_ids = np.array(train_ids, dtype=np.uint16)
+    val_ids = np.array(val_ids, dtype=np.uint16)
+
+    train_ids.tofile(os.path.join(output_dir, "train.bin"))
+    val_ids.tofile(os.path.join(output_dir, "val.bin"))
+
+    meta = {"vocab_size": len(stoi), "itos": itos, "stoi": stoi}
+    with open(os.path.join(output_dir, "meta.pkl"), "wb") as f:
+        pickle.dump(meta, f)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    train_ids, val_ids, stoi, itos = process_data(args.input_file, args.token_file)
+    save_data(train_ids, val_ids, stoi, itos, ".")
diff --git a/data/csv_data/process_csv.py b/data/csv_data/process_csv.py
new file mode 100644
index 0000000000..fe279bc2b5
--- /dev/null
+++ b/data/csv_data/process_csv.py
@@ -0,0 +1,49 @@
+import csv
+import argparse
+import random
+
+def create_letter_mapping(exclude: list) -> dict:
+    # Create a mapping of indices to letters, skipping excluded letters.
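+    # e.g. exclude=['e'] maps column indices 0,1,2,3,4,... to 'a','b','c','d','f',...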
+    allowed_letters = [chr(i) for i in range(ord('a'), ord('z') + 1) if chr(i) not in exclude]
+    return {i: letter for i, letter in enumerate(allowed_letters)}
+
+def process_csv(input_file: str, output_file: str, shuffle: bool, exclude: list) -> None:
+    # Create the letter mapping
+    letter_mapping = create_letter_mapping(exclude)
+
+    with open(input_file, mode="r") as csv_file, open(output_file, mode="w") as txt_file:
+        csv_reader = csv.reader(csv_file)
+
+        for row in csv_reader:
+            # Use the letter mapping to assign letters to values
+            letter_value_pairs = [
+                f"{letter_mapping[i]}{val}" for i, val in enumerate(row) if i in letter_mapping
+            ]
+
+            if shuffle:
+                random.shuffle(letter_value_pairs)
+
+            # Join the letter-value pairs with no spaces and write to the output file.
+            txt_file.write("".join(letter_value_pairs) + "\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Process a time-series CSV and convert it to a custom text format while excluding certain letters."
+    )
+    parser.add_argument("input_file", type=str, help="Path to the input CSV file.")
+    parser.add_argument("output_file", type=str, help="Path to the output text file.")
+    parser.add_argument(
+        "--shuffle",
+        action="store_true",
+        help="Whether to shuffle the order of letter and value pairs.",
+    )
+    parser.add_argument(
+        "--exclude",
+        nargs="*",
+        default=[],
+        help="A list of letters to exclude from the letter labeling.",
+    )
+
+    args = parser.parse_args()
+    process_csv(args.input_file, args.output_file, args.shuffle, args.exclude)
+
diff --git a/data/csv_data/sine_noise_generator.py b/data/csv_data/sine_noise_generator.py
new file mode 100644
index 0000000000..5d9b4d0093
--- /dev/null
+++ b/data/csv_data/sine_noise_generator.py
@@ -0,0 +1,98 @@
+import argparse
+import numpy as np
+import pandas as pd
+
+# Function to generate sine wave with noise
+def generate_sine_wave_with_noise(freq, sample_rate, num_points, noise_level):
+    t = np.arange(num_points) # Time axis with integer values starting from 0
+    sine_wave = np.sin(2 * np.pi * freq * t / sample_rate)
+    noise = noise_level * np.random.normal(size=num_points)
+    combined_signal = sine_wave + noise
+    return t, sine_wave, noise, combined_signal
+
+# Function to format data as scientific notation if required
+def format_data_as_scientific(df, precision):
+    return df.applymap(lambda x: f"{x:.{precision}e}")
+
+# Function to save the data to CSV
+def save_to_csv(time, signal, noise, combined_signal, filename, scientific, precision, split_files, modulo):
+    # Apply modulo if necessary
+    if modulo is not None:
+        time = time % modulo
+
+    # Create the DataFrame
+    data = {
+        'signal': signal,
+        'noise': noise,
+        'signal_plus_noise': combined_signal
+    }
+
+    if split_files:
+        # Save time data to a separate CSV
+        time_df = pd.DataFrame({'seconds_from_start': time})
+        if scientific:
+            time_df = format_data_as_scientific(time_df, precision)
+        time_df.to_csv(f"time_{filename}", header=False, index=False)
+
+        # Save data to a separate CSV
+        data_df = pd.DataFrame(data)
+        if scientific:
+            data_df = format_data_as_scientific(data_df, precision)
+        data_df.to_csv(f"data_{filename}", header=False, index=False)
+    else:
+        # Combine time and data for a single CSV
+        df = pd.DataFrame({'seconds_from_start': time, **data})
+        if scientific:
+            df = format_data_as_scientific(df, precision)
+        df.to_csv(filename, header=False, index=False)
+
+# Parse command-line arguments
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Generate a sine wave with noise and export to CSV.')
+    parser.add_argument('-n', '--noise_level', type=float, default=0.5,
+                        help='Level of noise relative to the sine wave (0-1).')
+    parser.add_argument('-f', '--filename', type=str, default='sine_wave.csv',
+                        help='Name of the output CSV file.')
+    parser.add_argument('--scientific', action='store_true',
+                        help='Output numbers in scientific notation.')
+    parser.add_argument('--precision', type=int, default=2,
+                        help='Number of digits past the decimal point in scientific notation.')
+    parser.add_argument('--points', type=int, default=5000,
+                        help='Total number of data points to be created.')
+    parser.add_argument('--split', action='store_true',
+                        help='Save time data and signal data in separate CSV files.')
+    parser.add_argument('--modulo', type=int,
+                        help='Modulo value to apply to the time data.')
+    args = parser.parse_args()
+    if not (0 <= args.noise_level <= 1):
+        raise ValueError('Noise level must be between 0 and 1.')
+    if args.precision < 0:
+        raise ValueError('Precision must be a non-negative integer.')
+    if args.points <= 0:
+        raise ValueError('Number of data points must be a positive integer.')
+    return args
+
+def main():
+    args = parse_arguments()
+
+    # Parameters for sine wave generation
+    frequency = 5 # Frequency in Hz
+    sample_rate = 500 # Sample rate in Hz
+    num_points = args.points # Total number of data points
+
+    # Generate the sine wave with noise
+    time, sine_wave, noise, combined_signal = generate_sine_wave_with_noise(
+        frequency, sample_rate, num_points, args.noise_level
+    )
+
+    # Save to CSV file(s)
+    save_to_csv(time, sine_wave, noise, combined_signal, args.filename, args.scientific, args.precision, args.split, args.modulo)
+    if args.split:
+        print(f"Time data saved to time_{args.filename}")
+        print(f"Signal data saved to data_{args.filename}")
+    else:
+        print(f"Sine wave data with noise saved to {args.filename}")
+
+if __name__ == '__main__':
+    main()
+
diff --git a/data/experiments/prepare.py b/data/experiments/prepare.py
new file mode 100644
index 0000000000..cbc61c763b
--- /dev/null
+++ b/data/experiments/prepare.py
@@ -0,0 +1,100 @@
+"""
+Prepare the Shakespeare dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+Will save train.bin, val.bin containing the ids, and meta.pkl containing the
+encoder and decoder and some other related info.
+Supports tiny Shakespeare (--choice 0, default) or the TinyStories phonology
+dataset from Hugging Face (--choice 1).
+""" +import os +import pickle +import requests +import numpy as np +import argparse +from datasets import load_dataset +from pathlib import Path + +parser = argparse.ArgumentParser(description="Select training model, little Shakespeare or tinystories") +parser.add_argument('--choice', type=int, default=0, help="0 for little Shakespeare, 1 for tinystories") + +args = parser.parse_args() +choice = args.choice + +input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt') +#load data from hugging face +if choice == 1: + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + if not os.path.exists(data_dir / "full.txt"): + dataset = load_dataset("msaligane/tinystories_phonology", split="train") + full_text = "" + for i, example in enumerate(dataset): + filename = f"tinystoryP{i:02d}.txt" + filepath = data_dir / filename + + with open(filepath, "w") as f: + f.write(example["text"]) + + full_text += example["text"] + "\n" + + with open(data_dir / "full.txt", "w") as f: + f.write(full_text) + #get data from + with open(data_dir / "full.txt", 'r') as f: + data = f.read() +# download the tiny shakespeare dataset +elif choice == 0: + + if not os.path.exists(input_file_path): + data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' + with open(input_file_path, 'w') as f: + f.write(requests.get(data_url).text) + + with open(input_file_path, 'r') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = { ch:i for i,ch in enumerate(chars) } +itos = { i:ch for i,ch in enumerate(chars) } +def encode(s): + return [stoi[c] for c in s] # encoder: take a string, output a list of integers +def decode(l): + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + +# create the train and test splits +n = len(data) +train_data = data[:int(n*0.9)] +val_data = data[int(n*0.9):] + +# encode both to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) + +# length of dataset in characters: 1115394 +# all the unique characters: +# !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz +# vocab size: 65 +# train has 1003854 tokens +# val has 111540 tokens \ No newline at end of file diff --git a/data/experiments/prepare_tiktoken.py b/data/experiments/prepare_tiktoken.py new file mode 100644 index 0000000000..97503d1524 --- /dev/null +++ b/data/experiments/prepare_tiktoken.py @@ -0,0 +1,66 @@ +import os +import requests +import tiktoken +import numpy as np +import argparse +from datasets import load_dataset +from pathlib import Path + +parser = argparse.ArgumentParser(description="Select training model, little Shakespeare or 
tinystories") +parser.add_argument('--choice', type=int, default=0, help="0 for little Shakespeare, 1 for tinystories") + +args = parser.parse_args() +choice = args.choice + +input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt') +#load data from hugging face +if choice == 1: + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + if not os.path.exists(data_dir / "full.txt"): + dataset = load_dataset("msaligane/tinystories_phonology", split="train") + full_text = "" + for i, example in enumerate(dataset): + filename = f"tinystoryP{i:02d}.txt" + filepath = data_dir / filename + + with open(filepath, "w") as f: + f.write(example["text"]) + + full_text += example["text"] + "\n" + + with open(data_dir / "full.txt", "w") as f: + f.write(full_text) + #get data from + with open(data_dir / "full.txt", 'r') as f: + data = f.read() +# download the tiny shakespeare dataset +elif choice == 0: + # download the tiny shakespeare dataset + if not os.path.exists(input_file_path): + data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' + with open(input_file_path, 'w') as f: + f.write(requests.get(data_url).text) + + with open(input_file_path, 'r') as f: + data = f.read() + +n = len(data) +train_data = data[:int(n*0.9)] +val_data = data[int(n*0.9):] + +# encode with tiktoken gpt2 bpe +enc = tiktoken.get_encoding("gpt2") +train_ids = enc.encode_ordinary(train_data) +val_ids = enc.encode_ordinary(val_data) +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) + +# train.bin has 301,966 tokens +# val.bin has 36,059 tokens diff --git a/data/experiments/readme.md b/data/experiments/readme.md new file mode 100644 index 0000000000..32e12999f5 --- /dev/null +++ b/data/experiments/readme.md @@ -0,0 +1,24 @@ + +# tiny stories +Users who prefer to upload a dataset programmatically can use the huggingface_hub library. This library allows users to interact with the Hub from Python. 
+
+When running prepare.py or prepare_tiktoken.py, you will be asked to choose a
+dataset: 0 for tiny Shakespeare, 1 for tinystories.
+
+Clear generated files before rerunning prepare.py or prepare_tiktoken.py.
+
+When running with config/train_shakespeare_char.py, remember to change the following params:
+```
+wandb_project = 'experiments'
+wandb_run_name = 'mini-gpt'
+dataset = 'experiments'
+```
+
diff --git a/data/shakespeare_char/prepare.py b/data/shakespeare_char/prepare.py
index 9fd1621d55..a42dff5a0e 100644
--- a/data/shakespeare_char/prepare.py
+++ b/data/shakespeare_char/prepare.py
@@ -65,4 +65,4 @@ def decode(l):
 # !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
 # vocab size: 65
 # train has 1003854 tokens
-# val has 111540 tokens
+# val has 111540 tokens
\ No newline at end of file
diff --git a/explorations/run_csv_data_training.sh b/explorations/run_csv_data_training.sh
new file mode 100644
index 0000000000..209d6f364b
--- /dev/null
+++ b/explorations/run_csv_data_training.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+cd ../
+python3 train.py \
+    --max_iters 3000 \
+    --dataset csv_data \
+    --tensorboard_project csv_data \
+    --tensorboard_run_name csv_data
+
diff --git a/model.py b/model.py
index 5b04ecff80..bfbb11d303 100644
--- a/model.py
+++ b/model.py
@@ -52,6 +52,73 @@ def forward(self, x):
         e_x = torch.pow(2.0, x)
         return e_x / e_x.sum(dim=self.dim, keepdim=True)
 
+# Softmax base 2, with constant denominator, and option to remove max subtraction
+class Constantmax(nn.Module):
+    """ Base-2 Softmax with option to remove max subtraction"""
+    def __init__(self, dim=-1, subtract_max=True, constant=1000):
+        super().__init__()
+        self.dim = dim
+        self.subtract_max = subtract_max
+        self.constant = constant
+
+    def forward(self, x):
+        if self.subtract_max:
+            max_x = x.max(dim=self.dim, keepdim=True).values
+            x = x - max_x
+        e_x = torch.pow(2.0, x)
+        return e_x / self.constant
+
+# Like softermax, but parameterized to permit exploration of bases greater than 2
+class Strongermax(nn.Module):
+    """ Softmax with parameterized base ('strength') and option to remove max subtraction"""
+    def __init__(self, dim=-1, subtract_max=True, strength=2):
+        super().__init__()
+        self.strength = strength
+        self.dim = dim
+        self.subtract_max = subtract_max
+
+    def forward(self, x):
+        if self.subtract_max:
+            max_x = x.max(dim=self.dim, keepdim=True).values
+            x = x - max_x
+        e_x = torch.pow(self.strength, x)
+        return e_x / e_x.sum(dim=self.dim, keepdim=True)
+
+# Polynomial estimate of Softmax
+class Polymax(nn.Module):
+    def __init__(self, x_intercept=-100, y_intercept=1, power=2, divisor=1000.0):
+        super().__init__()
+
+        assert(x_intercept < 0) # ensure the x-intercept is strictly left of the y-axis
+        self.x_intercept = x_intercept # where to transition from y=0 to m*x+b
+        self.y_intercept = y_intercept # where the graph crosses the y-axis
+
+        self.power = power
+        self.divisor = divisor
+
+        # TODO: create single location for printing the model settings
+        print("Use polymax")
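+    # Continuity note: with m = -y_intercept/x_intercept (see forward below), the
+    # linear piece is 0 at x = x_intercept, matching the flat piece, and equals
+    # y_intercept at x = 0, matching the polynomial piece (0**power + y_intercept).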
+
+    def forward(self, x):
+        # Overview:
+        # Flat section:       -inf < x < x_intercept
+        # Linear section:     x_intercept <= x <= 0
+        # Polynomial section: 0 < x < inf
+
+        # Flat section
+        flat_piece = torch.where(x < self.x_intercept, torch.tensor(0.0, device=x.device), torch.tensor(0.0, device=x.device))
+
+        # Linear section
+        m = -self.y_intercept/self.x_intercept # slope of the line through (x_intercept, 0) and (0, y_intercept); requires x_intercept != 0
+        b = self.y_intercept
+        linear_piece = torch.where((x >= self.x_intercept) & (x <= 0), m * x + b, torch.tensor(0.0, device=x.device))
+
+        # Polynomial section
+        poly_piece = torch.where(x > 0, x**self.power + self.y_intercept, torch.tensor(0.0, device=x.device))
+
+        # Combine sections
+        return (poly_piece + linear_piece + flat_piece)/self.divisor
+
 # SigSoftmax from https://arxiv.org/abs/1805.10829
 class SigSoftmax(nn.Module):
     def __init__(self):
@@ -112,6 +179,18 @@ def __init__(self, config):
             self.use_softermax_xmax = config.use_softermax_xmax
             self.softmax_layer = Softermax(subtract_max=self.use_softermax_xmax)
 
+        if self.softmax_variant == "constantmax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.constantmax_constant = config.constantmax_constant
+            self.softmax_layer = Constantmax(subtract_max=self.use_softermax_xmax, constant=self.constantmax_constant)
+
+        if self.softmax_variant == "strongermax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.softmax_layer = Strongermax(subtract_max=self.use_softermax_xmax, strength=config.strongermax_strength)
+
+        if self.softmax_variant == "polymax":
+            self.softmax_layer = Polymax()
+
         if self.softmax_variant == "sigsoftmax":
             self.softmax_layer = SigSoftmax()
 
@@ -218,13 +297,15 @@ class GPTConfig:
     dropout: float = 0.0
 
     # Softmax Alternatives and Options
-    use_softmax_variant = False
-    softmax_variant: str = "softermax" # Choices: "softermax" "sigsoftmax" "sigsoftmax_base2"
+    use_softmax_variant: bool = False
+    softmax_variant: str = "softermax" # Choices: "softermax" "sigsoftmax" "sigsoftmax_base2" "polymax" "strongermax" "constantmax"
     use_softermax_xmax: bool = True # Softermax Option active is softermax selected - True: uses (x - x_max) normalization; False: removes normalization (potential overflow)
+    constantmax_constant: int = 1000 # denominator to utilize for Constantmax
+    strongermax_strength: int = 2 # base ("strength") used by Strongermax
 
     # Layernorm Alternatives and Options
     use_rmsnorm: bool = True # Add option for RMSNorm in place of LayerNorm: https://arxiv.org/abs/1910.07467
-    use_relu: bool = True #True: relu squared, False: do not utilize
+    use_relu: bool = False # True: use ReLU activation, False: do not utilize
     use_squared_relu: bool = False #True: utilize relu squared, False: do not utilize
     bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
 
@@ -255,6 +336,18 @@ def __init__(self, config):
             self.use_softermax_xmax = config.use_softermax_xmax
             self.softmax_layer = Softermax(subtract_max=self.use_softermax_xmax)
 
+        if self.softmax_variant == "constantmax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.constantmax_constant = config.constantmax_constant
+            self.softmax_layer = Constantmax(subtract_max=self.use_softermax_xmax, constant=self.constantmax_constant)
+
+        if self.softmax_variant == "strongermax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.softmax_layer = Strongermax(subtract_max=self.use_softermax_xmax, strength=config.strongermax_strength)
+
+        if self.softmax_variant == "polymax":
+            self.softmax_layer = Polymax()
+
         if self.softmax_variant == "sigsoftmax":
             self.softmax_layer = SigSoftmax()
 
diff --git a/pruning.py b/pruning.py
new file mode 100644
index 0000000000..bc6a0f0f15
--- /dev/null
+++ b/pruning.py
@@ -0,0 +1,15 @@
+import numpy as np
+import torch
+# a basic magnitude-based weight pruning for a neural network using PyTorch
+# pruning_rate is a percentile in [0, 100] (passed to np.percentile)
+def magnitude_prune(model, pruning_rate):
+    all_weights = []
+    for name, param in model.named_parameters():
+        if 'weight' in name:
+            all_weights += list(param.cpu().detach().abs().numpy().flatten())
+
+    threshold = np.percentile(np.array(all_weights), pruning_rate)
+
+    for name, param in model.named_parameters():
+        if 'weight' in name:
+            with torch.no_grad():
+                param *= (param.abs() >= threshold).float()
\ No newline at end of file
diff --git a/requirements_cpu.txt b/requirements_cpu.txt
new file mode 100644
index 0000000000..15132a5550
--- /dev/null
+++ b/requirements_cpu.txt
@@ -0,0 +1,77 @@
+absl-py==2.0.0
+aiohttp==3.8.6
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.3
+attrs==23.1.0
+black==23.10.1
+cachetools==5.3.2
+certifi==2022.12.7
+charset-normalizer==2.1.1
+click==8.1.7
+datasets==2.14.6
+dill==0.3.7
+docker-pycreds==0.4.0
+filelock==3.9.0
+frozenlist==1.4.0
+fsspec==2023.4.0
+gitdb==4.0.11
+GitPython==3.1.40
+google-auth==2.23.4
+google-auth-oauthlib==1.1.0
+greenlet==3.0.1
+grpcio==1.59.2
+huggingface-hub==0.17.3
+idna==3.4
+Jinja2==3.1.2
+Markdown==3.5.1
+MarkupSafe==2.1.2
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+multiprocess==0.70.15
+mypy-extensions==1.0.0
+networkx==3.0
+numpy==1.26.1
+oauthlib==3.2.2
+packaging==23.2
+pandas==2.1.2
+pathspec==0.11.2
+pathtools==0.1.2
+Pillow==9.3.0
+platformdirs==3.11.0
+protobuf==4.23.4
+psutil==5.9.6
+pyarrow==14.0.0
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pynvim==0.4.3
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+regex==2023.10.3
+requests==2.28.1
+requests-oauthlib==1.3.1
+rsa==4.9
+safetensors==0.4.0
+sentry-sdk==1.34.0
+setproctitle==1.3.3
+six==1.16.0
+smmap==5.0.1
+sympy==1.12
+tensorboard==2.15.1
+tensorboard-data-server==0.7.2
+tiktoken==0.5.1
+tokenizers==0.14.1
+torch==2.1.0+cpu
+torchaudio==2.1.0+cpu
+torchvision==0.16.0+cpu
+tqdm==4.66.1
+transformers==4.35.0
+typing_extensions==4.4.0
+tzdata==2023.3
+urllib3==1.26.13
+wandb==0.15.12
+Werkzeug==3.0.1
+xxhash==3.4.1
+yarl==1.9.2
diff --git a/sample.py b/sample.py
index 78d8c4e88f..11eec29469 100644
--- a/sample.py
+++ b/sample.py
@@ -7,35 +7,43 @@
 import torch
 import tiktoken
 from model import GPTConfig, GPT
+import argparse
 
-# -----------------------------------------------------------------------------
-init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
-out_dir = 'out' # ignored if init_from is not 'resume'
-start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
-num_samples = 10 # number of samples to draw
-max_new_tokens = 500 # number of tokens generated in each sample
-temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
-top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
-seed = 1337
-device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
-dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
-compile = False # use PyTorch 2.0 to compile the model to be faster
+def parseargs():
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument("--device", type=str, help="device to run inference, e.g. 'cpu' or 'cuda' or 'cuda:0', 'cuda:1', etc...")
+    parser.add_argument("--out_dir", type=str, help="directory to load checkpoint from")
+    parser.add_argument("--init_from", type=str, default="resume", help="Either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')")
+    parser.add_argument("--start", type=str, default="\n", help="\\n or '<|endoftext|>' or etc. Can also specify a file, use as: 'FILE:prompt.txt'")
+    parser.add_argument("--num_samples", type=int, default=3, help="number of inference streams to draw")
+    parser.add_argument("--max_new_tokens", type=int, default=500, help="number of tokens generated in each sample")
+    parser.add_argument("--temperature", type=float, default=0.8, help="1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions")
+    parser.add_argument("--top_k", type=int, default=200, help="retain only the top_k most likely tokens, clamp others to have 0 probability")
+    parser.add_argument("--seed", type=int, default=1337, help="seed for pseudorandom number generator")
+    parser.add_argument("--dtype", type=str, default="bfloat16", choices=["bfloat16", "float16", "float32"], help="torch data type for inference")
+    parser.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction)
+
+    return parser.parse_args()
+
+args = parseargs()
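+# example invocation (flags defined above): python3 sample.py --device=cpu --out_dir=out --num_samples=1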
'int8'") + parser.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction) + + return parser.parse_args() + +args = parseargs() # ----------------------------------------------------------------------------- -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) +torch.manual_seed(args.seed) +torch.cuda.manual_seed(args.seed) torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast -ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] +device_type = 'cuda' if 'cuda' in args.device else 'cpu' # for later use in torch.autocast +ptdtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[args.dtype] ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) # model -if init_from == 'resume': +if args.init_from == 'resume': # init from a model saved in a specific directory - ckpt_path = os.path.join(out_dir, 'ckpt.pt') - checkpoint = torch.load(ckpt_path, map_location=device) + ckpt_path = os.path.join(args.out_dir, 'ckpt.pt') + checkpoint = torch.load(ckpt_path, map_location=args.device) + checkpoint['model_args']['dropout'] = 0.0 # typically we don't want dropout during inference gptconf = GPTConfig(**checkpoint['model_args']) + print(checkpoint['model_args']) model = GPT(gptconf) state_dict = checkpoint['model'] unwanted_prefix = '_orig_mod.' @@ -43,18 +51,18 @@ if k.startswith(unwanted_prefix): state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) model.load_state_dict(state_dict) -elif init_from.startswith('gpt2'): +elif args.init_from.startswith('gpt2'): # init from a given GPT-2 model - model = GPT.from_pretrained(init_from, dict(dropout=0.0)) + model = GPT.from_pretrained(args.init_from, dict(dropout=0.0)) model.eval() -model.to(device) -if compile: +model.to(args.device) +if args.compile: model = torch.compile(model) # requires PyTorch 2.0 (optional) # look for the meta pickle in case it is available in the dataset folder load_meta = False -if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these... +if args.init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these... 
     meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
     load_meta = os.path.exists(meta_path)
 if load_meta:
@@ -73,16 +81,17 @@
 decode = lambda l: enc.decode(l)
 
 # encode the beginning of the prompt
-if start.startswith('FILE:'):
-    with open(start[5:], 'r', encoding='utf-8') as f:
-        start = f.read()
-start_ids = encode(start)
-x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+if args.start.startswith('FILE:'):
+    with open(args.start[5:], 'r', encoding='utf-8') as f:
+        args.start = f.read()
+start_ids = encode(args.start)
+x = (torch.tensor(start_ids, dtype=torch.long, device=args.device)[None, ...])
 
 # run generation
 with torch.no_grad():
     with ctx:
-        for k in range(num_samples):
-            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+        for k in range(args.num_samples):
+            y = model.generate(x, args.max_new_tokens,
+                               temperature=args.temperature, top_k=args.top_k)
             print(decode(y[0].tolist()))
             print('---------------')
diff --git a/train.py b/train.py
index a123e5bef7..e879ab6922 100644
--- a/train.py
+++ b/train.py
@@ -4,6 +4,9 @@
 from datetime import datetime
 import math
 import pickle
+from contextlib import nullcontext
+import argparse
+from pruning import magnitude_prune
 
 import numpy as np
 import torch
@@ -20,79 +23,92 @@
 
 def parse_args():
     parser = argparse.ArgumentParser()
+    # argparse groups
+    model_group = parser.add_argument_group('model_group')
+    training_group = parser.add_argument_group('training_group')
+    logging_group = parser.add_argument_group('logging_group')
+
     # I/O args
-    parser.add_argument('--out_dir', default='out', type=str)
-    parser.add_argument('--eval_interval', default=250, type=int)
-    parser.add_argument('--log_interval', default=10, type=int)
-    parser.add_argument('--eval_iters', default=200, type=int)
-    parser.add_argument('--eval_only', action='store_true')
+    training_group.add_argument('--out_dir', default='out', type=str)
+    training_group.add_argument('--eval_interval', default=250, type=int)
+    training_group.add_argument('--log_interval', default=10, type=int)
+    training_group.add_argument('--eval_iters', default=200, type=int)
+    training_group.add_argument('--eval_only', action='store_true')
 
     # Checkpoint args
-    parser.add_argument('--only_save_checkpoint_at_end', action='store_true')
-    parser.add_argument('--always_save_checkpoint', action='store_true',
-                        default=False)
-    parser.add_argument('--init_from', default='scratch', choices=['scratch', 'resume', 'gpt2*'], type=str)
+    training_group.add_argument('--only_save_checkpoint_at_end', action='store_true')
+    training_group.add_argument('--always_save_checkpoint', action='store_true')
+    training_group.add_argument('--init_from', default='scratch', choices=['scratch', 'resume', 'gpt2*'], type=str)
 
     # Data args
-    parser.add_argument('--dataset', default='shakespeare_char', type=str)
-    parser.add_argument('--gradient_accumulation_steps', default=1, type=int)
-    parser.add_argument('--batch_size', default=64, type=int)
-    parser.add_argument('--block_size', default=256, type=int)
+    training_group.add_argument('--dataset', default='shakespeare_char', type=str)
+    training_group.add_argument('--gradient_accumulation_steps', default=1, type=int)
+    training_group.add_argument('--batch_size', default=64, type=int)
 
     # Model args
-    parser.add_argument('--n_layer', default=6, type=int)
-    parser.add_argument('--n_head', default=6, type=int)
-    parser.add_argument('--n_embd', default=384, type=int)
-    parser.add_argument('--dropout', default=0.2, type=float)
-    parser.add_argument('--bias', action='store_true', default=False)
+    model_group.add_argument('--block_size', default=256, type=int)
+    model_group.add_argument('--n_layer', default=6, type=int)
+    model_group.add_argument('--n_head', default=6, type=int)
+    model_group.add_argument('--n_embd', default=384, type=int)
+    model_group.add_argument('--dropout', default=0.2, type=float)
+    model_group.add_argument('--bias', default=False, action=argparse.BooleanOptionalAction)
+
+    # Norm variations
+    model_group.add_argument('--use_rmsnorm', default=True, action=argparse.BooleanOptionalAction)
 
-    # Model variations
-    parser.add_argument('--use_rmsnorm', action='store_true', default=True)
+    # Softmax variations
+    model_group.add_argument('--use_softmax_variant', default=False, action=argparse.BooleanOptionalAction)
+    model_group.add_argument("--softmax_variant", type=str, default="softermax", choices=["polymax", "strongermax", "softermax", "sigsoftmax", "sigsoftmax_base2"])
+
+    # Custom Softmax Variation Options
+    model_group.add_argument('--use_softermax_xmax', default=False, action=argparse.BooleanOptionalAction)
+    model_group.add_argument("--strongermax_strength", type=int, default=2)
 
     # Optimizer args
-    parser.add_argument('--learning_rate', default=1e-3, type=float)
-    parser.add_argument('--max_iters', default=5000, type=int)
-    parser.add_argument('--weight_decay', default=1e-1, type=float)
-    parser.add_argument('--beta1', default=0.9, type=float)
-    parser.add_argument('--beta2', default=0.99, type=float)
-    parser.add_argument('--grad_clip', default=1.0, type=float)
+    training_group.add_argument('--learning_rate', default=1e-3, type=float)
+    training_group.add_argument('--max_iters', default=5000, type=int)
+    training_group.add_argument('--weight_decay', default=1e-1, type=float)
+    training_group.add_argument('--beta1', default=0.9, type=float)
+    training_group.add_argument('--beta2', default=0.99, type=float)
+    training_group.add_argument('--grad_clip', default=1.0, type=float)
 
     # LR schedule args
-    parser.add_argument('--decay_lr', action='store_true')
-    parser.add_argument('--warmup_iters', default=100, type=int)
-    parser.add_argument('--lr_decay_iters', default=5000, type=int)
-    parser.add_argument('--min_lr', default=1e-4, type=float)
+    training_group.add_argument('--decay_lr', action='store_true')
+    training_group.add_argument('--warmup_iters', default=100, type=int)
+    training_group.add_argument('--lr_decay_iters', default=5000, type=int)
+    training_group.add_argument('--min_lr', default=1e-4, type=float)
 
     # DDP args
-    parser.add_argument('--backend', default='nccl', type=str)
+    training_group.add_argument('--backend', default='nccl', type=str)
 
     # System args
-    parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--dtype', default='float16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16', type=str)
-    parser.add_argument('--compile', action='store_true', default=True)
+    training_group.add_argument('--device', default='cuda', type=str)
+    training_group.add_argument("--dtype", type=str, default="bfloat16", choices=["bfloat16", "float16", "float32"], help="torch data type for training")
+    training_group.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction)
'int8'") + training_group.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction) # Logging args - parser.add_argument('--log_project', default='out-test', type=str) - parser.add_argument('--log_run_name', default='logs-test', type=str) + logging_group.add_argument('--log_project', default='out-test', type=str) + logging_group.add_argument('--log_run_name', default='logs-test', type=str) # Tensorboard args - parser.add_argument('--tensorboard_log', action='store_true', default=True) - parser.add_argument('--tensorboard_log_dir', type=str, default='logs') - parser.add_argument('--tensorboard_project', type=str, default='out-test') - parser.add_argument('--tensorboard_run_name', type=str, default='logs-test') + logging_group.add_argument('--tensorboard_log', default=True, action=argparse.BooleanOptionalAction) + logging_group.add_argument('--tensorboard_log_dir', type=str, default='logs') + logging_group.add_argument('--tensorboard_project', type=str, default='out-test') + logging_group.add_argument('--tensorboard_run_name', type=str, default='logs-test') - # Tensorboard args - parser.add_argument('--wandb_log', action='store_true', default=False) - parser.add_argument('--wandb_project', type=str, default='out-test') - parser.add_argument('--wandb_run_name', type=str, default='logs-test') + # Wandb args + logging_group.add_argument('--wandb_log', default=False, action=argparse.BooleanOptionalAction) + logging_group.add_argument('--wandb_project', type=str, default='out-test') + logging_group.add_argument('--wandb_run_name', type=str, default='logs-test') args = parser.parse_args() - return args + return args, model_group, training_group, logging_group class Trainer: - def __init__(self, args): + def __init__(self, args, model_group): self.args = args + self.model_group = model_group self.setup() def setup(self): @@ -126,7 +142,7 @@ def setup(self): torch.backends.cudnn.allow_tf32 = True self.device_type = 'cuda' if 'cuda' in self.args.device else 'cpu' - self.ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[self.args.dtype] + self.ptdtype = {"bfloat16" : torch.bfloat16, "float16" : torch.float16, "float32" : torch.float32}[self.args.dtype] self.ctx = nullcontext() if self.device_type == 'cpu' else torch.amp.autocast(device_type=self.device_type, dtype=self.ptdtype) # Data loader @@ -140,9 +156,10 @@ def setup(self): self.meta_vocab_size = meta['vocab_size'] # Model - self.model_args = dict(n_layer=self.args.n_layer, n_head=self.args.n_head, n_embd=self.args.n_embd, - block_size=self.args.block_size, bias=self.args.bias, vocab_size=None, - dropout=self.args.dropout, use_rmsnorm=self.args.use_rmsnorm) + # TODO only add if they are defined from the argparse + self.model_args = {action.dest: getattr(self.args, action.dest) for action in self.model_group._group_actions} + print(self.model_args) + self.model_args['vocab_size'] = None if self.args.init_from == 'scratch': self.model_args['vocab_size'] = self.meta_vocab_size if self.meta_vocab_size is not None else 50304 @@ -194,7 +211,9 @@ def setup(self): # Tensorboard if self.args.tensorboard_log: - timestamp = time.strftime("%Y%m%d-%H%M%S") + timestamp = time.strftime("%Y%m%d-%H%M%S" + "_" + + self.args.tensorboard_project + "_" + + self.args.tensorboard_run_name) log_subpath = os.path.join(self.args.tensorboard_log_dir, timestamp) self.writer = SummaryWriter(log_subpath) @@ -348,9 +367,8 @@ def train(self): wandb.finish() def main(): - args = parse_args() - print(args.device) - 
-    trainer = Trainer(args)
+    args, model_group, _, _ = parse_args()
+    trainer = Trainer(args, model_group)
     trainer.train()
 
     if trainer.ddp: