diff --git a/.github/workflows/cpu-basic-install-prepare-train-inf-test.yml b/.github/workflows/cpu-basic-install-prepare-train-inf-test.yml
new file mode 100644
index 0000000000..39d2318941
--- /dev/null
+++ b/.github/workflows/cpu-basic-install-prepare-train-inf-test.yml
@@ -0,0 +1,34 @@
+name: Basic PyTorch Installation, Data Prep, CPU Training, CPU Inference
+on: [push, pull_request]
+jobs:
+  Install-Dependencies_Data-Prep_CPU-Training_CPU-Inference:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v4
+      - run: echo "${{ github.repository }} repository has been cloned to the runner."
+      - run: echo "Currently on ${{ github.ref }} branch"
+      - name: List directory contents
+        run: |
+          ls ${{ github.workspace }}
+      # Caching pip dependencies
+      - name: Cache pip dependencies
+        uses: actions/cache@v3
+        with:
+          path: ~/.cache/pip
+          key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements_cpu.txt') }}
+          restore-keys: |
+            ${{ runner.os }}-pip-
+      - name: Install CPU Dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+          python3 -m pip install -r requirements_cpu.txt
+      - name: Run Small Network on CPU
+        run: |
+          python3 data/shakespeare_char/prepare.py
+          python3 train.py --out_dir=out --device=cpu --eval_interval=2 --log_interval=1 --block_size=2 --batch_size=2 --n_layer=2 --n_head=2 --n_embd=16 --max_iters=3 --lr_decay_iters=2 --dropout=0.0
+      - name: Run CPU Inference
+        run: |
+          python3 sample.py --device=cpu --out_dir="out"
+
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000000..dbc4bfac00
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,11 @@
+# folders
+__pycache__/
+logs/
+
+# file extensions
+*.pkl
+*.bin
+*.txt
+
+# checkpoint directories
+out*/
diff --git a/HW/SA/Softer_max/LUT.sv b/HW/SA/Softer_max/LUT.sv
new file mode 100644
index 0000000000..97002e2408
--- /dev/null
+++ b/HW/SA/Softer_max/LUT.sv
@@ -0,0 +1,29 @@
+// a LUT used for the pow2 unit; this LUT will be stored in an on-chip memory in the future
+// this LUT contains values from 2^(0/16) to 2^(15/16)
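+// Each entry stores round(2^(n/16) * 2^4), i.e. the fractional power of two in
+// Q3.4 fixed point; e.g. n=1: 2^(1/16) ~= 1.0443, times 16 ~= 16.7, which rounds
+// to 17 = 8'b00010001.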
+module LUT(
+    input logic signed [`FRAC-1:0] index,
+    output logic signed [`DATA_SIZE-1:0] out
+);
+    always_comb begin
+        // [2^(n/16)]*2^4, rounded to the nearest integer
+        out = 8'b0;
+        case(index)
+            4'b0000 : out = 8'b00010000;
+            4'b0001 : out = 8'b00010001;
+            4'b0010 : out = 8'b00010001;
+            4'b0011 : out = 8'b00010010;
+            4'b0100 : out = 8'b00010011;
+            4'b0101 : out = 8'b00010100;
+            4'b0110 : out = 8'b00010101;
+            4'b0111 : out = 8'b00010110;
+            4'b1000 : out = 8'b00010111;
+            4'b1001 : out = 8'b00011000;
+            4'b1010 : out = 8'b00011001;
+            4'b1011 : out = 8'b00011010;
+            4'b1100 : out = 8'b00011011;
+            4'b1101 : out = 8'b00011100;
+            4'b1110 : out = 8'b00011101;
+            4'b1111 : out = 8'b00011111;
+        endcase
+    end
+endmodule
\ No newline at end of file
diff --git a/HW/SA/Softer_max/Pow2_tb.sv b/HW/SA/Softer_max/Pow2_tb.sv
new file mode 100644
index 0000000000..a038735b73
--- /dev/null
+++ b/HW/SA/Softer_max/Pow2_tb.sv
@@ -0,0 +1,69 @@
+//The testbench for the Pow2 unit
+module Pow_tb();
+    logic clk;
+    logic signed [`DATA_SIZE-1:0] current_max;
+    logic signed [`DATA_SIZE-1:0] input_vector;
+    logic signed [`LARGE_SIZE:0] uSoftmax;
+    integer write_file0, write_file1, write_file2;
+    initial begin
+
+        write_file0 = $fopen("Pow_data.txt", "w");
+        write_file1 = $fopen("true_pow_data.txt", "w");
+        write_file2 = $fopen("Pow_data2.txt", "w");
+    end
+
+    Pow2 Pow2 (
+        .current_max,
+        .input_vector,
+        .uSoftmax //UnnormedSoftmax
+    );
+    int counter;
+    real true_out, xj, uSoftmax_floating;
+    always #5 clk = ~clk;
+
+    //initialize "current_max", could also use rand()
+    always #10 begin
+        counter = counter + 1;
+        if(counter == 5) begin
+            current_max = 0;
+            counter = 0;
+        end
+        else begin
+            current_max = current_max+1;
+        end
+    end
+
+    //initialize input vectors
+    always #5 begin
+        input_vector[`DATA_SIZE-2:`FRAC] = input_vector[`DATA_SIZE-2:`FRAC] + 1;
+        input_vector[`FRAC-1:0] = input_vector[`FRAC-1:0] + 1;
+    end
+
+    //calculate the expected value
+    always #5 begin
+        xj = input_vector[`DATA_SIZE-2:`FRAC] + (input_vector[`FRAC-1:0]/16.0);
+        true_out = 2**(xj-current_max+0.0);
+    end
+
+    //convert the output from fixed-point 8 to a real number
+    always #5 begin
+        uSoftmax_floating = uSoftmax[`LARGE_SIZE-2:`FRAC] + uSoftmax[`FRAC-1:0]*1.0/16.0;
+    end
+
+    //write the outputs into output files
+    always #5 begin
+        $fdisplay(write_file0,"%0d\t",uSoftmax);
+        $fdisplay(write_file1,"%0f\t",true_out);
+        $fdisplay(write_file2,"%0f\t",uSoftmax_floating);
+    end
+
+    //test sequence
+    initial begin
+        clk = 0;
+        counter = 0;
+        current_max = 0;
+        input_vector = 5;
+        #200;
+        $finish;
+    end
+endmodule
\ No newline at end of file
diff --git a/HW/SA/Softer_max/Pow2_unit.sv b/HW/SA/Softer_max/Pow2_unit.sv
new file mode 100644
index 0000000000..d08e0f19ea
--- /dev/null
+++ b/HW/SA/Softer_max/Pow2_unit.sv
@@ -0,0 +1,30 @@
+//perform 2^(xj-localMax)
+//To do: modify output data size
+//To do: explore floating point 8 representation
+module Pow2 (
+    input logic signed [`DATA_SIZE-1:0] current_max,
+    input logic signed [`DATA_SIZE-1:0] input_vector,
+    output logic signed [`LARGE_SIZE:0] uSoftmax //UnnormedSoftmax
+);
+    logic signed [`DATA_SIZE-1:0] pow2_frac;
+    logic signed [`LARGE_SIZE-1:0] FP_1;
+
+    // FP_1 = 1.0 in fixed point (only the 2^0 bit of the Q3.4 value is set)
+    always_comb begin
+        FP_1 = 0;
+        FP_1[`FRAC] = 1;
+    end
+
+    //return [2^(n/16)]*2^4 and then round to integer
+    LUT LUT0 (
+        .index(input_vector[`FRAC-1:0]),
+        .out(pow2_frac)
+    );
+
+    //for debug
+    logic signed [`DATA_SIZE-1-`FRAC:0] temp;
+    assign temp = input_vector [`DATA_SIZE-1:`FRAC];
+    //2^[(int xj)-localMax]*2^(frac+4)/2^4
+    //8'b01000000, 2 in FixedP8
+    assign uSoftmax = ((FP_1 <<< (temp - current_max)) * pow2_frac) >>> `FRAC;
+endmodule
\ No newline at end of file
diff --git a/HW/SA/Softer_max/README.md b/HW/SA/Softer_max/README.md
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/HW/SA/Softer_max/help.py b/HW/SA/Softer_max/help.py
new file mode 100644
index 0000000000..9c66630a58
--- /dev/null
+++ b/HW/SA/Softer_max/help.py
@@ -0,0 +1,44 @@
+# a helper Python script to compare the expected outputs of the Pow2 unit with its real outputs
+# files are generated by Pow2_tb.sv
+def fp8_to_real(value):
+    # FP8(3,4) has:
+    # - 3 integer bits
+    # - 4 fractional bits
+    # - Signed format
+
+    # Extract sign bit (not handled here)
+
+    # Extract integer part
+    integer = value >> 4
+
+    # Extract fraction part
+    fraction = value & 0b00001111
+
+    # Compute real value
+    real_value = (integer + fraction / 16)
+
+    return real_value
+
+def compare_files(file1_path, file2_path):
+    differences = []
+    with open(file1_path, 'r') as file1, open(file2_path, 'r') as file2:
+        for line1, line2 in zip(file1, file2):
+            try:
+                num1 = float(line1.strip())
+                num2 = float(line2.strip())
+                #num1 = fp8_to_real(num1)
+                difference = abs(num1 - num2)
+                differences.append(difference)
+            except ValueError:
+                # Handle the case where a line does not contain a valid number
+                differences.append(None)
+
+    return differences
+
+# Example usage
+file1_path = 'Pow_data2.txt'
+file2_path = 'true_pow_data.txt'
+differences = compare_files(file1_path, file2_path)
+print(differences)
+# skip lines that failed to parse (recorded as None) when averaging
+valid = [d for d in differences if d is not None]
+print(f"average diff = {sum(valid)/len(valid)}")
+#print(f"average diff after scale down by 2^4 is: {(sum(valid)/len(valid))/2**4}")
diff --git a/data/csv_data/.gitignore b/data/csv_data/.gitignore
new file mode 100644
index 0000000000..73ff7cde94
--- /dev/null
+++ b/data/csv_data/.gitignore
@@ -0,0 +1,3 @@
+*.csv
+*.bin
+*.pkl
diff --git a/data/csv_data/README.md b/data/csv_data/README.md
new file mode 100644
index 0000000000..7104922e50
--- /dev/null
+++ b/data/csv_data/README.md
@@ -0,0 +1,79 @@
+# Data-Shuffler for ML Permutation Invariance
+
+These Python scripts process time-series data from a CSV file to add permutation
+invariance to the CSV's data fields.
+
+Each row in the CSV becomes a single line in the text file, with each cell
+represented by a unique lowercase letter (starting from 'a') followed by the
+value from the cell.
+
+A command-line flag optionally shuffles the letter-value pairs in each line.
+
+Training on data processed with the shuffle option creates a form of
+in-frame permutation invariance.
+
+During inference, this gives the data freedom to move around, unlocking
+capabilities otherwise unavailable to fixed-frame-trained networks.
+
+For example, one can run a beam search over the labels to determine which
+letter-value pair the model is most certain of in the current frame, then
+build up the next frame incrementally.
+
+## Getting Started
+
+### Prerequisites
+
+- Python (3.6 or above)
+
+## Usage
+
+1. Separate your file into one with timestamp columns and one with data columns.
+
+2. Navigate to the directory where the script is located and run process_csv.py
+   on your data-column file:
+
+```sh
+python3 process_csv.py <data_columns.csv> <labeled_output.csv> --shuffle --exclude e
+```
+
+3. Recombine the output file from process_csv.py with the time column data.
+
+```sh
+python3 combine_csvs.py <time_columns.csv> <labeled_output.csv> <combined_output.csv>
+```
+
+4. Prepare the processed data file for training:
+
+```sh
+python3 prepare.py -i <combined_output.csv>
+```
+
+5. `cd` to the `explorations` folder, and utilize the script to run training:
+
+```sh
+cd ../../explorations
+bash run_csv_data_training.sh
+```
+
+6. [Optional] Create an exploration script to test training and inference with
+   and without shuffling.
+
+### Arguments
+
+- `input_file`: The path to the input CSV file containing time-series data.
+- `output_file`: The path to the output text file.
+- `--shuffle`: Optional flag to shuffle the order of letter-value pairs in each line.
+- `--exclude`: Optional flag to exclude letters already used by the dataset
+  (e.g. `e` for scientific notation) from the labeling.
+
+### Example
+
+For a full example see the `main.sh` script on generated sine + noise data.
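+
+As an illustration (the numeric values below are invented), a three-column CSV
+row such as:
+
+```
+9.96e-01,2.62e-01,1.26e+00
+```
+
+becomes `a9.96e-01b2.62e-01c1.26e+00` without `--shuffle`, or one of its
+permutations, such as `b2.62e-01a9.96e-01c1.26e+00`, with `--shuffle`.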
+
+## License
+
+This project is licensed under the MIT License
+
diff --git a/data/csv_data/combine_csvs.py b/data/csv_data/combine_csvs.py
new file mode 100644
index 0000000000..9427a01aef
--- /dev/null
+++ b/data/csv_data/combine_csvs.py
@@ -0,0 +1,30 @@
+import csv
+import argparse
+
+def combine_csv_columns(file_path1, file_path2, output_file_path):
+    with open(file_path1, 'r') as file1, open(file_path2, 'r') as file2, open(output_file_path, 'w', newline='') as outfile:
+        reader1 = csv.reader(file1)
+        reader2 = csv.reader(file2)
+
+        for row1, row2 in zip(reader1, reader2):
+            # Combine the rows from both CSVs
+            combined_row = row1 + row2
+
+            # Join the row with no delimiter and write it to the file
+            outfile.write(''.join(combined_row))
+
+            # Write a newline character after each row
+            outfile.write('\n')
+
+def main(args):
+    combine_csv_columns(args.file1, args.file2, args.output)
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Combine columns of two CSV files.')
+    parser.add_argument('file1', type=str, help='Path to the first input CSV file.')
+    parser.add_argument('file2', type=str, help='Path to the second input CSV file.')
+    parser.add_argument('output', type=str, help='Path to the output CSV file.')
+
+    args = parser.parse_args()
+    main(args)
+
diff --git a/data/csv_data/main.sh b/data/csv_data/main.sh
new file mode 100644
index 0000000000..4d4ce676d8
--- /dev/null
+++ b/data/csv_data/main.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# This is an example using generated data to demonstrate:
+# 1. labelling of different time series data
+# 2. shuffling of different fields with their label
+
+set -x
+
+# CREATE SYNTHETIC DATA -- Skip if using your own data
+# This generator creates two csvs (if `--split` is specified):
+# 1. time data called: `time_filename.csv`
+# 2. signal data called: `data_filename.csv`
+python3 sine_noise_generator.py --noise_level 0.3 --filename sine_data.csv --scientific --precision 2 --modulo 1000 --points 1000000 --split
+
+set +x
+echo -e "\nPreview: Generated Times"
+head time_sine_data.csv
+
+echo -e "\n\nPreview: Generated Data"
+head data_sine_data.csv
+echo -e "\n\n"
+set -x
+
+# Use a data-only csv (no timestamps), in this case `data_sine_data.csv`
+# This script does two things:
+# 1. _prepend_ labels to the data
+# 2. (optionally) shuffle the data
+# Since 'e' is used for scientific notation, skip this letter when doing labelling
+python3 process_csv.py data_sine_data.csv sine_noise_sn_shuffled.csv --shuffle --exclude e
+
+# preview the result
+set +x
+echo -e "\nPreview: Shuffled Data"
+head sine_noise_sn_shuffled.csv
+echo -e "\n\n"
+set -x
+
+# recombine
+python3 combine_csvs.py time_sine_data.csv sine_noise_sn_shuffled.csv processed_sine_data.csv
+
+set +x
+echo -e "\nPreview: Timestamps with Shuffled Data"
+head processed_sine_data.csv
diff --git a/data/csv_data/prepare.py b/data/csv_data/prepare.py
new file mode 100644
index 0000000000..68d9831572
--- /dev/null
+++ b/data/csv_data/prepare.py
@@ -0,0 +1,69 @@
+import os
+import pickle
+import numpy as np
+import argparse
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-i", "--input_file", required=True)
+    parser.add_argument("-t", "--token_file", default=None)
+    return parser.parse_args()
+
+
+def process_data(input_file, token_file):
+    with open(input_file, "r") as f:
+        data = f.read()
+
+    if token_file is not None:
+        with open(token_file, "r") as f:
+            token_data = f.read()
+    else:
+        token_data = data
+
+    print(f"Length of dataset: {len(data):,}")
+
+    chars = sorted(list(set(token_data)))
+    vocab_size = len(chars)
+
+    print(f"Unique chars: {''.join(chars)}")
+    print(f"Vocab size: {vocab_size:,}")
+
+    stoi = {ch: i for i, ch in enumerate(chars)}
+    itos = {i: ch for i, ch in enumerate(chars)}
+
+    def encode(s):
+        return [stoi[c] for c in s]
+
+    def decode(l):
+        return "".join([itos[i] for i in l])
+
+    n = len(data)
+    train_data = data[: int(n * 0.9)]
+    val_data = data[int(n * 0.9) :]
+
+    train_ids = encode(train_data)
+    val_ids = encode(val_data)
+
+    print(f"Train tokens: {len(train_ids):,}")
+    print(f"Val tokens: {len(val_ids):,}")
+
+    return train_ids, val_ids, stoi, itos
+
+
+def save_data(train_ids, val_ids, stoi, itos, output_dir):
+    train_ids = np.array(train_ids, dtype=np.uint16)
+    val_ids = np.array(val_ids, dtype=np.uint16)
+
+    train_ids.tofile(os.path.join(output_dir, "train.bin"))
+    val_ids.tofile(os.path.join(output_dir, "val.bin"))
+
+    meta = {"vocab_size": len(stoi), "itos": itos, "stoi": stoi}
+    with open(os.path.join(output_dir, "meta.pkl"), "wb") as f:
+        pickle.dump(meta, f)
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    train_ids, val_ids, stoi, itos = process_data(args.input_file, args.token_file)
+    save_data(train_ids, val_ids, stoi, itos, ".")
diff --git a/data/csv_data/process_csv.py b/data/csv_data/process_csv.py
new file mode 100644
index 0000000000..fe279bc2b5
--- /dev/null
+++ b/data/csv_data/process_csv.py
@@ -0,0 +1,49 @@
+import csv
+import argparse
+import random
+
+def create_letter_mapping(exclude: list) -> dict:
+    # Create a mapping of indices to letters, skipping excluded letters.
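+    # e.g. exclude=['e'] maps column indices 0,1,2,3,4,... to 'a','b','c','d','f',...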
+    allowed_letters = [chr(i) for i in range(ord('a'), ord('z') + 1) if chr(i) not in exclude]
+    return {i: letter for i, letter in enumerate(allowed_letters)}
+
+def process_csv(input_file: str, output_file: str, shuffle: bool, exclude: list) -> None:
+    # Create the letter mapping
+    letter_mapping = create_letter_mapping(exclude)
+
+    with open(input_file, mode="r") as csv_file, open(output_file, mode="w") as txt_file:
+        csv_reader = csv.reader(csv_file)
+
+        for row in csv_reader:
+            # Use the letter mapping to assign letters to values
+            letter_value_pairs = [
+                f"{letter_mapping[i]}{val}" for i, val in enumerate(row) if i in letter_mapping
+            ]
+
+            if shuffle:
+                random.shuffle(letter_value_pairs)
+
+            # Join the letter-value pairs with no spaces and write to the output file.
+            txt_file.write("".join(letter_value_pairs) + "\n")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Process a time-series CSV and convert it to a custom text format while excluding certain letters."
+    )
+    parser.add_argument("input_file", type=str, help="Path to the input CSV file.")
+    parser.add_argument("output_file", type=str, help="Path to the output text file.")
+    parser.add_argument(
+        "--shuffle",
+        action="store_true",
+        help="Whether to shuffle the order of letter and value pairs.",
+    )
+    parser.add_argument(
+        "--exclude",
+        nargs="*",
+        default=[],
+        help="A list of letters to exclude from the letter labeling.",
+    )
+
+    args = parser.parse_args()
+    process_csv(args.input_file, args.output_file, args.shuffle, args.exclude)
+
diff --git a/data/csv_data/sine_noise_generator.py b/data/csv_data/sine_noise_generator.py
new file mode 100644
index 0000000000..5d9b4d0093
--- /dev/null
+++ b/data/csv_data/sine_noise_generator.py
@@ -0,0 +1,98 @@
+import argparse
+import numpy as np
+import pandas as pd
+
+# Function to generate sine wave with noise
+def generate_sine_wave_with_noise(freq, sample_rate, num_points, noise_level):
+    t = np.arange(num_points) # Time axis with integer values starting from 0
+    sine_wave = np.sin(2 * np.pi * freq * t / sample_rate)
+    noise = noise_level * np.random.normal(size=num_points)
+    combined_signal = sine_wave + noise
+    return t, sine_wave, noise, combined_signal
+
+# Function to format data as scientific notation if required
+def format_data_as_scientific(df, precision):
+    return df.applymap(lambda x: f"{x:.{precision}e}")
+
+# Function to save the data to CSV
+def save_to_csv(time, signal, noise, combined_signal, filename, scientific, precision, split_files, modulo):
+    # Apply modulo if necessary
+    if modulo is not None:
+        time = time % modulo
+
+    # Create the DataFrame
+    data = {
+        'signal': signal,
+        'noise': noise,
+        'signal_plus_noise': combined_signal
+    }
+
+    if split_files:
+        # Save time data to a separate CSV
+        time_df = pd.DataFrame({'seconds_from_start': time})
+        if scientific:
+            time_df = format_data_as_scientific(time_df, precision)
+        time_df.to_csv(f"time_{filename}", header=False, index=False)
+
+        # Save data to a separate CSV
+        data_df = pd.DataFrame(data)
+        if scientific:
+            data_df = format_data_as_scientific(data_df, precision)
+        data_df.to_csv(f"data_{filename}", header=False, index=False)
+    else:
+        # Combine time and data for a single CSV
+        df = pd.DataFrame({'seconds_from_start': time, **data})
+        if scientific:
+            df = format_data_as_scientific(df, precision)
+        df.to_csv(filename, header=False, index=False)
+
+# Parse command-line arguments
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Generate a sine wave with noise and export to CSV.')
+    parser.add_argument('-n', '--noise_level', type=float, default=0.5,
+                        help='Level of noise relative to the sine wave (0-1).')
+    parser.add_argument('-f', '--filename', type=str, default='sine_wave.csv',
+                        help='Name of the output CSV file.')
+    parser.add_argument('--scientific', action='store_true',
+                        help='Output numbers in scientific notation.')
+    parser.add_argument('--precision', type=int, default=2,
+                        help='Number of digits past the decimal point in scientific notation.')
+    parser.add_argument('--points', type=int, default=5000,
+                        help='Total number of data points to be created.')
+    parser.add_argument('--split', action='store_true',
+                        help='Save time data and signal data in separate CSV files.')
+    parser.add_argument('--modulo', type=int,
+                        help='Modulo value to apply to the time data.')
+    args = parser.parse_args()
+    if not (0 <= args.noise_level <= 1):
+        raise ValueError('Noise level must be between 0 and 1.')
+    if args.precision < 0:
+        raise ValueError('Precision must be a non-negative integer.')
+    if args.points <= 0:
+        raise ValueError('Number of data points must be a positive integer.')
+    return args
+
+def main():
+    args = parse_arguments()
+
+    # Parameters for sine wave generation
+    frequency = 5 # Frequency in Hz
+    sample_rate = 500 # Sample rate in Hz
+    num_points = args.points # Total number of data points
+
+    # Generate the sine wave with noise
+    time, sine_wave, noise, combined_signal = generate_sine_wave_with_noise(
+        frequency, sample_rate, num_points, args.noise_level
+    )
+
+    # Save to CSV file(s)
+    save_to_csv(time, sine_wave, noise, combined_signal, args.filename, args.scientific, args.precision, args.split, args.modulo)
+    if args.split:
+        print(f"Time data saved to time_{args.filename}")
+        print(f"Signal data saved to data_{args.filename}")
+    else:
+        print(f"Sine wave data with noise saved to {args.filename}")
+
+if __name__ == '__main__':
+    main()
+
diff --git a/data/experiments/prepare.py b/data/experiments/prepare.py
new file mode 100644
index 0000000000..cbc61c763b
--- /dev/null
+++ b/data/experiments/prepare.py
@@ -0,0 +1,100 @@
+"""
+Prepare the Shakespeare dataset for character-level language modeling.
+So instead of encoding with GPT-2 BPE tokens, we just map characters to ints.
+Will save train.bin, val.bin containing the ids, and meta.pkl containing the
+encoder and decoder and some other related info.
+Supports tiny Shakespeare (--choice 0, default) or the TinyStories phonology
+dataset from Hugging Face (--choice 1).
+""" +import os +import pickle +import requests +import numpy as np +import argparse +from datasets import load_dataset +from pathlib import Path + +parser = argparse.ArgumentParser(description="Select training model, little Shakespeare or tinystories") +parser.add_argument('--choice', type=int, default=0, help="0 for little Shakespeare, 1 for tinystories") + +args = parser.parse_args() +choice = args.choice + +input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt') +#load data from hugging face +if choice == 1: + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + if not os.path.exists(data_dir / "full.txt"): + dataset = load_dataset("msaligane/tinystories_phonology", split="train") + full_text = "" + for i, example in enumerate(dataset): + filename = f"tinystoryP{i:02d}.txt" + filepath = data_dir / filename + + with open(filepath, "w") as f: + f.write(example["text"]) + + full_text += example["text"] + "\n" + + with open(data_dir / "full.txt", "w") as f: + f.write(full_text) + #get data from + with open(data_dir / "full.txt", 'r') as f: + data = f.read() +# download the tiny shakespeare dataset +elif choice == 0: + + if not os.path.exists(input_file_path): + data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' + with open(input_file_path, 'w') as f: + f.write(requests.get(data_url).text) + + with open(input_file_path, 'r') as f: + data = f.read() +print(f"length of dataset in characters: {len(data):,}") + +# get all the unique characters that occur in this text +chars = sorted(list(set(data))) +vocab_size = len(chars) +print("all the unique characters:", ''.join(chars)) +print(f"vocab size: {vocab_size:,}") + +# create a mapping from characters to integers +stoi = { ch:i for i,ch in enumerate(chars) } +itos = { i:ch for i,ch in enumerate(chars) } +def encode(s): + return [stoi[c] for c in s] # encoder: take a string, output a list of integers +def decode(l): + return ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string + +# create the train and test splits +n = len(data) +train_data = data[:int(n*0.9)] +val_data = data[int(n*0.9):] + +# encode both to integers +train_ids = encode(train_data) +val_ids = encode(val_data) +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) + +# save the meta information as well, to help us encode/decode later +meta = { + 'vocab_size': vocab_size, + 'itos': itos, + 'stoi': stoi, +} +with open(os.path.join(os.path.dirname(__file__), 'meta.pkl'), 'wb') as f: + pickle.dump(meta, f) + +# length of dataset in characters: 1115394 +# all the unique characters: +# !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz +# vocab size: 65 +# train has 1003854 tokens +# val has 111540 tokens \ No newline at end of file diff --git a/data/experiments/prepare_tiktoken.py b/data/experiments/prepare_tiktoken.py new file mode 100644 index 0000000000..97503d1524 --- /dev/null +++ b/data/experiments/prepare_tiktoken.py @@ -0,0 +1,66 @@ +import os +import requests +import tiktoken +import numpy as np +import argparse +from datasets import load_dataset +from pathlib import Path + +parser = argparse.ArgumentParser(description="Select training model, little Shakespeare or 
tinystories") +parser.add_argument('--choice', type=int, default=0, help="0 for little Shakespeare, 1 for tinystories") + +args = parser.parse_args() +choice = args.choice + +input_file_path = os.path.join(os.path.dirname(__file__), 'input.txt') +#load data from hugging face +if choice == 1: + data_dir = Path("data") + data_dir.mkdir(exist_ok=True) + if not os.path.exists(data_dir / "full.txt"): + dataset = load_dataset("msaligane/tinystories_phonology", split="train") + full_text = "" + for i, example in enumerate(dataset): + filename = f"tinystoryP{i:02d}.txt" + filepath = data_dir / filename + + with open(filepath, "w") as f: + f.write(example["text"]) + + full_text += example["text"] + "\n" + + with open(data_dir / "full.txt", "w") as f: + f.write(full_text) + #get data from + with open(data_dir / "full.txt", 'r') as f: + data = f.read() +# download the tiny shakespeare dataset +elif choice == 0: + # download the tiny shakespeare dataset + if not os.path.exists(input_file_path): + data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt' + with open(input_file_path, 'w') as f: + f.write(requests.get(data_url).text) + + with open(input_file_path, 'r') as f: + data = f.read() + +n = len(data) +train_data = data[:int(n*0.9)] +val_data = data[int(n*0.9):] + +# encode with tiktoken gpt2 bpe +enc = tiktoken.get_encoding("gpt2") +train_ids = enc.encode_ordinary(train_data) +val_ids = enc.encode_ordinary(val_data) +print(f"train has {len(train_ids):,} tokens") +print(f"val has {len(val_ids):,} tokens") + +# export to bin files +train_ids = np.array(train_ids, dtype=np.uint16) +val_ids = np.array(val_ids, dtype=np.uint16) +train_ids.tofile(os.path.join(os.path.dirname(__file__), 'train.bin')) +val_ids.tofile(os.path.join(os.path.dirname(__file__), 'val.bin')) + +# train.bin has 301,966 tokens +# val.bin has 36,059 tokens diff --git a/data/experiments/readme.md b/data/experiments/readme.md new file mode 100644 index 0000000000..32e12999f5 --- /dev/null +++ b/data/experiments/readme.md @@ -0,0 +1,24 @@ + +# tiny stories +Users who prefer to upload a dataset programmatically can use the huggingface_hub library. This library allows users to interact with the Hub from Python. 
+
+When running prepare.py or prepare_tiktoken.py, you will be asked to choose a
+dataset: 0 for tiny Shakespeare, 1 for tinystories.
+
+Clear generated files before rerunning prepare.py or prepare_tiktoken.py.
+
+When running with config/train_shakespeare_char.py, remember to change the following params:
+```
+wandb_project = 'experiments'
+wandb_run_name = 'mini-gpt'
+dataset = 'experiments'
+```
+
diff --git a/data/shakespeare_char/prepare.py b/data/shakespeare_char/prepare.py
index 9fd1621d55..a42dff5a0e 100644
--- a/data/shakespeare_char/prepare.py
+++ b/data/shakespeare_char/prepare.py
@@ -65,4 +65,4 @@ def decode(l):
 # !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
 # vocab size: 65
 # train has 1003854 tokens
-# val has 111540 tokens
+# val has 111540 tokens
\ No newline at end of file
diff --git a/explorations/run_csv_data_training.sh b/explorations/run_csv_data_training.sh
new file mode 100644
index 0000000000..209d6f364b
--- /dev/null
+++ b/explorations/run_csv_data_training.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+cd ../
+python3 train.py \
+    --max_iters 3000 \
+    --dataset csv_data \
+    --tensorboard_project csv_data \
+    --tensorboard_run_name csv_data
+
diff --git a/model.py b/model.py
index 5b04ecff80..bfbb11d303 100644
--- a/model.py
+++ b/model.py
@@ -52,6 +52,73 @@ def forward(self, x):
         e_x = torch.pow(2.0, x)
         return e_x / e_x.sum(dim=self.dim, keepdim=True)
 
+# Softmax base 2, with constant denominator, and option to remove max subtraction
+class Constantmax(nn.Module):
+    """ Base-2 Softmax with option to remove max subtraction"""
+    def __init__(self, dim=-1, subtract_max=True, constant=1000):
+        super().__init__()
+        self.dim = dim
+        self.subtract_max = subtract_max
+        self.constant = constant
+
+    def forward(self, x):
+        if self.subtract_max:
+            max_x = x.max(dim=self.dim, keepdim=True).values
+            x = x - max_x
+        e_x = torch.pow(2.0, x)
+        return e_x / self.constant
+
+# Like softermax, but parameterized to permit exploration of bases greater than 2
+class Strongermax(nn.Module):
+    """ Softmax with parameterized base ('strength') and option to remove max subtraction"""
+    def __init__(self, dim=-1, subtract_max=True, strength=2):
+        super().__init__()
+        self.strength = strength
+        self.dim = dim
+        self.subtract_max = subtract_max
+
+    def forward(self, x):
+        if self.subtract_max:
+            max_x = x.max(dim=self.dim, keepdim=True).values
+            x = x - max_x
+        e_x = torch.pow(self.strength, x)
+        return e_x / e_x.sum(dim=self.dim, keepdim=True)
+
+# Polynomial estimate of Softmax
+class Polymax(nn.Module):
+    def __init__(self, x_intercept=-100, y_intercept=1, power=2, divisor=1000.0):
+        super().__init__()
+
+        assert(x_intercept < 0) # ensure the x-intercept is strictly left of the y-axis
+        self.x_intercept = x_intercept # where to transition from y=0 to m*x+b
+        self.y_intercept = y_intercept # where the graph crosses the y-axis
+
+        self.power = power
+        self.divisor = divisor
+
+        # TODO: create single location for printing the model settings
+        print("Use polymax")
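+    # Continuity note: with m = -y_intercept/x_intercept (see forward below), the
+    # linear piece is 0 at x = x_intercept, matching the flat piece, and equals
+    # y_intercept at x = 0, matching the polynomial piece (0**power + y_intercept).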
+
+    def forward(self, x):
+        # Overview:
+        # Flat section:       -inf < x < x_intercept
+        # Linear section:     x_intercept <= x <= 0
+        # Polynomial section: 0 < x < inf
+
+        # Flat section
+        flat_piece = torch.where(x < self.x_intercept, torch.tensor(0.0, device=x.device), torch.tensor(0.0, device=x.device))
+
+        # Linear section
+        m = -self.y_intercept/self.x_intercept # slope of the line through (x_intercept, 0) and (0, y_intercept); requires x_intercept != 0
+        b = self.y_intercept
+        linear_piece = torch.where((x >= self.x_intercept) & (x <= 0), m * x + b, torch.tensor(0.0, device=x.device))
+
+        # Polynomial section
+        poly_piece = torch.where(x > 0, x**self.power + self.y_intercept, torch.tensor(0.0, device=x.device))
+
+        # Combine sections
+        return (poly_piece + linear_piece + flat_piece)/self.divisor
+
 # SigSoftmax from https://arxiv.org/abs/1805.10829
 class SigSoftmax(nn.Module):
     def __init__(self):
@@ -112,6 +179,18 @@ def __init__(self, config):
             self.use_softermax_xmax = config.use_softermax_xmax
             self.softmax_layer = Softermax(subtract_max=self.use_softermax_xmax)
 
+        if self.softmax_variant == "constantmax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.constantmax_constant = config.constantmax_constant
+            self.softmax_layer = Constantmax(subtract_max=self.use_softermax_xmax, constant=self.constantmax_constant)
+
+        if self.softmax_variant == "strongermax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.softmax_layer = Strongermax(subtract_max=self.use_softermax_xmax, strength=config.strongermax_strength)
+
+        if self.softmax_variant == "polymax":
+            self.softmax_layer = Polymax()
+
         if self.softmax_variant == "sigsoftmax":
             self.softmax_layer = SigSoftmax()
 
@@ -218,13 +297,15 @@ class GPTConfig:
     dropout: float = 0.0
 
     # Softmax Alternatives and Options
-    use_softmax_variant = False
-    softmax_variant: str = "softermax" # Choices: "softermax" "sigsoftmax" "sigsoftmax_base2"
+    use_softmax_variant: bool = False
+    softmax_variant: str = "softermax" # Choices: "softermax" "sigsoftmax" "sigsoftmax_base2" "polymax" "strongermax" "constantmax"
     use_softermax_xmax: bool = True # Softermax Option active is softermax selected - True: uses (x - x_max) normalization; False: removes normalization (potential overflow)
+    constantmax_constant: int = 1000 # denominator to utilize for Constantmax
+    strongermax_strength: int = 2 # base ("strength") used by Strongermax
 
     # Layernorm Alternatives and Options
     use_rmsnorm: bool = True # Add option for RMSNorm in place of LayerNorm: https://arxiv.org/abs/1910.07467
-    use_relu: bool = True #True: relu squared, False: do not utilize
+    use_relu: bool = False # True: use ReLU activation, False: do not utilize
     use_squared_relu: bool = False #True: utilize relu squared, False: do not utilize
     bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
 
@@ -255,6 +336,18 @@ def __init__(self, config):
             self.use_softermax_xmax = config.use_softermax_xmax
             self.softmax_layer = Softermax(subtract_max=self.use_softermax_xmax)
 
+        if self.softmax_variant == "constantmax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.constantmax_constant = config.constantmax_constant
+            self.softmax_layer = Constantmax(subtract_max=self.use_softermax_xmax, constant=self.constantmax_constant)
+
+        if self.softmax_variant == "strongermax":
+            self.use_softermax_xmax = config.use_softermax_xmax
+            self.softmax_layer = Strongermax(subtract_max=self.use_softermax_xmax, strength=config.strongermax_strength)
+
+        if self.softmax_variant == "polymax":
+            self.softmax_layer = Polymax()
+
         if self.softmax_variant == "sigsoftmax":
             self.softmax_layer = SigSoftmax()
 
diff --git a/pruning.py b/pruning.py
new file mode 100644
index 0000000000..bc6a0f0f15
--- /dev/null
+++ b/pruning.py
@@ -0,0 +1,15 @@
+import numpy as np
+import torch
+# a basic magnitude-based weight pruning for a neural network using PyTorch
+# pruning_rate is a percentile in [0, 100] (passed to np.percentile)
+def magnitude_prune(model, pruning_rate):
+    all_weights = []
+    for name, param in model.named_parameters():
+        if 'weight' in name:
+            all_weights += list(param.cpu().detach().abs().numpy().flatten())
+
+    threshold = np.percentile(np.array(all_weights), pruning_rate)
+
+    for name, param in model.named_parameters():
+        if 'weight' in name:
+            with torch.no_grad():
+                param *= (param.abs() >= threshold).float()
\ No newline at end of file
diff --git a/requirements_cpu.txt b/requirements_cpu.txt
new file mode 100644
index 0000000000..15132a5550
--- /dev/null
+++ b/requirements_cpu.txt
@@ -0,0 +1,77 @@
+absl-py==2.0.0
+aiohttp==3.8.6
+aiosignal==1.3.1
+appdirs==1.4.4
+async-timeout==4.0.3
+attrs==23.1.0
+black==23.10.1
+cachetools==5.3.2
+certifi==2022.12.7
+charset-normalizer==2.1.1
+click==8.1.7
+datasets==2.14.6
+dill==0.3.7
+docker-pycreds==0.4.0
+filelock==3.9.0
+frozenlist==1.4.0
+fsspec==2023.4.0
+gitdb==4.0.11
+GitPython==3.1.40
+google-auth==2.23.4
+google-auth-oauthlib==1.1.0
+greenlet==3.0.1
+grpcio==1.59.2
+huggingface-hub==0.17.3
+idna==3.4
+Jinja2==3.1.2
+Markdown==3.5.1
+MarkupSafe==2.1.2
+mpmath==1.3.0
+msgpack==1.0.7
+multidict==6.0.4
+multiprocess==0.70.15
+mypy-extensions==1.0.0
+networkx==3.0
+numpy==1.26.1
+oauthlib==3.2.2
+packaging==23.2
+pandas==2.1.2
+pathspec==0.11.2
+pathtools==0.1.2
+Pillow==9.3.0
+platformdirs==3.11.0
+protobuf==4.23.4
+psutil==5.9.6
+pyarrow==14.0.0
+pyasn1==0.5.0
+pyasn1-modules==0.3.0
+pynvim==0.4.3
+python-dateutil==2.8.2
+pytz==2023.3.post1
+PyYAML==6.0.1
+regex==2023.10.3
+requests==2.28.1
+requests-oauthlib==1.3.1
+rsa==4.9
+safetensors==0.4.0
+sentry-sdk==1.34.0
+setproctitle==1.3.3
+six==1.16.0
+smmap==5.0.1
+sympy==1.12
+tensorboard==2.15.1
+tensorboard-data-server==0.7.2
+tiktoken==0.5.1
+tokenizers==0.14.1
+torch==2.1.0+cpu
+torchaudio==2.1.0+cpu
+torchvision==0.16.0+cpu
+tqdm==4.66.1
+transformers==4.35.0
+typing_extensions==4.4.0
+tzdata==2023.3
+urllib3==1.26.13
+wandb==0.15.12
+Werkzeug==3.0.1
+xxhash==3.4.1
+yarl==1.9.2
diff --git a/sample.py b/sample.py
index 78d8c4e88f..11eec29469 100644
--- a/sample.py
+++ b/sample.py
@@ -7,35 +7,43 @@
 import torch
 import tiktoken
 from model import GPTConfig, GPT
+import argparse
 
-# -----------------------------------------------------------------------------
-init_from = 'resume' # either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')
-out_dir = 'out' # ignored if init_from is not 'resume'
-start = "\n" # or "<|endoftext|>" or etc. Can also specify a file, use as: "FILE:prompt.txt"
-num_samples = 10 # number of samples to draw
-max_new_tokens = 500 # number of tokens generated in each sample
-temperature = 0.8 # 1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions
-top_k = 200 # retain only the top_k most likely tokens, clamp others to have 0 probability
-seed = 1337
-device = 'cuda' # examples: 'cpu', 'cuda', 'cuda:0', 'cuda:1', etc.
-dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
-compile = False # use PyTorch 2.0 to compile the model to be faster
+def parseargs():
+    parser = argparse.ArgumentParser(description='')
+    parser.add_argument("--device", type=str, help="device to run inference, e.g. 'cpu' or 'cuda' or 'cuda:0', 'cuda:1', etc...")
+    parser.add_argument("--out_dir", type=str, help="directory to load checkpoint from")
+    parser.add_argument("--init_from", type=str, default="resume", help="Either 'resume' (from an out_dir) or a gpt2 variant (e.g. 'gpt2-xl')")
+    parser.add_argument("--start", type=str, default="\n", help="\\n or '<|endoftext|>' or etc. Can also specify a file, use as: 'FILE:prompt.txt'")
+    parser.add_argument("--num_samples", type=int, default=3, help="number of inference streams to draw")
+    parser.add_argument("--max_new_tokens", type=int, default=500, help="number of tokens generated in each sample")
+    parser.add_argument("--temperature", type=float, default=0.8, help="1.0 = no change, < 1.0 = less random, > 1.0 = more random, in predictions")
+    parser.add_argument("--top_k", type=int, default=200, help="retain only the top_k most likely tokens, clamp others to have 0 probability")
+    parser.add_argument("--seed", type=int, default=1337, help="seed for pseudorandom number generator")
+    parser.add_argument("--dtype", type=str, default="bfloat16", choices=["bfloat16", "float16", "float32"], help="torch data type for inference")
+    parser.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction)
+
+    return parser.parse_args()
+
+args = parseargs()
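+# example invocation (flags defined above): python3 sample.py --device=cpu --out_dir=out --num_samples=1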
'int8'") + parser.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction) + + return parser.parse_args() + +args = parseargs() # ----------------------------------------------------------------------------- -torch.manual_seed(seed) -torch.cuda.manual_seed(seed) +torch.manual_seed(args.seed) +torch.cuda.manual_seed(args.seed) torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn -device_type = 'cuda' if 'cuda' in device else 'cpu' # for later use in torch.autocast -ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype] +device_type = 'cuda' if 'cuda' in args.device else 'cpu' # for later use in torch.autocast +ptdtype = {'bfloat16': torch.bfloat16, 'float16': torch.float16, 'float32': torch.float32}[args.dtype] ctx = nullcontext() if device_type == 'cpu' else torch.amp.autocast(device_type=device_type, dtype=ptdtype) # model -if init_from == 'resume': +if args.init_from == 'resume': # init from a model saved in a specific directory - ckpt_path = os.path.join(out_dir, 'ckpt.pt') - checkpoint = torch.load(ckpt_path, map_location=device) + ckpt_path = os.path.join(args.out_dir, 'ckpt.pt') + checkpoint = torch.load(ckpt_path, map_location=args.device) + checkpoint['model_args']['dropout'] = 0.0 # typically we don't want dropout during inference gptconf = GPTConfig(**checkpoint['model_args']) + print(checkpoint['model_args']) model = GPT(gptconf) state_dict = checkpoint['model'] unwanted_prefix = '_orig_mod.' @@ -43,18 +51,18 @@ if k.startswith(unwanted_prefix): state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k) model.load_state_dict(state_dict) -elif init_from.startswith('gpt2'): +elif args.init_from.startswith('gpt2'): # init from a given GPT-2 model - model = GPT.from_pretrained(init_from, dict(dropout=0.0)) + model = GPT.from_pretrained(args.init_from, dict(dropout=0.0)) model.eval() -model.to(device) -if compile: +model.to(args.device) +if args.compile: model = torch.compile(model) # requires PyTorch 2.0 (optional) # look for the meta pickle in case it is available in the dataset folder load_meta = False -if init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these... +if args.init_from == 'resume' and 'config' in checkpoint and 'dataset' in checkpoint['config']: # older checkpoints might not have these... 
     meta_path = os.path.join('data', checkpoint['config']['dataset'], 'meta.pkl')
     load_meta = os.path.exists(meta_path)
 if load_meta:
@@ -73,16 +81,17 @@
 decode = lambda l: enc.decode(l)
 
 # encode the beginning of the prompt
-if start.startswith('FILE:'):
-    with open(start[5:], 'r', encoding='utf-8') as f:
-        start = f.read()
-start_ids = encode(start)
-x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
+if args.start.startswith('FILE:'):
+    with open(args.start[5:], 'r', encoding='utf-8') as f:
+        args.start = f.read()
+start_ids = encode(args.start)
+x = (torch.tensor(start_ids, dtype=torch.long, device=args.device)[None, ...])
 
 # run generation
 with torch.no_grad():
     with ctx:
-        for k in range(num_samples):
-            y = model.generate(x, max_new_tokens, temperature=temperature, top_k=top_k)
+        for k in range(args.num_samples):
+            y = model.generate(x, args.max_new_tokens,
+                               temperature=args.temperature, top_k=args.top_k)
             print(decode(y[0].tolist()))
             print('---------------')
diff --git a/train.py b/train.py
index a123e5bef7..e879ab6922 100644
--- a/train.py
+++ b/train.py
@@ -4,6 +4,9 @@
 from datetime import datetime
 import math
 import pickle
+from contextlib import nullcontext
+import argparse
+from pruning import magnitude_prune
 
 import numpy as np
 import torch
@@ -20,79 +23,92 @@
 
 def parse_args():
     parser = argparse.ArgumentParser()
+    # argparse groups
+    model_group = parser.add_argument_group('model_group')
+    training_group = parser.add_argument_group('training_group')
+    logging_group = parser.add_argument_group('logging_group')
+
     # I/O args
-    parser.add_argument('--out_dir', default='out', type=str)
-    parser.add_argument('--eval_interval', default=250, type=int)
-    parser.add_argument('--log_interval', default=10, type=int)
-    parser.add_argument('--eval_iters', default=200, type=int)
-    parser.add_argument('--eval_only', action='store_true')
+    training_group.add_argument('--out_dir', default='out', type=str)
+    training_group.add_argument('--eval_interval', default=250, type=int)
+    training_group.add_argument('--log_interval', default=10, type=int)
+    training_group.add_argument('--eval_iters', default=200, type=int)
+    training_group.add_argument('--eval_only', action='store_true')
 
     # Checkpoint args
-    parser.add_argument('--only_save_checkpoint_at_end', action='store_true')
-    parser.add_argument('--always_save_checkpoint', action='store_true',
-                        default=False)
-    parser.add_argument('--init_from', default='scratch', choices=['scratch', 'resume', 'gpt2*'], type=str)
+    training_group.add_argument('--only_save_checkpoint_at_end', action='store_true')
+    training_group.add_argument('--always_save_checkpoint', action='store_true')
+    training_group.add_argument('--init_from', default='scratch', choices=['scratch', 'resume', 'gpt2*'], type=str)
 
     # Data args
-    parser.add_argument('--dataset', default='shakespeare_char', type=str)
-    parser.add_argument('--gradient_accumulation_steps', default=1, type=int)
-    parser.add_argument('--batch_size', default=64, type=int)
-    parser.add_argument('--block_size', default=256, type=int)
+    training_group.add_argument('--dataset', default='shakespeare_char', type=str)
+    training_group.add_argument('--gradient_accumulation_steps', default=1, type=int)
+    training_group.add_argument('--batch_size', default=64, type=int)
 
     # Model args
-    parser.add_argument('--n_layer', default=6, type=int)
-    parser.add_argument('--n_head', default=6, type=int)
-    parser.add_argument('--n_embd', default=384, type=int)
-    parser.add_argument('--dropout', default=0.2, type=float)
-    parser.add_argument('--bias', action='store_true', default=False)
+    model_group.add_argument('--block_size', default=256, type=int)
+    model_group.add_argument('--n_layer', default=6, type=int)
+    model_group.add_argument('--n_head', default=6, type=int)
+    model_group.add_argument('--n_embd', default=384, type=int)
+    model_group.add_argument('--dropout', default=0.2, type=float)
+    model_group.add_argument('--bias', default=False, action=argparse.BooleanOptionalAction)
+
+    # Norm variations
+    model_group.add_argument('--use_rmsnorm', default=True, action=argparse.BooleanOptionalAction)
 
-    # Model variations
-    parser.add_argument('--use_rmsnorm', action='store_true', default=True)
+    # Softmax variations
+    model_group.add_argument('--use_softmax_variant', default=False, action=argparse.BooleanOptionalAction)
+    model_group.add_argument("--softmax_variant", type=str, default="softermax", choices=["polymax", "strongermax", "softermax", "sigsoftmax", "sigsoftmax_base2"])
+
+    # Custom Softmax Variation Options
+    model_group.add_argument('--use_softermax_xmax', default=False, action=argparse.BooleanOptionalAction)
+    model_group.add_argument("--strongermax_strength", type=int, default=2)
 
     # Optimizer args
-    parser.add_argument('--learning_rate', default=1e-3, type=float)
-    parser.add_argument('--max_iters', default=5000, type=int)
-    parser.add_argument('--weight_decay', default=1e-1, type=float)
-    parser.add_argument('--beta1', default=0.9, type=float)
-    parser.add_argument('--beta2', default=0.99, type=float)
-    parser.add_argument('--grad_clip', default=1.0, type=float)
+    training_group.add_argument('--learning_rate', default=1e-3, type=float)
+    training_group.add_argument('--max_iters', default=5000, type=int)
+    training_group.add_argument('--weight_decay', default=1e-1, type=float)
+    training_group.add_argument('--beta1', default=0.9, type=float)
+    training_group.add_argument('--beta2', default=0.99, type=float)
+    training_group.add_argument('--grad_clip', default=1.0, type=float)
 
     # LR schedule args
-    parser.add_argument('--decay_lr', action='store_true')
-    parser.add_argument('--warmup_iters', default=100, type=int)
-    parser.add_argument('--lr_decay_iters', default=5000, type=int)
-    parser.add_argument('--min_lr', default=1e-4, type=float)
+    training_group.add_argument('--decay_lr', action='store_true')
+    training_group.add_argument('--warmup_iters', default=100, type=int)
+    training_group.add_argument('--lr_decay_iters', default=5000, type=int)
+    training_group.add_argument('--min_lr', default=1e-4, type=float)
 
     # DDP args
-    parser.add_argument('--backend', default='nccl', type=str)
+    training_group.add_argument('--backend', default='nccl', type=str)
 
     # System args
-    parser.add_argument('--device', default='cuda', type=str)
-    parser.add_argument('--dtype', default='float16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16', type=str)
-    parser.add_argument('--compile', action='store_true', default=True)
+    training_group.add_argument('--device', default='cuda', type=str)
+    training_group.add_argument("--dtype", type=str, default="bfloat16", choices=["bfloat16", "float16", "float32"], help="torch data type for training")
+    training_group.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction)
'int8'") + training_group.add_argument('--compile', default=False, action=argparse.BooleanOptionalAction) # Logging args - parser.add_argument('--log_project', default='out-test', type=str) - parser.add_argument('--log_run_name', default='logs-test', type=str) + logging_group.add_argument('--log_project', default='out-test', type=str) + logging_group.add_argument('--log_run_name', default='logs-test', type=str) # Tensorboard args - parser.add_argument('--tensorboard_log', action='store_true', default=True) - parser.add_argument('--tensorboard_log_dir', type=str, default='logs') - parser.add_argument('--tensorboard_project', type=str, default='out-test') - parser.add_argument('--tensorboard_run_name', type=str, default='logs-test') + logging_group.add_argument('--tensorboard_log', default=True, action=argparse.BooleanOptionalAction) + logging_group.add_argument('--tensorboard_log_dir', type=str, default='logs') + logging_group.add_argument('--tensorboard_project', type=str, default='out-test') + logging_group.add_argument('--tensorboard_run_name', type=str, default='logs-test') - # Tensorboard args - parser.add_argument('--wandb_log', action='store_true', default=False) - parser.add_argument('--wandb_project', type=str, default='out-test') - parser.add_argument('--wandb_run_name', type=str, default='logs-test') + # Wandb args + logging_group.add_argument('--wandb_log', default=False, action=argparse.BooleanOptionalAction) + logging_group.add_argument('--wandb_project', type=str, default='out-test') + logging_group.add_argument('--wandb_run_name', type=str, default='logs-test') args = parser.parse_args() - return args + return args, model_group, training_group, logging_group class Trainer: - def __init__(self, args): + def __init__(self, args, model_group): self.args = args + self.model_group = model_group self.setup() def setup(self): @@ -126,7 +142,7 @@ def setup(self): torch.backends.cudnn.allow_tf32 = True self.device_type = 'cuda' if 'cuda' in self.args.device else 'cpu' - self.ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[self.args.dtype] + self.ptdtype = {"bfloat16" : torch.bfloat16, "float16" : torch.float16, "float32" : torch.float32}[self.args.dtype] self.ctx = nullcontext() if self.device_type == 'cpu' else torch.amp.autocast(device_type=self.device_type, dtype=self.ptdtype) # Data loader @@ -140,9 +156,10 @@ def setup(self): self.meta_vocab_size = meta['vocab_size'] # Model - self.model_args = dict(n_layer=self.args.n_layer, n_head=self.args.n_head, n_embd=self.args.n_embd, - block_size=self.args.block_size, bias=self.args.bias, vocab_size=None, - dropout=self.args.dropout, use_rmsnorm=self.args.use_rmsnorm) + # TODO only add if they are defined from the argparse + self.model_args = {action.dest: getattr(self.args, action.dest) for action in self.model_group._group_actions} + print(self.model_args) + self.model_args['vocab_size'] = None if self.args.init_from == 'scratch': self.model_args['vocab_size'] = self.meta_vocab_size if self.meta_vocab_size is not None else 50304 @@ -194,7 +211,9 @@ def setup(self): # Tensorboard if self.args.tensorboard_log: - timestamp = time.strftime("%Y%m%d-%H%M%S") + timestamp = time.strftime("%Y%m%d-%H%M%S" + "_" + + self.args.tensorboard_project + "_" + + self.args.tensorboard_run_name) log_subpath = os.path.join(self.args.tensorboard_log_dir, timestamp) self.writer = SummaryWriter(log_subpath) @@ -348,9 +367,8 @@ def train(self): wandb.finish() def main(): - args = parse_args() - print(args.device) - 
-    trainer = Trainer(args)
+    args, model_group, _, _ = parse_args()
+    trainer = Trainer(args, model_group)
     trainer.train()
 
     if trainer.ddp: