Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
f313302
Initial commit
777yifei Oct 5, 2023
3cb8943
11
777yifei Oct 5, 2023
3d5edd4
new push
777yifei Oct 12, 2023
f8e71bc
Add .gitignore to top level
gkielian Nov 3, 2023
bfd0b4a
Add github actions
gkielian Nov 7, 2023
a0ff0c6
Merge pull request #25 from gkielian/gitignore_updates
klei22 Nov 7, 2023
19b9117
Add process for toy dataset
Nov 5, 2023
e573a18
Add readme and main.sh
Nov 6, 2023
014cb07
Add .gitignore for csv_data folder
Nov 6, 2023
e58ac5f
Add instructions for usage
Nov 6, 2023
6a1d2e1
Add project and run_name labels to tensorboard
Nov 7, 2023
960e973
Add constantmax option in model.py
Nov 7, 2023
005475c
Merge pull request #26 from gkielian/add_github_actions
gkielian Nov 7, 2023
43611a4
Merge pull request #30 from shiweiGPT/constant_max
gkielian Nov 7, 2023
f5ead2e
Merge pull request #29 from klei22/add_tensorboard_labels
gkielian Nov 7, 2023
f08d2e0
Merge pull request #28 from klei22/csv_preparation
gkielian Nov 7, 2023
4c8fddb
Add train.py argparse args for softermax variation
gkielian Nov 8, 2023
953a62d
Add polymax
gkielian Nov 8, 2023
566a4e9
Add strongermax
gkielian Nov 8, 2023
99c7f0c
Organize argparse inputs for train.py
gkielian Nov 8, 2023
94a4fcd
Modify sample.py to utilize argparse
gkielian Nov 8, 2023
4b0754b
Add new argparse syntax for boolean values
gkielian Nov 8, 2023
8446e09
Merge pull request #34 from gkielian/samplepy_argparse
klei22 Nov 8, 2023
b00cf14
pow2 added
777yifei Nov 11, 2023
77e54d6
Merge branch 'master' of https://github.com/777yifei/nanoGPT
777yifei Nov 11, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions .github/workflows/cpu-basic-install-prepare-train-inf-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# CI workflow: install CPU-only PyTorch dependencies, prepare the
# shakespeare_char dataset, run a tiny CPU training job, then run CPU
# inference — a smoke test of the full pipeline on every push/PR.
name: Basic Pytorch Installation, Data Prep, CPU Training, CPU Inference
on: [push, pull_request]
jobs:
Install-Dependencies_Data-Prep_CPU-Training_CPU-Inference:
runs-on: ubuntu-latest
steps:
- name: Check out repository code
uses: actions/checkout@v4
- run: echo "${{ github.repository }} repository has been cloned to the runner."
- run: echo "Currently on ${{ github.ref }} branch"
- name: ls of directory
run: |
ls ${{ github.workspace }}
# Caching pip dependencies, keyed on the CPU requirements file so the
# cache invalidates when requirements_cpu.txt changes
- name: Cache pip dependencies
uses: actions/cache@v3
with:
path: ~/.cache/pip
key: ${{ runner.os }}-pip-${{ hashFiles('**/requirements_cpu.txt') }}
restore-keys: |
${{ runner.os }}-pip-
# CPU-only torch wheels via the pytorch.org CPU index (no CUDA download)
- name: Install CPU Dependencies
run: |
python3 -m pip install --upgrade pip
python3 -m pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
python3 -m pip install -r requirements_cpu.txt
# Deliberately tiny model/run settings so training finishes in seconds on CI
- name: Run Small Network on CPU
run: |
python3 data/shakespeare_char/prepare.py
python3 train.py --out_dir=out --device=cpu --eval_interval=2 --log_interval=1 --block_size=2 --batch_size=2 --n_layer=2 --n_head=2 --n_embd=16 --max_iters=3 --lr_decay_iters=2 --dropout=0.0
- name: Run CPU Inference
run: |
python3 sample.py --device=cpu --out_dir="out"

11 changes: 11 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# folders
__pycache__/
logs/

# file extensions
*.pkl
*.bin
*.txt

# checkpoint directories
out*/
29 changes: 29 additions & 0 deletions HW/SA/Softer_max/LUT.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
// A LUT used for the pow2 unit; will store this LUT in an on-chip memory in the future.
// The table holds the values 2^(0/16) through 2^(15/16), scaled by 2^4
// (i.e. stored as Q4 fixed point) and rounded to the nearest integer,
// so entries run from 16 (= 1.0) up to 31 (~ 1.915).
module LUT(
input logic signed [`FRAC-1:0] index,
output logic signed [`DATA_SIZE-1:0] out
);
always_comb begin
//out = round(2^(index/16) * 16); the default assignment below also
//covers any index value not matched by the case (e.g. X/Z inputs).
out = 8'b0;
case(index)
4'b0000 : out = 8'b00010000;
4'b0001 : out = 8'b00010001;
4'b0010 : out = 8'b00010001;
4'b0011 : out = 8'b00010010;
4'b0100 : out = 8'b00010011;
4'b0101 : out = 8'b00010100;
4'b0110 : out = 8'b00010101;
4'b0111 : out = 8'b00010110;
4'b1000 : out = 8'b00010111;
4'b1001 : out = 8'b00011000;
4'b1010 : out = 8'b00011001;
4'b1011 : out = 8'b00011010;
4'b1100 : out = 8'b00011011;
4'b1101 : out = 8'b00011100;
4'b1110 : out = 8'b00011101;
4'b1111 : out = 8'b00011111;
endcase
end
endmodule
69 changes: 69 additions & 0 deletions HW/SA/Softer_max/Pow2_tb.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
//Testbench for the Pow2 unit.
//Drives stepped current_max / input_vector stimulus, computes the expected
//value 2^(xj - current_max) in real arithmetic, and writes the raw
//fixed-point output, the expected value, and the converted output to
//text files for offline comparison (see help.py).
module Pow_tb();
logic clk;
logic signed [`DATA_SIZE-1:0] current_max;
logic signed [`DATA_SIZE-1:0] input_vector;
logic signed [`LARGE_SIZE:0] uSoftmax;
integer write_file0, write_file1, write_file2;
initial begin

write_file0 = $fopen("Pow_data.txt", "w");
write_file1 = $fopen("true_pow_data.txt", "w");
write_file2 = $fopen("Pow_data2.txt", "w");
end

//Device under test (implicit .name port connections)
Pow2 Pow2 (
.current_max,
.input_vector,
.uSoftmax //UnnormedSoftmax
);
int counter;
real true_out, xj, uSoftmax_floating;
always #5 clk = ~clk;

//initialize "current_max", could also use rand()
//current_max ramps 1,2,3,4 then wraps back to 0 every 5 steps
always #10 begin
counter = counter + 1;
if(counter == 5) begin
current_max = 0;
counter = 0;
end
else begin
current_max = current_max+1;
end
end

//initialize input vectors: step the integer and fractional fields
//of the fixed-point input independently
always #5 begin
input_vector[`DATA_SIZE-2:`FRAC] = input_vector[`DATA_SIZE-2:`FRAC] + 1;
input_vector[`FRAC-1:0] = input_vector[`FRAC-1:0] + 1;
end

//calculate the expected value in real arithmetic (4 fractional bits => /16)
always #5 begin
xj = input_vector[`DATA_SIZE-2:`FRAC] + (input_vector[`FRAC-1:0]/16.0);
true_out = 2**(xj-current_max+0.0);
end

//transfer the output from fixed point 8 to a real number
always #5 begin
uSoftmax_floating = uSoftmax[`LARGE_SIZE-2:`FRAC] + uSoftmax[`FRAC-1:0]*1.0/16.0;
end

//write the outputs into output files
always #5 begin
$fdisplay(write_file0,"%0d\t",uSoftmax);
$fdisplay(write_file1,"%0f\t",true_out);
$fdisplay(write_file2,"%0f\t",uSoftmax_floating);
end

//testing begin
initial begin
clk = 0;
counter = 0;
current_max = 0;
input_vector = 5;
#200;
//Close the log files before ending the simulation so buffered output
//is flushed; the original left them open, which can drop trailing
//writes on some simulators.
$fclose(write_file0);
$fclose(write_file1);
$fclose(write_file2);
$finish;
end
endmodule
30 changes: 30 additions & 0 deletions HW/SA/Softer_max/Pow2_unit.sv
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
//perform 2^(xj-localMax)
//To do: modify output data size
//To do: explore floating point 8 representation
module Pow2 (
input logic signed [`DATA_SIZE-1:0] current_max,
input logic signed [`DATA_SIZE-1:0] input_vector,
output logic signed [`LARGE_SIZE:0] uSoftmax //UnnormedSoftmax
);
//pow2_frac: LUT result for the fractional part, round(2^(frac/16)*16)
logic signed [`DATA_SIZE-1:0] pow2_frac;
//FP_1: the constant 1.0 in this fixed-point format (single 1 at bit `FRAC)
logic signed [`LARGE_SIZE-1:0] FP_1;

//fixed_point 2
always_comb begin
FP_1 = 0;
FP_1[`FRAC] = 1;
end

//return [2^(n/16)]*2^4 and then round to integer
LUT LUT0 (
.index(input_vector[`FRAC-1:0]),
.out(pow2_frac)
);

//for debug: temp is the integer part of input_vector
logic signed [`DATA_SIZE-1-`FRAC:0] temp;
assign temp = input_vector [`DATA_SIZE-1:`FRAC];
//2^[(int xj)-localMax]*2^(frac+4)/2^4
//8'b01000000, 2 in FixedP8
//NOTE(review): if temp < current_max the shift amount is negative;
//SystemVerilog treats a shift amount as unsigned, so a negative value
//becomes a huge left shift and the result collapses to 0 — confirm the
//caller guarantees temp >= current_max, or handle negative exponents.
assign uSoftmax = ((FP_1 <<< (temp - current_max)) * pow2_frac) >>> `FRAC;
endmodule
Empty file added HW/SA/Softer_max/README.md
Empty file.
44 changes: 44 additions & 0 deletions HW/SA/Softer_max/help.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# a helper python script to compare the expected outputs of the Pow2 unit with its real outputs
# input files are generated by Pow2_tb.sv
def fp8_to_real(value):
    """Convert a signed FP8(3,4) fixed-point bit pattern to a real number.

    The format is two's complement with 1 sign bit, 3 integer bits and
    4 fractional bits, so representable values span [-8.0, 7.9375] in
    steps of 1/16.

    The original implementation documented a signed format ("Extract
    sign bit") but never handled the sign, so patterns >= 0x80 decoded
    as large positive values; this version applies two's-complement
    decoding.  Results for 0..127 are unchanged.

    Args:
        value: raw bit pattern as an int (wider ints are masked to 8 bits).

    Returns:
        The decoded value as a float.
    """
    # Keep only the low 8 bits, then sign-extend via two's complement.
    value &= 0xFF
    if value & 0x80:
        value -= 0x100

    # 4 fractional bits => scale factor 2^4 = 16.
    return value / 16

def compare_files(file1_path, file2_path):
    """Compare two text files of numbers line by line.

    Returns a list with abs(a - b) for each aligned pair of lines.
    A pair where either line is not a valid number contributes None,
    keeping the result aligned with the input line numbers.  Extra
    lines in the longer file are ignored (zip semantics).
    """
    deltas = []
    with open(file1_path, 'r') as left, open(file2_path, 'r') as right:
        for lhs, rhs in zip(left, right):
            try:
                delta = abs(float(lhs.strip()) - float(rhs.strip()))
            except ValueError:
                # Non-numeric line: record a placeholder instead of a value.
                delta = None
            deltas.append(delta)
    return deltas

# Example usage: compare the simulator's converted output against the
# real-arithmetic expected values produced by Pow2_tb.sv.
file1_path = 'Pow_data2.txt'
file2_path = 'true_pow_data.txt'
differences = compare_files(file1_path, file2_path)
print(differences)
# compare_files emits None for non-numeric lines; summing a list that
# contains None raises TypeError, so filter placeholders out first.
valid_diffs = [d for d in differences if d is not None]
if valid_diffs:
    print(f"average diff = {sum(valid_diffs)/len(valid_diffs)}")
else:
    print("no numeric lines to compare")
#print(f"average diff after scale down by 2^4 is: {(sum(differences)/len(differences))/2**4}")
3 changes: 3 additions & 0 deletions data/csv_data/.gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.csv
*.bin
*.pkl
79 changes: 79 additions & 0 deletions data/csv_data/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
# Data-Shuffler for ML Permutation Invariance

These Python scripts process time-series data from a CSV file to add permutation
invariance to the csv's data fields.

Each row in the CSV becomes a single line in the text file, with each cell
represented by a unique lowercase letter (starting from 'a') followed by the
value from the cell.

One has the option to shuffle the letter-value pairs in each line, using a
command-line flag.

Training on this data with the shuffle option will create a form of
in-frame permutation invariance.

This will give -- during inference -- data the freedom to move around and unlock
special capabilities otherwise not available to fixed-frame trained networks.

For example, one can utilize a beam search for each of the labels to determine
which of the letter value pairs gives the strongest certainty of data points in
this frame, and build the next frame up incrementally using this technique.

## Getting Started

### Prerequisites

- Python (3.6 or above)

## Usage

1. Separate your file into one with timestamp columns and one with data columns.

2. Navigate to the directory where the script is located and run process_csv on
one's data-column file:

```sh
python3 process_csv.py <data_column_file> <processed_data_file> --shuffle --exclude e
```

3. Recombine the output file from process_csv.py with the time column data.


```sh
python3 combine_csvs.py <time_column_file> <processed_data_file> <processed_csv>
```

4. Prepare the processed_data_file for training

```sh
python3 prepare.py -i <processed_data_file>
```

5. `cd` to the `explorations` folder, and utilize the script to run training:


```sh
cd ../../explorations
bash run_csv_data_training.sh
```

6. [Optional] Create an exploration script to test training and inference with
and without shuffling.

### Arguments

- `input_file`: The path to the input CSV file containing time-series data.
- `output_file`: The path to the output text file.
- `--shuffle`: Optional flag to shuffle the order of letter-value pairs in each line.
- `--exclude`: Optional flag to remove any letters used by the dataset (e.g. `e`
for scientific notation)

### Example

For a full example see the `main.sh` script on generated sine + noise data.

## License

This project is licensed under the MIT License

30 changes: 30 additions & 0 deletions data/csv_data/combine_csvs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import csv
import argparse

def combine_csv_columns(file_path1, file_path2, output_file_path):
    """Concatenate rows of two CSV files side by side.

    Output line i is every cell of row i from the first file followed by
    every cell of row i from the second file, joined with no delimiter
    and terminated by a newline.  Pairing stops at the end of the
    shorter file.
    """
    with open(file_path1, 'r') as first, \
         open(file_path2, 'r') as second, \
         open(output_file_path, 'w', newline='') as sink:
        paired_rows = zip(csv.reader(first), csv.reader(second))
        for left_row, right_row in paired_rows:
            # Fuse all cells of the pair without separators, one line per pair.
            sink.write(''.join(left_row + right_row) + '\n')

def main(args):
    """Entry point: combine the two input CSVs into the output path."""
    combine_csv_columns(args.file1, args.file2, args.output)

if __name__ == '__main__':
    # CLI: two positional input paths and one positional output path.
    parser = argparse.ArgumentParser(description='Combine columns of two CSV files.')
    parser.add_argument('file1', type=str, help='Path to the first input CSV file.')
    parser.add_argument('file2', type=str, help='Path to the second input CSV file.')
    parser.add_argument('output', type=str, help='Path to the output CSV file.')

    args = parser.parse_args()
    main(args)

42 changes: 42 additions & 0 deletions data/csv_data/main.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/bin/bash
# This is an example using generated data to demonstrate:
# 1. labelling of different time series data
# 2. shuffling of different fields with their label

# Echo commands as they run so each step of the demo is visible.
set -x

# CREATE SYNTHETIC DATA -- Skip if using your own data
# This is a generator that creates two csvs (if `--split` is specified):
# 1. time data called: `time_filename.csv`
# 2. signal data called: `data_filename.csv`
python3 sine_noise_generator.py --noise_level 0.3 --filename sine_data.csv --scientific --precision 2 --modulo 1000 --points 1000000 --split

# Temporarily stop echoing so the previews stay readable.
set +x
echo -e "\nPreview: Generated Times"
head time_sine_data.csv

echo -e "\n\nPreview: Generated Data"
head data_sine_data.csv
echo -e "\n\n"
set -x

# Utilize a dataset only csv (no timestamps) in this case `data_sine_data.csv`
# This script does two things:
# 1. _prepend_ labels to the data
# 2. (optionally) shuffle the data
# Also 'e' is used for scientific notation, skip this letter when doing labelling
python3 process_csv.py data_sine_data.csv sine_noise_sn_shuffled.csv --shuffle --exclude e

# preview the result
set +x
echo -e "\nPreview: Shuffled Data"
head sine_noise_sn_shuffled.csv
echo -e "\n\n"
set -x

# recombine the timestamp column with the shuffled data columns
python3 combine_csvs.py time_sine_data.csv sine_noise_sn_shuffled.csv processed_sine_data.csv

set +x
echo -e "\nPreview: Timestamps with Shuffled Data"
head processed_sine_data.csv
Loading