When I change set_seed(), bf16 accuracies are fine, but W4A4 OpenLLM chat-template-related task accuracies drop to 0 (for the Llama-3.1-8B-Instruct and Qwen3-8B models).
I am using the newest lm_eval API (version 0.4.9.1).
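For reference, this is a minimal sketch of how I invoke the chat-template evaluation; simple_evaluate and apply_chat_template are the actual lm_eval 0.4.x API, while the model path and task list just mirror my setup:

import lm_eval
from lm_eval.models.huggingface import HFLM
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed

set_seed(42)  # changing this value is what flips the W4A4 chat-template accuracies to 0

path = "/models/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype="auto", device_map="cuda")
tokenizer = AutoTokenizer.from_pretrained(path)

lm = HFLM(pretrained=model, tokenizer=tokenizer, batch_size=32)
results = lm_eval.simple_evaluate(
    model=lm,
    tasks=["openllm"],          # the Open LLM Leaderboard task group
    apply_chat_template=True,   # bf16 stays fine; W4A4 collapses under a different seed
)
print(results["results"])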
cmd:
export OMP_NUM_THREADS=8
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"
model_name="Llama-3.1-8B-Instruct"
MODEL=${MODEL:-"/models/Llama-3.1-8B-Instruct"}
# model_name="Qwen3-8B"
# MODEL=${MODEL:-"/models/Qwen3-8B"}
MODEL_ID=$( echo $MODEL | awk -F/ '{print $NF}' )
# Data params
NUM_SEQUENCES=${NUM_SEQUENCES:-128}
# Quantization params
FORMAT=${FORMAT:-"mxfp"}
W_BITS=${W_BITS:-4}
A_BITS=${A_BITS:-4}
W_GROUP_SIZE=${W_GROUP_SIZE:-32}
A_GROUP_SIZE=${A_GROUP_SIZE:-32}
GPTQ=${GPTQ:-0}
W_OBSERVER=${W_OBSERVER:-"minmax"}
QUANTIZATION_ORDER=${QUANTIZATION_ORDER:-"default"}
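# (Added note, my understanding of these flags: FORMAT=mxfp with W_BITS=4/A_BITS=4
# and group size 32 is MXFP4-style microscaling -- one shared scale per group of
# 32 elements -- and W_OBSERVER=minmax derives each group's scale from its
# min/max range.)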
# Save params
EXPORT_QUANTIZATION=${EXPORT_QUANTIZATION:-"pseudoquant"}
# Transform params
TRANSFORM_CLASS=${TRANSFORM_CLASS:-"hadamard"}
HADAMARD_GROUP_SIZE=${HADAMARD_GROUP_SIZE:-128}
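# (Added note: as I understand it, TRANSFORM_CLASS=hadamard applies a Hadamard
# rotation over groups of HADAMARD_GROUP_SIZE=128 channels before quantization to
# flatten outliers; if that rotation is randomized, it would also be seed-dependent.)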
# Evaluation params
EVAL_PERPLEXITY=${EVAL_PERPLEXITY:-0}
EVAL_OPENLLM=${EVAL_OPENLLM:-1}
LM_EVAL_BATCH_SIZE=${LM_EVAL_BATCH_SIZE:-32}
# Misc params
LOG_WANDB=${LOG_WANDB:-0}
DTYPE=${DTYPE:-"auto"}
CPU_OFFLOAD_ACTIVATIONS=${CPU_OFFLOAD_ACTIVATIONS:-0}
CPU_OFFLOAD_MODULES=${CPU_OFFLOAD_MODULES:-0}
SAVE_DIR="./"
SCRIPT_ARGS=""
if [[ $GPTQ == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --gptq"
fi
if [[ $EVAL_PERPLEXITY == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --eval_perplexity"
fi
if [[ $EVAL_OPENLLM == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --eval_openllm"
fi
if [[ $LOG_WANDB == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --log_wandb"
fi
METHOD_NAME=""
if [[ $GPTQ == 1 ]]; then
    METHOD_NAME="GPTQ"
else
    METHOD_NAME="RTN"
fi
if [[ $CPU_OFFLOAD_MODULES == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --cpu_offload_modules"
fi
if [[ $CPU_OFFLOAD_ACTIVATIONS == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --cpu_offload_activations"
fi
export WANDB_PROJECT="FP-Quantization-Harness"
export WANDB_NAME=${MODEL}/${FORMAT}-w${W_BITS}-a${A_BITS}-${METHOD_NAME}-${TRANSFORM_CLASS}-transform
if [[ $EXPORT_QUANTIZATION == "realquant" || $EXPORT_QUANTIZATION == "pseudoquant" ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS} --export_quantized_model ${EXPORT_QUANTIZATION}"
    if [[ $EXPORT_QUANTIZATION == "realquant" ]]; then
        SAVE_DIR+=quantized_models
    else
        SAVE_DIR+=pseudoquantized_models
    fi
fi
device=6
CUDA_VISIBLE_DEVICES=$device python model_quant.py \
--model_name_or_path=${MODEL} \
--format=${FORMAT} \
--w_bits=${W_BITS} \
--a_bits=${A_BITS} \
--w_group_size=${W_GROUP_SIZE} \
--a_group_size=${A_GROUP_SIZE} \
--transform_class=${TRANSFORM_CLASS} \
--w_observer=${W_OBSERVER} \
--quantization_order=${QUANTIZATION_ORDER} \
$SCRIPT_ARGS \
--hadamard_group_size=${HADAMARD_GROUP_SIZE} \
--dataset_name_or_path=fineweb-edu \
--num_sequences=${NUM_SEQUENCES} \
--sequence_length=2048 \
--dtype=${DTYPE} \
--lm_eval_batch_size=${LM_EVAL_BATCH_SIZE} \
--save_path "${SAVE_DIR}/${MODEL_ID}-${FORMAT}-w${W_BITS}-a${A_BITS}-${METHOD_NAME}-${TRANSFORM_CLASS}-transform" \
--cpu_offload_activations \
--cpu_offload_modules \
--fuse_global_scale \
--amp \
--disable_thinking \
--eval_openllm \
2>&1 | tee -a logs/fpquant_${model_name}_mxfp_acc.txt
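For context on why the seed could matter at all: the calibration set (NUM_SEQUENCES=128 sequences from fineweb-edu) is presumably sampled under the global seed, and the quantization/transform statistics follow from it. Below is a minimal sketch of group-wise minmax RTN quantization, as a plain symmetric-integer stand-in for the mxfp path (illustrative names, not the repo's actual code):

import torch

def quantize_minmax_grouped(w: torch.Tensor, bits: int = 4, group_size: int = 32):
    """Round-to-nearest (RTN) quantization with a shared minmax scale per group.

    Illustrative only -- a symmetric-integer stand-in for the mxfp path,
    not the repo's actual implementation.
    """
    orig_shape = w.shape
    g = w.reshape(-1, group_size)                      # one row per group of 32
    qmax = 2 ** (bits - 1) - 1                         # 7 for symmetric 4-bit
    scale = g.abs().amax(dim=1, keepdim=True) / qmax   # minmax observer per group
    scale = scale.clamp(min=1e-8)                      # guard all-zero groups
    q = torch.clamp(torch.round(g / scale), -qmax - 1, qmax)
    return (q * scale).reshape(orig_shape)             # pseudoquant: dequantized weights

w = torch.randn(4096, 4096, dtype=torch.bfloat16)
w_q = quantize_minmax_grouped(w.float())
print((w.float() - w_q).abs().mean())                  # mean round-off error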