Skip to content

When using transformers.set_seed() instead of src.utils.common_utils.fix_seed(), W4A4 openllm chat-template–related task accuracies drop to 0, but bf16 accuracies are fine. #17

@WeiweiZhang1

Description

@WeiweiZhang1

When changing to set_seed(), bf16 accuracies are fine, but W4A4 openllm chat-template–related task accuracies drop to 0 (for the Llama-3.1-8B-Instruct and Qwen3-8B models).

Using the newest lm_eval API (version 0.4.9.1).

W4A4 pseudoquant
Image

bf16:
Image

cmd:

# --- Environment & default parameters --------------------------------------
# Every setting below can be overridden from the environment, e.g.:
#   W_BITS=8 A_BITS=8 GPTQ=1 bash run_quant.sh
set -euo pipefail

export OMP_NUM_THREADS=8
export PYTORCH_CUDA_ALLOC_CONF="max_split_size_mb:128"

# Model selection
model_name="Llama-3.1-8B-Instruct"
MODEL=${MODEL:-"/models/Llama-3.1-8B-Instruct"}
# model_name="Qwen3-8B"
# MODEL=${MODEL:-"/models/Qwen3-8B"}
# Last path component of MODEL, e.g. "Llama-3.1-8B-Instruct".
MODEL_ID=$(basename "${MODEL}")
# Data params
NUM_SEQUENCES=${NUM_SEQUENCES:-128}
# Quantization params
FORMAT=${FORMAT:-"mxfp"}
W_BITS=${W_BITS:-4}
A_BITS=${A_BITS:-4}
W_GROUP_SIZE=${W_GROUP_SIZE:-32}
A_GROUP_SIZE=${A_GROUP_SIZE:-32}
GPTQ=${GPTQ:-0}
W_OBSERVER=${W_OBSERVER:-"minmax"}
QUANTIZATION_ORDER=${QUANTIZATION_ORDER:-"default"}
# Save params
EXPORT_QUANTIZATION=${EXPORT_QUANTIZATION:-"pseudoquant"}
# Transform params
TRANSFORM_CLASS=${TRANSFORM_CLASS:-"hadamard"}
HADAMARD_GROUP_SIZE=${HADAMARD_GROUP_SIZE:-128}
# Evaluation params
EVAL_PERPLEXITY=${EVAL_PERPLEXITY:-0}
EVAL_OPENLLM=${EVAL_OPENLLM:-1}
LM_EVAL_BATCH_SIZE=${LM_EVAL_BATCH_SIZE:-32}
# Misc params
LOG_WANDB=${LOG_WANDB:-0}
DTYPE=${DTYPE:-"auto"}
CPU_OFFLOAD_ACTIVATIONS=${CPU_OFFLOAD_ACTIVATIONS:-0}
# Fix: CPU_OFFLOAD_MODULES is tested later but had no default here,
# unlike every other toggle; it would be unset under `set -u`.
CPU_OFFLOAD_MODULES=${CPU_OFFLOAD_MODULES:-0}
SAVE_DIR="./"
SCRIPT_ARGS=""

# Translate 0/1 environment toggles into CLI flags for model_quant.py.
# append_flag VALUE FLAG — appends FLAG to SCRIPT_ARGS when VALUE is "1".
append_flag() {
  local value=$1 flag=$2
  if [[ "${value}" == 1 ]]; then
    SCRIPT_ARGS="${SCRIPT_ARGS-} ${flag}"
  fi
}

append_flag "${GPTQ:-0}" "--gptq"
append_flag "${EVAL_PERPLEXITY:-0}" "--eval_perplexity"
append_flag "${EVAL_OPENLLM:-0}" "--eval_openllm"
append_flag "${LOG_WANDB:-0}" "--log_wandb"

# Human-readable method name used in the W&B run name and save path.
if [[ "${GPTQ:-0}" == 1 ]]; then
    METHOD_NAME="GPTQ"
else
    METHOD_NAME="RTN"
fi

# Fix: CPU_OFFLOAD_MODULES previously had no default and could be unset
# when this was evaluated; it is now guarded like the other toggles.
append_flag "${CPU_OFFLOAD_MODULES:-0}" "--cpu_offload_modules"
append_flag "${CPU_OFFLOAD_ACTIVATIONS:-0}" "--cpu_offload_activations"

# Weights & Biases run identification.
export WANDB_PROJECT="FP-Quantization-Harness"
export WANDB_NAME="${MODEL}/${FORMAT}-w${W_BITS}-a${A_BITS}-${METHOD_NAME}-${TRANSFORM_CLASS}-transform"

# Decide whether (and into which directory) to export the quantized model.
case "${EXPORT_QUANTIZATION}" in
  realquant|pseudoquant)
    SCRIPT_ARGS="${SCRIPT_ARGS} --export_quantized_model ${EXPORT_QUANTIZATION}"
    if [[ "${EXPORT_QUANTIZATION}" == "realquant" ]]; then
      SAVE_DIR+=quantized_models
    else
      SAVE_DIR+=pseudoquantized_models
    fi
    ;;
esac
    device=6
    CUDA_VISIBLE_DEVICES=$device python model_quant.py \
    --model_name_or_path=${MODEL} \
    --format=${FORMAT} \
    --w_bits=${W_BITS} \
    --a_bits=${A_BITS} \
    --w_group_size=${W_GROUP_SIZE} \
    --a_group_size=${A_GROUP_SIZE} \
    --transform_class=${TRANSFORM_CLASS} \
    --w_observer=${W_OBSERVER} \
    --quantization_order=${QUANTIZATION_ORDER} \
    $SCRIPT_ARGS \
    --hadamard_group_size=${HADAMARD_GROUP_SIZE} \
    --dataset_name_or_path=fineweb-edu \
    --num_sequences=${NUM_SEQUENCES} \
    --sequence_length=2048 \
    --dtype=${DTYPE} \
    --lm_eval_batch_size=${LM_EVAL_BATCH_SIZE} \
    --save_path "${SAVE_DIR}/${MODEL_ID}-${FORMAT}-w${W_BITS}-a${A_BITS}-${METHOD_NAME}-${TRANSFORM_CLASS}-transform" \
    --cpu_offload_activations \
    --cpu_offload_modules \
    --fuse_global_scale \
    --amp \
    --disable_thinking \
    --eval_openllm \
    2>&1| tee -alogs/fpquant_${model_name}_mxfp_acc.txt

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions