Support benchmarking script by using real application trace #737
base: main
Changes from 7 commits
@@ -16,27 +16,60 @@
 # Result files will be added to 'PATH_PREFIX' directory.
 PATH_PREFIX=`dirname "$0"`
-OUTPUT_FILE=
+FILE_NAME="result"
 MODEL="llama2-7b"
+TEMPERATURE=0.0

 TOTAL=100
 # TODO: Set your preferred request sizes and rates here.
 input_start=4
-input_limit=$((2**12)) # 4K
+input_limit=$((2**11)) # 2K
 output_start=4
-output_limit=$((2**12)) # 4K
+output_limit=$((2**9)) # 512
 rate_start=1
 rate_limit=$((2**6)) # 64
 workload=
 dry_run=0

+# Function to generate workload for specific input/output lengths
+generate_workload() {
+    local input_len=$1
+    local output_len=$2
+    local api_key=$3
+    local num_prompts=$4
+    local model=$5
+
-    echo " input_len: $input_len"
-    echo " output_len: $output_len"
-    echo " api_key: $api_key"
-    echo " num_prompts: $num_prompts"
-    echo " model: $model"
-    echo " temperature: $TEMPERATURE"
+    echo "Generating workload for input=$input_len, output=$output_len, API_KEY=$api_key, num_prompts=$num_prompts, model=$model, temperature=$TEMPERATURE"
+
+    python $PATH_PREFIX/gen_benchmark_prompt.py \
+        $workload \
+        --input-tokens "$input_len" \
+        --min-output-tokens "$output_len" \
+        --tolerance "0.2" \
+        --qps "2.0" \
+        --host "localhost" \
+        --port "8010" \
+        --api-key "$api_key" \
+        --total-prompts "$num_prompts" \
+        --model "$model" \
+        --temperature "$TEMPERATURE"
+}
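For reference, the function above takes its arguments positionally; a minimal sketch of a call, with made-up values:

    # Argument order: input_len, output_len, api_key, num_prompts, model
    generate_workload 1024 256 "$LLM_API_KEY" 100 "llama2-7b"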

 while [[ $# -gt 0 ]]; do
     case "$1" in
         -m|--model)
-            MODEL="$2"
+            MODEL=$2
             shift 2
             ;;
         -o|--output)
-            OUTPUT_FILE="$2"
+            FILE_NAME="$2"
             shift 2
             ;;
         --input-start)
@@ -71,6 +104,10 @@ while [[ $# -gt 0 ]]; do
             LLM_API_KEY=$2
             shift 2
             ;;
+        --temperature)
+            TEMPERATURE=$2
+            shift 2
+            ;;
         --workload)
             workload="--workload_dataset_file $2"
             shift 2
@@ -82,31 +119,53 @@ while [[ $# -gt 0 ]]; do
         esac
 done
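With the new flags in place, a full invocation might look like the following sketch (the script name and trace path are assumptions for illustration; only flags visible in this diff are used):

    # Hypothetical invocation; "my_run" becomes result/my_run.jsonl
    bash benchmark.sh --model llama2-7b --temperature 0.0 \
        --workload /path/to/real_app_trace.jsonl -o my_run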

 # Make sure the directory exists and clear output file
-if [[ -z "$OUTPUT_FILE" ]]; then
-    OUTPUT_FILE="${PATH_PREFIX}/result/${MODEL}.jsonl"
-fi
+OUTPUT_FILE="${PATH_PREFIX}/result/${FILE_NAME}.jsonl"
+PROMPT_DIR="${PATH_PREFIX}/result/prompts"
 mkdir -p `dirname "$OUTPUT_FILE"`
+mkdir -p "$PROMPT_DIR"

+# Clear the workload directory
+echo "Clearing workload directory: $PROMPT_DIR"
+rm -rf "$PROMPT_DIR"/*

 # Clear the output file
 > "$OUTPUT_FILE"

 # Print the arguments (or use them in your script logic)
-echo "Start benchmark $MODEL, input tokens:[$input_start:$input_limit], output tokens:[$output_start:$output_limit], rates:[$rate_start:$rate_limit], save as: $OUTPUT_FILE"
+echo "Start benchmark $MODEL, input tokens:[$input_start:$input_limit], output tokens:[$output_start:$output_limit], rates:[$rate_start:$rate_limit], save as: $OUTPUT_FILE, workload: $workload"

 if [[ $dry_run == 1 ]]; then
     echo "Dry run enabled, skip profiling."
     exit
 fi

 # Run the benchmark for each combination
 echo "Starting benchmark..."
 input_len=$input_start
 while [[ $input_len -le $input_limit ]]; do
     output_len=$output_start
     while [[ $output_len -le $output_limit ]]; do
-        req_rate=$rate_start
-        while [[ $req_rate -le $rate_limit ]]; do
-            python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$req_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --stream $workload 1>>${OUTPUT_FILE}
+        # Make sure all arguments are passed in the correct order
+        generate_workload "$input_len" "$output_len" "$LLM_API_KEY" "$TOTAL" "$MODEL"
+
+        # Convert rate_start to integer (multiply by 100 and remove decimals)
+        req_rate=$(echo "$rate_start * 100" | bc | cut -d. -f1)
+        rate_limit_scaled=$(echo "$rate_limit * 100" | bc | cut -d. -f1)
+        while [[ $req_rate -le $rate_limit_scaled ]]; do
Review comment: Is it because -le can compare integers only, so we need to *100? And I suppose 0.125 is also somehow supported (inaccuracy is OK) by removing decimals? Just in case, I would suggest setting req_rate to at least 0.01 to avoid 0.

Reply: Yes,

Review comment: Let's add a comment here to tell the maintainer why 1000 is necessary.
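To illustrate the point under discussion: bash's [[ ... -le ... ]] operator only compares integers, so fractional request rates are scaled by 100 before the comparison and divided back when the rate is actually used. A minimal standalone sketch (the 0.25 starting rate is an illustrative value, not from the PR):

    # bc keeps the fraction; cut drops everything after the decimal point
    rate_start=0.25
    rate_limit=2
    req_rate=$(echo "$rate_start * 100" | bc | cut -d. -f1)          # 25
    rate_limit_scaled=$(echo "$rate_limit * 100" | bc | cut -d. -f1) # 200
    while [[ $req_rate -le $rate_limit_scaled ]]; do
        actual_rate=$(echo "scale=2; $req_rate / 100" | bc)  # .25, .50, 1.00, 2.00
        echo "benchmarking at $actual_rate req/s"
        req_rate=$((req_rate * 2))
    done

This also shows why the scale factor matters: with a factor of 100, the smallest representable rate is 0.01, which lines up with the reviewer's suggestion to keep req_rate at or above 0.01 so the scaled integer never truncates to 0.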
+            actual_rate=$(echo "scale=2; $req_rate / 100" | bc)
+
+            WORKLOAD_FILE="$PROMPT_DIR/prompt_in${input_len}_out${output_len}.json"
+            if [[ -f "$WORKLOAD_FILE" ]]; then
+                python $PATH_PREFIX/gpu_benchmark.py --backend=vllm --port 8010 --model=$MODEL --request-rate=$actual_rate --num-prompts=$TOTAL --input-len $input_len --output-len $output_len --api-key "$LLM_API_KEY" --temperature "$TEMPERATURE" --workload_dataset_file "$WORKLOAD_FILE" --stream >> "$OUTPUT_FILE"
+            fi
             req_rate=$((req_rate * 2))
         done
         output_len=$((output_len * 2))
     done
     input_len=$((input_len * 2))
 done

 echo "Profiling finished."
Review comment: In fact, I don't think hardcoding all files into ${PATH_PREFIX}/result/ is a good idea. In particular, the Python runtime is installed in a special folder when running inside Docker, so it becomes difficult to know where the output ends up. The same problem applies to PROMPT_DIR.
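One way to address this (a sketch, not part of the PR; the RESULT_DIR variable name is hypothetical) is to let an environment variable override the default location:

    # Fall back to the script directory only when the caller has not set RESULT_DIR
    RESULT_DIR="${RESULT_DIR:-${PATH_PREFIX}/result}"
    OUTPUT_FILE="${RESULT_DIR}/${FILE_NAME}.jsonl"
    PROMPT_DIR="${RESULT_DIR}/prompts"
    mkdir -p "$(dirname "$OUTPUT_FILE")" "$PROMPT_DIR"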