       gpu-arch-version: "12.4"
       timeout: 60
       script: |
+        set -xeou pipefail
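+        # -x: trace commands, -e: exit on error, -u: error on unset variables, -o pipefail: a pipeline fails if any stage fails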
         echo "::group::Print machine info"
         uname -a
         echo "::endgroup::"

         echo "::endgroup::"
 
         echo "::group::Run inference"
-        export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
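+        # MODEL_DIR now points at the checkpoint directory instead of /tmp, so ${MODEL_DIR}/tokenizer.model can resolve next to the checkpoint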
         export MODEL_NAME=stories15M
-        export MODEL_DIR=/tmp
+
 
         for DTYPE in bfloat16 float16 float32; do
           ###################################################################

         echo "tests complete"
         echo "******************************************"
         echo "::endgroup::"
+
+
+  test-sdpa-backends-export:
+    permissions:
+      id-token: write
+      contents: read
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      gpu-arch-type: cuda
+      gpu-arch-version: "12.4"
+      timeout: 60
+      script: |
+        set -xeou pipefail
+        echo "::group::Print machine info"
+        uname -a
+        echo "::endgroup::"
+
+        echo "::group::Install requirements"
+        ./install/install_requirements.sh cuda
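+        # (the "cuda" argument presumably selects CUDA-enabled PyTorch dependencies)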
+        pip3 list
+        python3 -c 'import torch;print(f"torch: {torch.__version__, torch.version.git_version}")'
+        echo "::endgroup::"
+
+        echo "::group::Download checkpoints"
+        mkdir -p checkpoints/stories15M
+        pushd checkpoints/stories15M
+        wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+        wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+        popd
+        echo "::endgroup::"
+
+        echo "::group::Run inference"
+        export MODEL_DIR=checkpoints/stories15M/
+        export MODEL_PATH=${MODEL_DIR}/stories15M.pt
+        export MODEL_NAME=stories15M
+
+        ./torchchat/utils/scripts/build_native.sh aoti
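+        # builds the C++ runner at ./cmake-out/aoti_run that is invoked below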
+
+        for DEVICE in cpu cuda; do
+          # Depending on how parameter passing works, we may only be able to test bfloat16 for aoti_run, similar to runner-cuda-dtype.yml
+          # (although the runner environment should not have an opinion about what we use in the artifact, and we might suitably abstract that)
+          for DTYPE in bfloat16 float16 float32; do
+            for SDPA in 'math' 'flash_attention' 'efficient_attention' 'cudnn_attention'; do
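+              # backend names mirror the torch.nn.attention.SDPBackend choices (MATH, FLASH_ATTENTION, EFFICIENT_ATTENTION, CUDNN_ATTENTION)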
+              echo "***************************************************************"
+              echo "*** $DEVICE $DTYPE $SDPA"
+              ###################################################################
+              # Export DSO and run with Python
+              python torchchat.py export --output-dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              python torchchat.py generate --dso-path dso.so --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE} --temperature 0 --prompt "Once upon a time"
+              ###################################################################
+              # Export AOTI package and run with aoti_run
+              python torchchat.py export --output-aoti-package-path /tmp/model.pt2 --checkpoint-path ${MODEL_PATH} --attention-backend ${SDPA} --device ${DEVICE} --dtype ${DTYPE}
+              ./cmake-out/aoti_run /tmp/model.pt2 -z ${MODEL_DIR}/tokenizer.model -i "Once upon a time"
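+              # aoti_run arguments: the exported .pt2 package, -z <tokenizer path>, -i <prompt>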
+              ###################################################################
+            done
+          done
+        done
+
+        echo "tests complete"
+        echo "******************************************"
+        echo "::endgroup::"