export HF_TOKEN=your_huggingface_token
export AWS_ACCESS_KEY_ID=your_s3_access_id
export AWS_SECRET_ACCESS_KEY=your_s3_access_key
1. Download the full training dataset from Hugging Face: The Stack V2 - Train Full IDs
bash download_huggingface.sh | xargs -n 1 -P 20 wget -c --no-check-certificate --header="Authorization: Bearer ${HF_TOKEN}"
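If the wget-based mirroring is unreliable in your environment, a roughly equivalent alternative is to pull the shards with `huggingface_hub`. This is only a minimal sketch: the dataset id `bigcode/the-stack-v2-train-full-ids` and the local directory name are assumptions, so adjust them to match the URLs emitted by `download_huggingface.sh`.

```python
import os
from huggingface_hub import snapshot_download

# Download all shards of the train-full-ids dataset into a local folder.
# Re-running skips files that have already been fetched.
snapshot_download(
    repo_id="bigcode/the-stack-v2-train-full-ids",  # assumed dataset id
    repo_type="dataset",
    local_dir="the-stack-v2-train-full-ids",        # assumed target directory
    token=os.environ["HF_TOKEN"],
)
```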
2. Extract blob_id, repo_name, file_path, encode, language from the-stack-v2-train-full-ids into ./stackv2/blob_ids/:
python extract_info.py
Data files will be stored in ./stackv2/blob_ids/ with the .jsonl suffix.
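For reference, the extraction step amounts to roughly the following. This is a minimal sketch, not the repo's extract_info.py; the input directory, shard naming, and exact column names are assumptions based on the field list above.

```python
import glob
import os
import pandas as pd

IN_DIR = "the-stack-v2-train-full-ids"   # assumed location of the downloaded parquet shards
OUT_DIR = "./stackv2/blob_ids"
COLUMNS = ["blob_id", "repo_name", "file_path", "encode", "language"]  # per the list above

os.makedirs(OUT_DIR, exist_ok=True)
for shard in sorted(glob.glob(os.path.join(IN_DIR, "*.parquet"))):
    # Read only the columns we need and write one JSONL file per parquet shard.
    df = pd.read_parquet(shard, columns=COLUMNS)
    out_path = os.path.join(OUT_DIR, os.path.basename(shard).replace(".parquet", ".jsonl"))
    df.to_json(out_path, orient="records", lines=True)
```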
3. Download the blob contents from S3 with get_s3.py; each process handles a range of the data, so the work can be split across several machines:

# On machine A
nohup python get_s3.py 0 500 > log0_500.txt 2>&1 &
nohup python get_s3.py 500 1000 > log500_1000.txt 2>&1 &
nohup python get_s3.py 1000 1500 > log1000_1500.txt 2>&1 &
nohup python get_s3.py 1500 2000 > log1500_2000.txt 2>&1 &
nohup python get_s3.py 2000 2500 > log2000_2500.txt 2>&1 &
# On machine B
nohup python get_s3.py 2500 3000 > log2500_3000.txt 2>&1 &
nohup python get_s3.py 3000 3500 > log3000_3500.txt 2>&1 &
nohup python get_s3.py 3500 4000 > log3500_4000.txt 2>&1 &
nohup python get_s3.py 4000 4500 > log4000_4500.txt 2>&1 &
nohup python get_s3.py 4500 5000 > log4500_5000.txt 2>&1 &
TIPS: Each machine should have at least 24 GB of memory (running 5 processes actually uses about 15 GB).
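For context, each get_s3.py invocation takes a start and end index and fetches the corresponding blobs from S3. Below is a minimal sketch of that loop, not the repo's script; it assumes the public `softwareheritage` bucket layout `content/<blob_id>` with gzip-compressed objects (as described on the Stack v2 dataset card) and that the two CLI arguments index the extracted .jsonl shard files.

```python
import glob
import gzip
import json
import os
import sys
import boto3

# S3 client using the credentials exported earlier.
session = boto3.Session(
    aws_access_key_id=os.environ["AWS_ACCESS_KEY_ID"],
    aws_secret_access_key=os.environ["AWS_SECRET_ACCESS_KEY"],
)
s3 = session.client("s3")

def download_blob(blob_id, encoding="utf-8"):
    # Blobs are stored gzip-compressed under content/<blob_id> in the
    # softwareheritage bucket (assumption based on the dataset card).
    obj = s3.get_object(Bucket="softwareheritage", Key=f"content/{blob_id}")
    return gzip.decompress(obj["Body"].read()).decode(encoding, errors="replace")

start, end = int(sys.argv[1]), int(sys.argv[2])
shards = sorted(glob.glob("stackv2/blob_ids/*.jsonl"))[start:end]
for shard in shards:
    with open(shard) as fin:
        for line in fin:
            rec = json.loads(line)
            text = download_blob(rec["blob_id"], rec.get("encode", "utf-8"))
            # ... write `text` wherever your pipeline expects the raw source files
```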