Skip to content

Commit adc15e1

Browse files
goodship1, jeffra, and tjruwase
authored
Update curriculum-learning.md (deepspeedai#3031)
Co-authored-by: Jeff Rasley <[email protected]> Co-authored-by: Olatunji Ruwase <[email protected]>
1 parent 1f85569 commit adc15e1

File tree

2 files changed

+25
-24
lines changed

2 files changed

+25
-24
lines changed

docs/_tutorials/curriculum-learning.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ In our [paper](https://arxiv.org/abs/2108.06084) section 5.4 we demonstrate that
130130

131131
### 2.3 Token-based training termination
132132

133-
Because curriculum learning changes length of each sequence/sample during training, it is very hard/impossible to use number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
133+
Because curriculum learning changes the length of each sequence/sample during training, it is very hard/impossible to use a number of steps/samples to terminate the training exactly at the desired number of tokens. Thus, we add a `--train-tokens` config for accurate token-based termination. We recommend increasing your original `--train-samples` or `--train-iters` to a large enough number (e.g., 3X of what you used for baseline), and set `--train-tokens` at the exact desired number of training tokens.
134134

135135
### 2.4 Token-based LR decay
136136

setup.py

Lines changed: 24 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,11 @@
66
DeepSpeed library
77
88
To build wheel on Windows:
9-
1. Install pytorch, such as pytorch 1.12 + cuda 11.6
10-
2. Install visual cpp build tool
11-
3. Include cuda toolkit
12-
4. Launch cmd console with Administrator privilege for creating required symlink folders
9+
1. Install pytorch, such as pytorch 1.12 + cuda 11.6.
10+
2. Install visual cpp build tool.
11+
3. Include cuda toolkit.
12+
4. Launch cmd console with Administrator privilege for creating required symlink folders.
13+
1314
1415
Create a new wheel via the following command:
1516
build_win.bat
@@ -36,7 +37,7 @@
3637
from op_builder.all_ops import ALL_OPS
3738
from op_builder.builder import installed_cuda_version
3839

39-
# fetch rocm state
40+
# Fetch rocm state.
4041
is_rocm_pytorch = OpBuilder.is_rocm_pytorch()
4142
rocm_version = OpBuilder.installed_rocm_version()
4243

@@ -68,12 +69,12 @@ def fetch_requirements(path):
6869
'sd': fetch_requirements('requirements/requirements-sd.txt')
6970
}
7071

71-
# Add specific cupy version to both onebit extension variants
72+
# Add specific cupy version to both onebit extension variants.
7273
if torch_available and torch.cuda.is_available():
7374
cupy = None
7475
if is_rocm_pytorch:
7576
rocm_major, rocm_minor = rocm_version
76-
# XXX cupy support for rocm 5 is not available yet
77+
# XXX cupy support for rocm 5 is not available yet.
7778
if rocm_major <= 4:
7879
cupy = f"cupy-rocm-{rocm_major}-{rocm_minor}"
7980
else:
@@ -82,7 +83,7 @@ def fetch_requirements(path):
8283
extras_require['1bit'].append(cupy)
8384
extras_require['1bit_mpi'].append(cupy)
8485

85-
# Make an [all] extra that installs all needed dependencies
86+
# Make an [all] extra that installs all needed dependencies.
8687
all_extras = set()
8788
for extra in extras_require.items():
8889
for req in extra[1]:
@@ -91,7 +92,7 @@ def fetch_requirements(path):
9192

9293
cmdclass = {}
9394

94-
# For any pre-installed ops force disable ninja
95+
# For any pre-installed ops force disable ninja.
9596
if torch_available:
9697
from accelerator import get_accelerator
9798
cmdclass['build_ext'] = get_accelerator().build_extension().with_options(use_ninja=False)
@@ -104,7 +105,7 @@ def fetch_requirements(path):
104105
TORCH_MINOR = "0"
105106

106107
if torch_available and not torch.cuda.is_available():
107-
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486
108+
# Fix to allow docker builds, similar to https://github.com/NVIDIA/apex/issues/486.
108109
print("[WARNING] Torch did not find cuda available, if cross-compiling or running with cpu only "
109110
"you can ignore this message. Adding compute capability for Pascal, Volta, and Turing "
110111
"(compute capabilities 6.0, 6.1, 6.2)")
@@ -148,26 +149,26 @@ def op_enabled(op_name):
148149
op_compatible = builder.is_compatible()
149150
compatible_ops[op_name] = op_compatible
150151

151-
# If op is requested but not available, throw an error
152+
# If op is requested but not available, throw an error.
152153
if op_enabled(op_name) and not op_compatible:
153154
env_var = op_envvar(op_name)
154155
if env_var not in os.environ:
155156
builder.warning(f"One can disable {op_name} with {env_var}=0")
156157
abort(f"Unable to pre-compile {op_name}")
157158

158-
# if op is compatible but install is not enabled (JIT mode)
159+
# If op is compatible but install is not enabled (JIT mode).
159160
if is_rocm_pytorch and op_compatible and not op_enabled(op_name):
160161
builder.hipify_extension()
161162

162-
# If op install enabled, add builder to extensions
163+
# If op install enabled, add builder to extensions.
163164
if op_enabled(op_name) and op_compatible:
164165
assert torch_available, f"Unable to pre-compile {op_name}, please first install torch"
165166
install_ops[op_name] = op_enabled(op_name)
166167
ext_modules.append(builder.builder())
167168

168169
print(f'Install Ops={install_ops}')
169170

170-
# Write out version/git info
171+
# Write out version/git info.
171172
git_hash_cmd = "git rev-parse --short HEAD"
172173
git_branch_cmd = "git rev-parse --abbrev-ref HEAD"
173174
if command_exists('git') and 'DS_BUILD_STRING' not in os.environ:
@@ -200,38 +201,38 @@ def create_dir_symlink(src, dest):
200201
create_dir_symlink('..\\accelerator', '.\\deepspeed\\accelerator')
201202
egg_info.manifest_maker.template = 'MANIFEST_win.in'
202203

203-
# Parse the DeepSpeed version string from version.txt
204+
# Parse the DeepSpeed version string from version.txt.
204205
version_str = open('version.txt', 'r').read().strip()
205206

206207
# Build specifiers like .devX can be added at install time. Otherwise, add the git hash.
207-
# example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel
208+
# Example: DS_BUILD_STRING=".dev20201022" python setup.py sdist bdist_wheel.
208209

209-
# Building wheel for distribution, update version file
210+
# Building wheel for distribution, update version file.
210211
if 'DS_BUILD_STRING' in os.environ:
211-
# Build string env specified, probably building for distribution
212+
# Build string env specified, probably building for distribution.
212213
with open('build.txt', 'w') as fd:
213214
fd.write(os.environ.get('DS_BUILD_STRING'))
214215
version_str += os.environ.get('DS_BUILD_STRING')
215216
elif os.path.isfile('build.txt'):
216-
# build.txt exists, probably installing from distribution
217+
# build.txt exists, probably installing from distribution.
217218
with open('build.txt', 'r') as fd:
218219
version_str += fd.read().strip()
219220
else:
220-
# None of the above, probably installing from source
221+
# None of the above, probably installing from source.
221222
version_str += f'+{git_hash}'
222223

223224
torch_version = ".".join([TORCH_MAJOR, TORCH_MINOR])
224225
bf16_support = False
225-
# Set cuda_version to 0.0 if cpu-only
226+
# Set cuda_version to 0.0 if cpu-only.
226227
cuda_version = "0.0"
227228
nccl_version = "0.0"
228-
# Set hip_version to 0.0 if cpu-only
229+
# Set hip_version to 0.0 if cpu-only.
229230
hip_version = "0.0"
230231
if torch_available and torch.version.cuda is not None:
231232
cuda_version = ".".join(torch.version.cuda.split('.')[:2])
232233
if sys.platform != "win32":
233234
if isinstance(torch.cuda.nccl.version(), int):
234-
# This will break if minor version > 9
235+
# This will break if minor version > 9.
235236
nccl_version = ".".join(str(torch.cuda.nccl.version())[:2])
236237
else:
237238
nccl_version = ".".join(map(str, torch.cuda.nccl.version()[:2]))

0 commit comments

Comments (0)