How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

rewrite this content and keep HTML tags as is. This is content from rss feed and I don’t need their *Daily Debrief Newsletter*, their tags from bottom like this *Share this articleCategoriesTags*, Editorial Process section, phrases like *Featured image from Peakpx, chart from Tradingview.com*, SPECIAL OFFERS and similar sections – just remove such sections and save only article itself:

# Section banner for the end-to-end benchmark.
print("\n### SECTION D: end-to-end Transformer (vanilla fp32 vs Apex fused + AMP) ###")

# Benchmark hyper-parameters: vocabulary size, model width, attention heads,
# number of blocks, sequence length, batch size, and number of timed steps.
VOCAB, D, NHEAD, LAYERS, SEQ, BATCH, STEPS = 2000, 256, 4, 4, 128, 32, 60
class Block(torch.nn.Module):
    """Pre-norm Transformer block: self-attention and a GELU MLP, each with a residual.

    Args:
        d: Embedding (model) dimension.
        nhead: Number of attention heads passed to ``MultiheadAttention``.
        norm_cls: Normalization layer class, called as ``norm_cls(d)``
            (e.g. ``torch.nn.LayerNorm`` or a fused drop-in replacement).
    """

    def __init__(self, d, nhead, norm_cls):
        super().__init__()
        # batch_first=True: inputs are laid out as (batch, seq, d).
        self.attn = torch.nn.MultiheadAttention(d, nhead, batch_first=True)
        self.ff = torch.nn.Sequential(
            torch.nn.Linear(d, 4 * d),
            torch.nn.GELU(),
            torch.nn.Linear(4 * d, d),
        )
        self.n1, self.n2 = norm_cls(d), norm_cls(d)

    def forward(self, x):
        """Apply normalized self-attention and the MLP, adding each back onto ``x``."""
        h = self.n1(x)
        # need_weights=False skips returning attention weights; [0] is the output.
        x = x + self.attn(h, h, h, need_weights=False)[0]
        return x + self.ff(self.n2(x))
class TinyTransformer(torch.nn.Module):
    """Small token-level Transformer used as the benchmark workload.

    The model is an embedding, ``LAYERS`` stacked ``Block`` modules, a final
    normalization and a linear head producing ``VOCAB`` logits per position.
    Sizes come from the module-level constants ``VOCAB``, ``D``, ``NHEAD``
    and ``LAYERS``.

    Args:
        norm_cls: Normalization layer class used in every block and for the
            final norm, called as ``norm_cls(D)``.
    """

    def __init__(self, norm_cls):
        super().__init__()
        self.emb = torch.nn.Embedding(VOCAB, D)
        self.blocks = torch.nn.ModuleList(
            [Block(D, NHEAD, norm_cls) for _ in range(LAYERS)]
        )
        self.norm = norm_cls(D)
        self.head = torch.nn.Linear(D, VOCAB)

    def forward(self, idx):
        """Map integer token ids ``idx`` to per-position vocabulary logits."""
        x = self.emb(idx)
        for b in self.blocks:
            x = b(x)
        return self.head(self.norm(x))
# Fixed-seed CPU generator so both training runs see the same random tokens.
g = torch.Generator(device="cpu").manual_seed(0)
data = torch.randint(0, VOCAB, (BATCH, SEQ + 1), generator=g).to(DEV)
# Next-token setup: targets are the inputs shifted left by one position.
inp, tgt = data[:, :-1], data[:, 1:]
lossfn = torch.nn.CrossEntropyLoss()
def run_training(use_apex):
    """Train a fresh ``TinyTransformer`` for ``STEPS`` timed steps and report throughput.

    Args:
        use_apex: When true, run under fp16 autocast with gradient scaling,
            and use ``FusedLayerNorm`` / ``FusedAdam`` where the matching
            availability flags (``HAS_FLN`` / ``HAS_AMP_C`` and ``APEX_OK``)
            are set. Otherwise use ``torch.nn.LayerNorm`` and
            ``torch.optim.AdamW`` with autocast and scaling disabled.

    Returns:
        Tuple ``(final_loss, tokens_per_second, elapsed_seconds)`` where
        ``final_loss`` is the loss of the last timed step as a Python float.
    """
    torch.manual_seed(0)  # same seed for every run
    norm_cls = (
        FusedLayerNorm if (use_apex and HAS_FLN and APEX_OK) else torch.nn.LayerNorm
    )
    model = TinyTransformer(norm_cls).to(DEV)
    if use_apex and HAS_AMP_C and APEX_OK:
        optimizer = FusedAdam(model.parameters(), lr=3e-4)
    else:
        optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4)
    # With enabled=False the scaler calls below pass through without scaling.
    scaler = torch.amp.GradScaler("cuda", enabled=use_apex)

    def one_step():
        """Run one forward/backward/optimizer step and return the detached loss."""
        optimizer.zero_grad(set_to_none=True)
        with torch.amp.autocast("cuda", dtype=torch.float16, enabled=use_apex):
            logits = model(inp)
            loss = lossfn(logits.reshape(-1, VOCAB), tgt.reshape(-1))
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        # Detach so the caller does not keep the autograd graph alive.
        return loss.detach()

    # Warm-up steps, excluded from the timing below.
    for _ in range(5):
        one_step()
    # Synchronize so pending GPU work is not attributed to the timed region.
    torch.cuda.synchronize()
    t0 = time.perf_counter()
    for _ in range(STEPS):
        loss = one_step()
    torch.cuda.synchronize()
    dt = time.perf_counter() - t0
    return loss.item(), (STEPS * BATCH * SEQ) / dt, dt
loss_v, tps_v, dt_v = run_training(use_apex=False)
print(f” vanilla (fp32, nn.LayerNorm, AdamW) : ”
f”{dt_v:5.2f}s | {tps_v:9.0f} tok/s | final loss {loss_v:.3f}”)
if APEX_OK and (HAS_AMP_C or HAS_FLN):
loss_a, tps_a, dt_a = run_training(use_apex=True)
print(f” apex (fp16, FusedLayerNorm, FusedAdam) : ”
f”{dt_a:5.2f}s | {tps_a:9.0f} tok/s | final loss {loss_a:.3f}”)
print(f” —-> speedup: {tps_a / tps_v:0.2f}x throughput”)
else:
print(” apex path SKIPPED (no fused kernels built)”)
print(“\n” + “=” * 78)
print(“DONE. Key takeaways:”)
print(” – FusedAdam/FusedLayerNorm/FusedRMSNorm are the still-relevant Apex pieces;”)
print(” speedups grow with model size & parameter count (tiny demo understates it).”)
print(” – apex.amp is deprecated -> prefer torch.amp.autocast + torch.amp.GradScaler.”)
print(” – FusedAdam composes cleanly with native torch.amp (Section D).”)
print(” – On real workloads, also try a larger model and bf16 autocast (no scaler needed).”)
print(“=” * 78)

How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

Media Advisory: MIT to establish regional quantum hub | MIT News

OpenAI governance frameworks secure enterprise AI deployments

Liquid AI Releases LFM2.5-8B-A1B: An On-Device MoE Model With 8.3B Total and 1.5B Active Parameters

Building AI models that understand chemical principles | MIT News

Thanks! We sent a confirmation message to your inbox.

rewrite this title in other words: Asian Markets Trade Mixed | Nasdaq

How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

Radiant to Wind Down After Failing to Recover From 2024 Hack

Updated Essential AI Skills For 2026

How to Use Google Gemini AI (Full Tutorial)

Top Insights

EdgeX Blames Outsider for EDGE Token Crash as ZachXBT Alleges Insider Manipulation

rewrite this title in other words: Sosnick Warns Crypto’s ‘Tourists’ Are Cashing out as Bitcoin ETFs Bleed $1.42 Billion

How to Speed Up Transformer Training Using NVIDIA Apex (FusedAdam, FusedLayerNorm) and Native torch.amp

Related Posts