Has anyone here used torch.profiler?

younggod · 33 days ago
{
    "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 1, "tid": 7,
    "ts": 1713351140570122, "dur": 386,
    "args": {
        "External id": 1529,
        "device": 1, "context": 1,
        "stream": 7, "correlation": 1529,
        "bytes": 163840000, "memory bandwidth (GB/s)": 424.2921773719471
    }
}
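
For reference, the "memory bandwidth (GB/s)" field appears to be just bytes divided by duration: dur is in microseconds in the Chrome trace format, and 163,840,000 bytes happens to be exactly the size of a 6400 x 6400 float32 tensor. A quick sanity check, using only the constants copied from the event above:

# Sanity check of the "memory bandwidth (GB/s)" field in the trace event above.
# Assumption: dur is in microseconds (Chrome trace format) and GB means 1e9 bytes.
bytes_copied = 163_840_000   # "bytes" from the event
dur_us = 386                 # "dur" from the event
print(bytes_copied / (dur_us * 1e-6) / 1e9)   # ~424.5, close to the reported 424.29 GB/s

The small gap to 424.29 GB/s presumably comes from dur being rounded to whole microseconds.
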
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.profiler
from torch.nn.parallel import DistributedDataParallel as DDP

def setup(rank, world_size):
    # Initialize the NCCL process group on localhost and bind this process to its GPU.
    import os
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(6400, 6400)

    def forward(self, x):
        return self.fc(x)

def demo_basic(rank, world_size):
    setup(rank, world_size)
    
    # Create model and move it to GPU with id rank
    model = SimpleModel().to(rank)
    model = DDP(model, device_ids=[rank])
    
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    
    # Create a random tensor to simulate input data
    inputs = torch.randn(200, 6400).to(rank)
    labels = torch.randn(200, 6400).to(rank)
    
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),  # Per cycle: skip 1 step, warm up for 1, record 3.
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),
        profile_memory=True,  # Track memory allocation/deallocation.
        with_stack=True
    ) as prof:
        for _ in range(10):
            outputs = model(inputs)
            loss = nn.functional.mse_loss(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            prof.step()  # Advance the profiler schedule by one training step.
    
    cleanup()

def main():
    world_size = 2
    torch.multiprocessing.spawn(demo_basic,
                                args=(world_size,),
                                nprocs=world_size,
                                join=True)

if __name__ == "__main__":
    main()

The first code block is one event from the trace produced by the script in the second code block. I want to measure the communication bandwidth between the two cards, but the bandwidth in that trace event has me confused: how can it reach 400+ GB/s? The hardware is a single machine with two RTX 4090s on PCIe 4.0 x16, which in theory tops out at roughly 32 GB/s per direction. I also measured the bandwidth with P2P disabled using https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/p2pBandwidthLatencyTest (output in the third block below, followed by a small cross-check sketch). Would appreciate it if anyone could shed some light on this.

Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
   D\D     0      1 
     0 919.12   2.28 
     1   2.49 812.51
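
One way to cross-check both numbers is to time an explicit cuda:0 -> cuda:1 copy in PyTorch and derive the bandwidth from wall-clock time. A minimal sketch, assuming a 6400 x 6400 float32 tensor (163,840,000 bytes, the same size as the memcpy event above) and arbitrary warmup/repeat counts:

import time
import torch

# Time explicit copies from cuda:0 to cuda:1 and derive the effective bandwidth.
# The tensor size matches the 163,840,000-byte memcpy event above; the warmup and
# repeat counts (5 and 20) are arbitrary choices.
src = torch.randn(6400, 6400, device="cuda:0")
dst = torch.empty(6400, 6400, device="cuda:1")

for _ in range(5):                 # warmup
    dst.copy_(src)
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)

n_iters = 20
t0 = time.perf_counter()
for _ in range(n_iters):
    dst.copy_(src)
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)
elapsed = time.perf_counter() - t0

total_bytes = src.numel() * src.element_size() * n_iters
print(f"cuda:0 -> cuda:1 copy: {total_bytes / elapsed / 1e9:.2f} GB/s")

Comparing this figure with the 424 GB/s from the trace and the ~2.3-2.5 GB/s off-diagonal cells of the matrix should help pin down which number reflects the actual card-to-card path.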