Has anyone here used torch.profiler?

younggod · 33 days ago
{
    "ph": "X", "cat": "gpu_memcpy", "name": "Memcpy DtoD (Device -> Device)", "pid": 1, "tid": 7,
    "ts": 1713351140570122, "dur": 386,
    "args": {
        "External id": 1529,
        "device": 1, "context": 1,
        "stream": 7, "correlation": 1529,
        "bytes": 163840000, "memory bandwidth (GB/s)": 424.2921773719471
    }
}
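
For reference, the "memory bandwidth (GB/s)" field appears to be just bytes divided by duration: dur is in microseconds in the Chrome trace format, and 163,840,000 bytes happens to be exactly the size of a 6400 x 6400 float32 tensor. A quick sanity check, using only the constants copied from the event above:

# Sanity check of the "memory bandwidth (GB/s)" field in the trace event above.
# Assumption: dur is in microseconds (Chrome trace format) and GB means 1e9 bytes.
bytes_copied = 163_840_000   # "bytes" from the event
dur_us = 386                 # "dur" from the event
print(bytes_copied / (dur_us * 1e-6) / 1e9)   # ~424.5, close to the reported 424.29 GB/s

The small gap to 424.29 GB/s presumably comes from dur being rounded to whole microseconds.
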
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.profiler
from torch.nn.parallel import DistributedDataParallel as DDP

def setup(rank, world_size):
    # Initialize the NCCL process group on localhost and bind this process to its GPU.
    import os
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)
    torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

class SimpleModel(nn.Module):
    def __init__(self):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(6400, 6400)

    def forward(self, x):
        return self.fc(x)

def demo_basic(rank, world_size):
    setup(rank, world_size)
    
    # Create model and move it to GPU with id rank
    model = SimpleModel().to(rank)
    model = DDP(model, device_ids=[rank])
    
    optimizer = optim.SGD(model.parameters(), lr=0.01)
    
    # Create a random tensor to simulate input data
    inputs = torch.randn(200, 6400).to(rank)
    labels = torch.randn(200, 6400).to(rank)
    
    with torch.profiler.profile(
        activities=[torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA],
        schedule=torch.profiler.schedule(wait=1, warmup=1, active=3),  # Per cycle: skip 1 step, warm up for 1, record 3.
        on_trace_ready=torch.profiler.tensorboard_trace_handler('./logs'),
        profile_memory=True,  # Track memory allocation/deallocation.
        with_stack=True
    ) as prof:
        for _ in range(10):
            outputs = model(inputs)
            loss = nn.functional.mse_loss(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            prof.step()  # Advance the profiler schedule by one training step.
    
    cleanup()

def main():
    world_size = 2
    torch.multiprocessing.spawn(demo_basic,
                                args=(world_size,),
                                nprocs=world_size,
                                join=True)

if __name__ == "__main__":
    main()

The first code block is one event from the trace produced by the script in the second code block. I want to measure the communication bandwidth between the two cards, but the bandwidth in that trace event has me confused: how can it reach 400+ GB/s? The hardware is a single machine with two RTX 4090s on PCIe 4.0 x16, which in theory tops out at roughly 32 GB/s per direction. I also measured the bandwidth with P2P disabled using https://github.com/NVIDIA/cuda-samples/tree/master/Samples/5_Domain_Specific/p2pBandwidthLatencyTest (output in the third block below, followed by a small cross-check sketch). Would appreciate it if anyone could shed some light on this.

Bidirectional P2P=Disabled Bandwidth Matrix (GB/s)
   D\D     0      1 
     0 919.12   2.28 
     1   2.49 812.51
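
One way to cross-check both numbers is to time an explicit cuda:0 -> cuda:1 copy in PyTorch and derive the bandwidth from wall-clock time. A minimal sketch, assuming a 6400 x 6400 float32 tensor (163,840,000 bytes, the same size as the memcpy event above) and arbitrary warmup/repeat counts:

import time
import torch

# Time explicit copies from cuda:0 to cuda:1 and derive the effective bandwidth.
# The tensor size matches the 163,840,000-byte memcpy event above; the warmup and
# repeat counts (5 and 20) are arbitrary choices.
src = torch.randn(6400, 6400, device="cuda:0")
dst = torch.empty(6400, 6400, device="cuda:1")

for _ in range(5):                 # warmup
    dst.copy_(src)
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)

n_iters = 20
t0 = time.perf_counter()
for _ in range(n_iters):
    dst.copy_(src)
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)
elapsed = time.perf_counter() - t0

total_bytes = src.numel() * src.element_size() * n_iters
print(f"cuda:0 -> cuda:1 copy: {total_bytes / elapsed / 1e9:.2f} GB/s")

Comparing this figure with the 424 GB/s from the trace and the ~2.3-2.5 GB/s off-diagonal cells of the matrix should help pin down which number reflects the actual card-to-card path.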