You're welcome. I haven't tinkered with this stuff in a while, but I remember that numba's various jit compilers handle this kind of problem very well. If you run into it often, go take a look at numba; with GPU acceleration it's even more fun ^_^ Here's a quick comparison in IPython:
In [1]: import numpy as np; from numba import guvectorize, float64
In [2]: a = np.random.random((100000,2,2))
In [3]: b = np.random.random((100000,2,2))
In [4]: c = np.matmul(a, b)
In [5]: d = np.array([np.dot(i,j) for i,j in zip(a,b)])
In [6]: @guvectorize([(float64[:,:], float64[:,:], float64[:,:])], '(n,m),(m,n)->(n,n)', target='parallel')  # other targets: 'cpu', 'cuda'
   ...: def mydot(a, b, res):
   ...:     # plain triple-loop matrix multiply; the gufunc signature makes
   ...:     # numba broadcast it over the stacked leading axis of the inputs
   ...:     for i in range(res.shape[0]):
   ...:         for j in range(res.shape[1]):
   ...:             tmp = 0.
   ...:             for k in range(a.shape[1]):
   ...:                 tmp += a[i, k] * b[k, j]
   ...:             res[i, j] = tmp
   ...:             
In [7]: e = mydot(a, b)
In [8]: np.allclose(c,e)
Out[8]: True
In [9]: np.allclose(c,d)
Out[9]: True
In [10]: %timeit mydot(a,b)
234 µs ± 4.02 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
In [11]: %timeit np.array(list(map(np.dot, a, b)))
210 ms ± 2.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [12]: %timeit np.array([np.dot(i,j) for i, j in zip(a,b)])
235 ms ± 5.87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [13]: %timeit np.matmul(a,b)
41.1 ms ± 90 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
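
If you want to try the GPU route I mentioned, the same gufunc can in principle be retargeted at CUDA. This is just a minimal sketch, assuming a CUDA-capable GPU and a numba build with CUDA support (mydot_gpu is an illustrative name; I haven't benchmarked this variant):

from numba import guvectorize, float64

# same kernel as above, compiled for the GPU via target='cuda'
@guvectorize([(float64[:,:], float64[:,:], float64[:,:])], '(n,m),(m,n)->(n,n)', target='cuda')
def mydot_gpu(a, b, res):
    for i in range(res.shape[0]):
        for j in range(res.shape[1]):
            tmp = 0.
            for k in range(a.shape[1]):
                tmp += a[i, k] * b[k, j]
            res[i, j] = tmp

It's called the same way: e = mydot_gpu(a, b). Whether it beats the parallel CPU target for tiny 2x2 matrices depends on host-to-device transfer overhead, so measure before committing.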