In the project DPDK, a bug is found with a special sequence of inlined intrinsics code when AVX512F optimization is enabled. Summary of the issue: - CPU: Intel Skylake - Linux environment: Ubuntu 18.04 - Compiler: GCC 7 or 8 (7.3.0-27ubuntu1~18.04 or 8.2.0-1ubuntu2~18.04) - Compiler optimizations: -march=native, -O1 and higher - Scenario: testpmd application crashes when it starts forwarding - Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512 - Context: several nested inline functions - Workaround: disable AVX512 optimization with -mno-avx512f The URL of the bug report in DPDK project is https://bugs.dpdk.org/show_bug.cgi?id=97 Steps to reproduce: - run Ubuntu 18.04 on Skylake CPU CPU check: grep -m1 avx512 /proc/cpuinfo - compile DPDK mlx5 PMD sudo apt-get install rdma-core git clone -b v18.11-rc1 http://dpdk.org/git/dpdk cd dpdk make defconfig sed -ri 's,(MLX5_PMD=)n,\1y,' build/.config sed -ri 's,(KMOD=)y,\1n,' build/.config sed -ri 's,(UIO=)y,\1n,' build/.config make -j EXTRA_CFLAGS=-ggdb - match bad instruction pattern gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \ -ex 'disassemble/rs mlx5_tx_burst' | grep 'vmovdqu.\? 
.*\*8+0x[2-6]\]' AVX512F is disabled in dpdk-18.11-rc2: http://git.dpdk.org/dpdk/commit/?id=8d07c82b The DPDK code can be browsed at http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h#n305 Call stack of the inline functions: mlx5_tx_complete rte_mempool_put_bulk rte_mempool_generic_put __mempool_generic_put rte_memcpy rte_memcpy_generic rte_mov128 rte_mov32 _mm256_loadu_si256 Code of the low-level functions: static __rte_always_inline void rte_mov32(uint8_t *dst, const uint8_t *src) { __m256i ymm0; ymm0 = _mm256_loadu_si256((const __m256i *)src); _mm256_storeu_si256((__m256i *)dst, ymm0); } static inline void rte_mov128(uint8_t *dst, const uint8_t *src) { rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); } The bug appears in all calls of the inline function mlx5_tx_complete(). 
When disabling AVX512F, we see a different memory offset: --- bad-rte_mov128-avx512-enabled +++ good-rte_mov128-avx512-disabled - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x0] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1 vmovups XMMWORD PTR [rsi],xmm0 vextracti128 XMMWORD PTR [rsi+0x10],ymm0,0x1 - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1 vmovups XMMWORD PTR [rsi+0x20],xmm0 vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1 - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1 vmovups XMMWORD PTR [rsi+0x40],xmm0 vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1 - vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6] + vmovdqu xmm0,XMMWORD PTR [rax*8+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1 vmovups XMMWORD PTR [rsi+0x60],xmm0 vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1 The result is corrupting copied data with rte_memcpy() from: 0x0000000109c43e00 0x0000000109c434c0 0x0000000109c42b80 0x0000000109c42240 0x0000000109c41900 0x0000000109c40fc0 0x0000000109c40680 0x0000000109c3fd40 0x0000000109c3f400 0x0000000109c3eac0 0x0000000109c3e180 0x0000000109c3d840 0x0000000109c3cf00 0x0000000109c3c5c0 0x0000000109c3bc80 0x0000000109c3b340 to: 0x0000000109c43e00 0x0000000109c434c0 0x0000000109c42b80 0x0000000109c42240 0x34c00000000109c4 0x2b800000000109c4 0x0000000109c40680 0x0000000109c3fd40 0x09c434c000000001 0x09c42b8000000001 0x0000000109c3e180 0x0000000109c3d840 0x000109c434c00000 0x000109c42b800000 0x0000000109c3bc80 0x0000000109c3b340 If needed to do further analysis, we can isolate the calls to rte_mov128() with this patch: --- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h +++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h @@ -330,13 +330,19 @@ rte_mov64(uint8_t *dst, const uint8_t *src) * Copy 128 bytes from one location to another, * locations should not overlap. 
*/ +#include <rte_atomic.h> +static volatile int dpdk_bug97_marker __attribute__((used)); static inline void rte_mov128(uint8_t *dst, const uint8_t *src) { + dpdk_bug97_marker = 0xdbdb97be; /* sequence begins */ + rte_mb(); rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32); rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32); rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32); rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32); + rte_mb(); + dpdk_bug97_marker = 0xdbdb97ed; /* sequence ends */ } The disassembled sequence can be found with this kind of sed command: gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \ -ex 'disassemble/rs mlx5_tx_burst' | sed -rn 's,.*0x00.*:[[:space:]]*([0-9a-f][0-9a-f][[:space:]])*,,p' | sed '/0xdbdb97be/,/0xdbdb97ed/!d' | sed '/0xdbdb97ed/s,$,\n---,' Note: mlx5_tx_burst() is one example of buggy function, because calling mlx5_tx_complete(), which is the top of the call stack above. You can find below all four inline calls to rte_mov128() from mlx5_tx_burst(). Only the third call (from mlx5_tx_complete()) has the offsets bug. 
mov DWORD PTR [rip+0x937b56],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea r13,[rdx+0x80] mov QWORD PTR [rbp-0xe0],r8 lea r8,[rcx+0x80] mfence vmovdqu8 xmm0,XMMWORD PTR [rdx] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1 vmovups XMMWORD PTR [rcx],xmm0 vextracti128 XMMWORD PTR [rcx+0x10],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1 vmovups XMMWORD PTR [rcx+0x20],xmm0 vextracti128 XMMWORD PTR [rcx+0x30],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1 vmovups XMMWORD PTR [rcx+0x40],xmm0 vextracti128 XMMWORD PTR [rcx+0x50],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1 vmovups XMMWORD PTR [rcx+0x60],xmm0 vextracti128 XMMWORD PTR [rcx+0x70],ymm0,0x1 mfence mov DWORD PTR [rip+0x937acb],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker> --- mov DWORD PTR [rip+0x937912],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea rsi,[rax-0x80] lea r12,[rdx+0x80] mfence vmovdqu8 xmm0,XMMWORD PTR [rdx] mov rdi,QWORD PTR [rbp-0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1 lea rcx,[rdi+0xb0] vmovups XMMWORD PTR [rdi+0x30],xmm0 vextracti128 XMMWORD PTR [rdi+0x40],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1 vmovups XMMWORD PTR [rdi+0x50],xmm0 vextracti128 XMMWORD PTR [rdi+0x60],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1 vmovups XMMWORD PTR [rdi+0x70],xmm0 vextracti128 XMMWORD PTR [rdi+0x80],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1 vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1 vmovups XMMWORD PTR [rdi+0x50],xmm0 vextracti128 XMMWORD PTR [rdi+0x60],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1 vmovups XMMWORD PTR [rdi+0x70],xmm0 vextracti128 XMMWORD PTR [rdi+0x80],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR 
[rdx+0x70],0x1 vmovups XMMWORD PTR [rdi+0x90],xmm0 vextracti128 XMMWORD PTR [rdi+0xa0],ymm0,0x1 mfence mov DWORD PTR [rip+0x93787c],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker> --- mov DWORD PTR [rip+0x936fd2],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea rcx,[rax-0x80] sub rdx,0xffffffffffffff80 mfence mov rax,QWORD PTR [rbp-0xa0] vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1 vmovups XMMWORD PTR [rdx-0x80],xmm0 vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1 vmovups XMMWORD PTR [rdx-0x60],xmm0 vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1 vmovups XMMWORD PTR [rdx-0x40],xmm0 vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6] vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1 lea rax,[r9+0x80] vmovups XMMWORD PTR [rdx-0x20],xmm0 vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1 mfence mov DWORD PTR [rip+0x936f24],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker> --- mov DWORD PTR [rip+0x936e2f],0xdbdb97be # 0xde0db0 <dpdk_bug97_marker> lea rdi,[r9+0x80] add rax,0xffffffffffffff80 mfence vmovdqu8 xmm0,XMMWORD PTR [r9] sub rdx,0xffffffffffffff80 vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x10],0x1 vmovups XMMWORD PTR [rdx-0x80],xmm0 vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [r9+0x20] vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x30],0x1 vmovups XMMWORD PTR [rdx-0x60],xmm0 vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [r9+0x40] vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x50],0x1 vmovups XMMWORD PTR [rdx-0x40],xmm0 vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1 vmovdqu8 xmm0,XMMWORD PTR [r9+0x60] vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x70],0x1 vmovups XMMWORD PTR [rdx-0x20],xmm0 vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1 mfence mov DWORD PTR [rip+0x936da9],0xdbdb97ed # 0xde0db0 <dpdk_bug97_marker>
Can you please attach a pre-processed test case of a file that shows the bug? It's OK if it doesn't run, as long as the problem is clearly identifiable in the assembler output. The test case could then likely be minimized.
This is very likely the following Binutils bug (the assembler, `as`, incorrectly encodes the displacement): https://sourceware.org/bugzilla/show_bug.cgi?id=23465 (so it should work fine with binutils 2.29 or 2.31). In GCC this was reported as PR 86735 and its duplicates.
Since this is the only report of this, I am going to mark it as a duplicate of bug 86735, which means it was a binutils bug. *** This bug has been marked as a duplicate of bug 86735 ***