88096 – wrong inline AVX512F optimization

Bug 88096 - wrong inline AVX512F optimization

Summary: wrong inline AVX512F optimization

Status:	RESOLVED DUPLICATE of bug 86735

Alias:	None

Product:	gcc
Classification:	Unclassified
Component:	target (show other bugs)
Version:	8.2.0

Importance:	P3 normal
Target Milestone:	---
Assignee:	Not yet assigned to anyone

URL:
Keywords:	wrong-code

Depends on:
Blocks:

Reported:	2018-11-19 14:48 UTC by Thomas Monjalon
Modified:	2021-10-11 01:38 UTC (History)
CC List:	2 users (show)

See Also:
Host:
Target:	x86_64--, i?86--
Build:
Known to work:
Known to fail:
Last reconfirmed:

Attachments
Add an attachment (proposed patch, testcase, etc.)

Note You need to log in before you can comment on or make changes to this bug.

Description Thomas Monjalon 2018-11-19 14:48:47 UTC

In the DPDK project, a bug was found with a specific sequence of
inlined intrinsics code when AVX512F optimization is enabled.

Summary of the issue:
	- CPU: Intel Skylake
	- Linux environment: Ubuntu 18.04
	- Compiler: GCC 7 or 8 (7.3.0-27ubuntu1~18.04 or 8.2.0-1ubuntu2~18.04)
	- Compiler optimizations: -march=native, -O1 and higher
	- Scenario: testpmd application crashes when it starts forwarding
	- Behaviour: AVX2 version of rte_memcpy() fails if optimized for AVX512
	- Context: several nested inline functions
	- Workaround: disable AVX512 optimization with -mno-avx512f

The URL of the bug report in DPDK project is
        https://bugs.dpdk.org/show_bug.cgi?id=97

Steps to reproduce:
	- run Ubuntu 18.04 on Skylake CPU
		CPU check: grep -m1 avx512 /proc/cpuinfo
	- compile DPDK mlx5 PMD
		sudo apt-get install rdma-core
		git clone -b v18.11-rc1 http://dpdk.org/git/dpdk
		cd dpdk
		make defconfig
		sed -ri 's,(MLX5_PMD=)n,\1y,' build/.config
		sed -ri 's,(KMOD=)y,\1n,' build/.config
		sed -ri 's,(UIO=)y,\1n,' build/.config
		make -j EXTRA_CFLAGS=-ggdb
	- match bad instruction pattern
		gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \
		-ex 'disassemble/rs mlx5_tx_burst' | grep 'vmovdqu.\? .*\*8+0x[2-6]\]'

AVX512F is disabled in dpdk-18.11-rc2:
	http://git.dpdk.org/dpdk/commit/?id=8d07c82b

The DPDK code can be browsed at
	http://git.dpdk.org/dpdk/tree/lib/librte_eal/common/include/arch/x86/rte_memcpy.h#n305

Call stack of the inline functions:

        mlx5_tx_complete
          rte_mempool_put_bulk
            rte_mempool_generic_put
              __mempool_generic_put
                rte_memcpy
                  rte_memcpy_generic
                    rte_mov128
                      rte_mov32
                        _mm256_loadu_si256

Code of the low-level functions:

	static __rte_always_inline void
	rte_mov32(uint8_t *dst, const uint8_t *src)
	{
		__m256i ymm0;
		ymm0 = _mm256_loadu_si256((const __m256i *)src);
		_mm256_storeu_si256((__m256i *)dst, ymm0);
	}
	static inline void
	rte_mov128(uint8_t *dst, const uint8_t *src)
	{
		rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
		rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
		rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
		rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
	}

The bug appears in all calls of the inline function mlx5_tx_complete().
When disabling AVX512F, we see a different memory offset:

--- bad-rte_mov128-avx512-enabled
+++ good-rte_mov128-avx512-disabled
-	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0]
+	vmovdqu xmm0,XMMWORD PTR [rax*8+0x0]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1
	vmovups XMMWORD PTR [rsi],xmm0
	vextracti128 XMMWORD PTR [rsi+0x10],ymm0,0x1
-	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
+	vmovdqu xmm0,XMMWORD PTR [rax*8+0x20]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
	vmovups XMMWORD PTR [rsi+0x20],xmm0
	vextracti128 XMMWORD PTR [rsi+0x30],ymm0,0x1
-	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
+	vmovdqu xmm0,XMMWORD PTR [rax*8+0x40]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
	vmovups XMMWORD PTR [rsi+0x40],xmm0
	vextracti128 XMMWORD PTR [rsi+0x50],ymm0,0x1
-	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
+	vmovdqu xmm0,XMMWORD PTR [rax*8+0x60]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
	vmovups XMMWORD PTR [rsi+0x60],xmm0
	vextracti128 XMMWORD PTR [rsi+0x70],ymm0,0x1

As a result, the data copied with rte_memcpy() is corrupted,
from:
    0x0000000109c43e00    0x0000000109c434c0
    0x0000000109c42b80    0x0000000109c42240
    0x0000000109c41900    0x0000000109c40fc0
    0x0000000109c40680    0x0000000109c3fd40
    0x0000000109c3f400    0x0000000109c3eac0
    0x0000000109c3e180    0x0000000109c3d840
    0x0000000109c3cf00    0x0000000109c3c5c0
    0x0000000109c3bc80    0x0000000109c3b340
to:
    0x0000000109c43e00    0x0000000109c434c0
    0x0000000109c42b80    0x0000000109c42240
    0x34c00000000109c4    0x2b800000000109c4
    0x0000000109c40680    0x0000000109c3fd40
    0x09c434c000000001    0x09c42b8000000001
    0x0000000109c3e180    0x0000000109c3d840
    0x000109c434c00000    0x000109c42b800000
    0x0000000109c3bc80    0x0000000109c3b340

If further analysis is needed, the calls to rte_mov128() can be isolated
with this patch:

--- a/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
+++ b/lib/librte_eal/common/include/arch/x86/rte_memcpy.h
@@ -330,13 +330,19 @@ rte_mov64(uint8_t *dst, const uint8_t *src)
  * Copy 128 bytes from one location to another,
  * locations should not overlap.
  */
+#include <rte_atomic.h>
+static volatile int dpdk_bug97_marker __attribute__((used));
 static inline void
 rte_mov128(uint8_t *dst, const uint8_t *src)
 {
+	dpdk_bug97_marker = 0xdbdb97be; /* sequence begins */
+	rte_mb();
 	rte_mov32((uint8_t *)dst + 0 * 32, (const uint8_t *)src + 0 * 32);
 	rte_mov32((uint8_t *)dst + 1 * 32, (const uint8_t *)src + 1 * 32);
 	rte_mov32((uint8_t *)dst + 2 * 32, (const uint8_t *)src + 2 * 32);
 	rte_mov32((uint8_t *)dst + 3 * 32, (const uint8_t *)src + 3 * 32);
+	rte_mb();
+	dpdk_bug97_marker = 0xdbdb97ed; /* sequence ends */
 }

The disassembled sequence can be found with this kind of sed command:

	gdb -batch -ex 'file build/app/testpmd' -ex 'set disassembly-flavor intel' \
		-ex 'disassemble/rs mlx5_tx_burst' |
	sed -rn 's,.*0x00.*:[[:space:]]*([0-9a-f][0-9a-f][[:space:]])*,,p' |
	sed '/0xdbdb97be/,/0xdbdb97ed/!d' | sed '/0xdbdb97ed/s,$,\n---,'

Note: mlx5_tx_burst() is one example of a buggy function,
because it calls mlx5_tx_complete(), which is the top of the call stack above.

You can find below all four inline calls to rte_mov128() from mlx5_tx_burst().
Only the third call (from mlx5_tx_complete()) has the offsets bug.

	mov    DWORD PTR [rip+0x937b56],0xdbdb97be        # 0xde0db0 <dpdk_bug97_marker>
	lea    r13,[rdx+0x80]
	mov    QWORD PTR [rbp-0xe0],r8
	lea    r8,[rcx+0x80]
	mfence 
	vmovdqu8 xmm0,XMMWORD PTR [rdx]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1
	vmovups XMMWORD PTR [rcx],xmm0
	vextracti128 XMMWORD PTR [rcx+0x10],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1
	vmovups XMMWORD PTR [rcx+0x20],xmm0
	vextracti128 XMMWORD PTR [rcx+0x30],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1
	vmovups XMMWORD PTR [rcx+0x40],xmm0
	vextracti128 XMMWORD PTR [rcx+0x50],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1
	vmovups XMMWORD PTR [rcx+0x60],xmm0
	vextracti128 XMMWORD PTR [rcx+0x70],ymm0,0x1
	mfence 
	mov    DWORD PTR [rip+0x937acb],0xdbdb97ed        # 0xde0db0 <dpdk_bug97_marker>
	---
	mov    DWORD PTR [rip+0x937912],0xdbdb97be        # 0xde0db0 <dpdk_bug97_marker>
	lea    rsi,[rax-0x80]
	lea    r12,[rdx+0x80]
	mfence 
	vmovdqu8 xmm0,XMMWORD PTR [rdx]
	mov    rdi,QWORD PTR [rbp-0x60]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x10],0x1
	lea    rcx,[rdi+0xb0]
	vmovups XMMWORD PTR [rdi+0x30],xmm0
	vextracti128 XMMWORD PTR [rdi+0x40],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rdx+0x20]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x30],0x1
	vmovups XMMWORD PTR [rdi+0x50],xmm0
	vextracti128 XMMWORD PTR [rdi+0x60],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rdx+0x40]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x50],0x1
	vmovups XMMWORD PTR [rdi+0x70],xmm0
	vextracti128 XMMWORD PTR [rdi+0x80],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rdx+0x60]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rdx+0x70],0x1
	vmovups XMMWORD PTR [rdi+0x90],xmm0
	vextracti128 XMMWORD PTR [rdi+0xa0],ymm0,0x1
	mfence 
	mov    DWORD PTR [rip+0x93787c],0xdbdb97ed        # 0xde0db0 <dpdk_bug97_marker>
	---
	mov    DWORD PTR [rip+0x936fd2],0xdbdb97be        # 0xde0db0 <dpdk_bug97_marker>
	lea    rcx,[rax-0x80]
	sub    rdx,0xffffffffffffff80
	mfence 
	mov    rax,QWORD PTR [rbp-0xa0]
	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x0]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x10],0x1
	vmovups XMMWORD PTR [rdx-0x80],xmm0
	vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x2]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x30],0x1
	vmovups XMMWORD PTR [rdx-0x60],xmm0
	vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x4]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x50],0x1
	vmovups XMMWORD PTR [rdx-0x40],xmm0
	vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [rax*8+0x6]
	vinserti128 ymm0,ymm0,XMMWORD PTR [rax*8+0x70],0x1
	lea    rax,[r9+0x80]
	vmovups XMMWORD PTR [rdx-0x20],xmm0
	vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1
	mfence 
	mov    DWORD PTR [rip+0x936f24],0xdbdb97ed        # 0xde0db0 <dpdk_bug97_marker>
	---
	mov    DWORD PTR [rip+0x936e2f],0xdbdb97be        # 0xde0db0 <dpdk_bug97_marker>
	lea    rdi,[r9+0x80]
	add    rax,0xffffffffffffff80
	mfence 
	vmovdqu8 xmm0,XMMWORD PTR [r9]
	sub    rdx,0xffffffffffffff80
	vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x10],0x1
	vmovups XMMWORD PTR [rdx-0x80],xmm0
	vextracti128 XMMWORD PTR [rdx-0x70],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [r9+0x20]
	vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x30],0x1
	vmovups XMMWORD PTR [rdx-0x60],xmm0
	vextracti128 XMMWORD PTR [rdx-0x50],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [r9+0x40]
	vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x50],0x1
	vmovups XMMWORD PTR [rdx-0x40],xmm0
	vextracti128 XMMWORD PTR [rdx-0x30],ymm0,0x1
	vmovdqu8 xmm0,XMMWORD PTR [r9+0x60]
	vinserti128 ymm0,ymm0,XMMWORD PTR [r9+0x70],0x1
	vmovups XMMWORD PTR [rdx-0x20],xmm0
	vextracti128 XMMWORD PTR [rdx-0x10],ymm0,0x1
	mfence 
	mov    DWORD PTR [rip+0x936da9],0xdbdb97ed        # 0xde0db0 <dpdk_bug97_marker>

Comment 1 Andi Kleen 2018-11-29 00:21:25 UTC

Can you please attach a pre-processed test case of a file that shows the bug?

It's ok if it doesn't run, as long as the problem is clearly identified in the assembler.

Then the test case could be likely minimized.

Comment 2 Alexander Monakov 2018-11-29 14:09:48 UTC

This is very likely the following Binutils bug (the assembler, `as`, incorrectly encodes the displacement):

https://sourceware.org/bugzilla/show_bug.cgi?id=23465

(so it should work fine with binutils 2.29 or 2.31)

In GCC this was reported as PR 86735 and dups.

Comment 3 Andrew Pinski 2021-10-11 01:38:45 UTC

Since this is the only report of this, I am going to mark this as a dup of bug 86735, which means it was a binutils bug.

*** This bug has been marked as a duplicate of bug 86735 ***