x64 | metaclassofnilblog

This blog post is about the folly of implementing your own synchronization primitives without thinking about what compilers are allowed to do. If you’re not into low-level C/x64 parallelism programming then you can safely skip it

In the Insieme project, we use one double-ended work stealing queue per hardware thread which can be independently accessed (read and write) at both ends. It’s implemented as a circular buffer with a 64 bit control word.

// =========== Circular work buffers
// front = top, back = bottom
// top INCLUSIVE, bottom EXCLUSIVE
// Length needs to be a power of 2!
//
//  8 |       |
//    |-------|
//  7 |       |  <- top_update
//  6 |       |
//    |-------|
//  5 |#######|  <- top_val
//  4 |#######|
//  3 |#######|
//    |-------|
//  2 |       |  <- bot_val
//  1 |       |
//    |-------|
//  0 |       |  <- bot_update

typedef union _irt_cwb_state {
	uint64 all;
	struct {
		union {
			uint32 top;
			struct {
				uint16 top_val;
				uint16 top_update;
			};
		};
		union {
			uint32 bot;
			struct {
				uint16 bot_val;
				uint16 bot_update;
			};
		};
	};
} irt_cwb_state;

typedef struct _irt_circular_work_buffer {
	volatile irt_cwb_state state;
	irt_work_item* items[IRT_CWBUFFER_LENGTH];
} irt_circular_work_buffer;

#define IRT_CWBUFFER_LENGTH 32
#define IRT_CWBUFFER_MASK (IRT_CWBUFFER_LENGTH-1)

// =========== Circular work buffers

// front = top, back = bottom

// top INCLUSIVE, bottom EXCLUSIVE

// Length needs to be a power of 2!

// 8 | |

// |-------|

// 7 | | <- top_update

// 6 | |

// |-------|

// 5 |#######| <- top_val

// 4 |#######|

// 3 |#######|

// |-------|

// 2 | | <- bot_val

// 1 | |

// |-------|

// 0 | | <- bot_update

typedef union _irt_cwb_state {

uint64 all;

struct {

union {

uint32 top;

struct {

uint16 top_val;

uint16 top_update;

};

union {

uint32 bot;

struct {

uint16 bot_val;

uint16 bot_update;

};

} irt_cwb_state;

typedef struct _irt_circular_work_buffer {

volatile irt_cwb_state state;

irt_work_item* items[IRT_CWBUFFER_LENGTH];

} irt_circular_work_buffer;

#define IRT_CWBUFFER_LENGTH 32

#define IRT_CWBUFFER_MASK (IRT_CWBUFFER_LENGTH-1)

The original code for adding a new item to this queue looked something like this:

void irt_cwb_push_front(irt_circular_work_buffer* wb, irt_work_item* wi) {
	// check feasibility
	irt_cwb_state state, newstate;
	for(;;) {
		state.all = wb->state.all;
		if(state.top_update != state.top_val) continue; // operation in progress on top
		// check for space
		newstate.all = state.all;
		newstate.top_update = (newstate.top_update+1) & IRT_CWBUFFER_MASK;
		if(newstate.top_update == state.bot_update 
			|| newstate.top_update == state.bot_val) continue; // not enough space in buffer, would be full after op
		// if we reach this point and no changes happened, we can perform our op
		if(irt_atomic_bool_compare_and_swap(&wb->state.all, state.all, newstate.all)) break; // repeat if state change since check
	}

	// write actual data to buffer
	wb->items[newstate.top_update] = wi;
	// finish operation
	wb->state.top_val = newstate.top_update;
}

void irt_cwb_push_front(irt_circular_work_buffer* wb, irt_work_item* wi) {

// check feasibility

irt_cwb_state state, newstate;

for(;;) {

state.all = wb->state.all;

if(state.top_update != state.top_val) continue; // operation in progress on top

// check for space

newstate.all = state.all;

newstate.top_update = (newstate.top_update+1) & IRT_CWBUFFER_MASK;

if(newstate.top_update == state.bot_update

|| newstate.top_update == state.bot_val) continue; // not enough space in buffer, would be full after op

// if we reach this point and no changes happened, we can perform our op

if(irt_atomic_bool_compare_and_swap(&wb->state.all, state.all, newstate.all)) break; // repeat if state change since check

}

// write actual data to buffer

wb->items[newstate.top_update] = wi;

// finish operation

wb->state.top_val = newstate.top_update;

}

Now, this generally worked fine in practice, but in unit tests around ever 21 millionth insertion failed. After chasing a few wrong leads I figured out that setting newstate to volatile fixed the issue. The problem with this, of course, is that it makes no sense. It’s a local variable stored on the stack of the executing thread – it can not be accessed by any other thread.

In the end, to understand the issue, looking into the generated assembler code for both versions was required. Here’s what gcc does in the nonvolatile version:

.LVL402:
.LBB2802:
	.loc 15 80 0
	movabs	r8, -4294901761
	.p2align 4,,10
	.p2align 3
.L409:
	.loc 15 76 0
	mov	rax, QWORD PTR [rdi]
	.loc 15 77 0
	mov	rdx, rax                 ; copy state
	shr	rdx, 16                  ; move top_update right
	cmp	dx, ax                   ; compare top_update w/ top_val
	jne	.L409                    ; if not equal restart
	.loc 15 80 0
	add	edx, 1                   ; top_update+1
	mov	rcx, rax                 ; get original state to c
	and	edx, 15                  ; & IRT_CWBUFFER_MASK
	and	rcx, r8                  ; delete bits 17 to 32 from original state in rcx
	movzx  r9d, dx                  ; r9d = lower 32 bits / zero extend top_update into 32 bits of r9
	sal	r9, 16                   ; move top_val back into position
	or     rcx, r9                  ; enter new top_update into original state
	.loc 15 81 0
	mov	r9, rax
	shr	r9, 48                   ; get bot_update in r9
	cmp	dx, r9w                  ; compare bot_update with newstate.top_update
	je     .L409                    ; no space, retry
	mov	r9, rax                  ; \
	shr	r9, 32                   ;  -> same story, bot_val
	cmp	dx, r9w                  ; /
	je	.L409 
	.loc 15 84 0
	lock cmpxchg	QWORD PTR [rdi], rcx
	jne	.L409
	.loc 15 88 0
	mov	rax, rdx
	.loc 15 90 0
	mov	WORD PTR [rdi], dx
	.loc 15 88 0
	and	eax, 15
	mov	QWORD PTR [rdi+8+rax*8], rsi
.LBE2802:
	.loc 15 91 0
	ret

.LVL402:

.LBB2802:

.loc 15 80 0

movabs r8, -4294901761

.p2align 4,,10

.p2align 3

.L409:

.loc 15 76 0

mov rax, QWORD PTR [rdi]

.loc 15 77 0

mov rdx, rax ; copy state

shr rdx, 16 ; move top_update right

cmp dx, ax ; compare top_update w/ top_val

jne .L409 ; if not equal restart

.loc 15 80 0

add edx, 1 ; top_update+1

mov rcx, rax ; get original state to c

and edx, 15 ; & IRT_CWBUFFER_MASK

and rcx, r8 ; delete bits 17 to 32 from original state in rcx

movzx r9d, dx ; r9d = lower 32 bits / zero extend top_update into 32 bits of r9

sal r9, 16 ; move top_val back into position

or rcx, r9 ; enter new top_update into original state

.loc 15 81 0

mov r9, rax

shr r9, 48 ; get bot_update in r9

cmp dx, r9w ; compare bot_update with newstate.top_update

je .L409 ; no space, retry

mov r9, rax ; \

shr r9, 32 ; -> same story, bot_val

cmp dx, r9w ; /

je .L409

.loc 15 84 0

lock cmpxchg QWORD PTR [rdi], rcx

jne .L409

.loc 15 88 0

mov rax, rdx

.loc 15 90 0

mov WORD PTR [rdi], dx

.loc 15 88 0

and eax, 15

mov QWORD PTR [rdi+8+rax*8], rsi

.LBE2802:

.loc 15 91 0

ret

And here’s the volatile one:

.L409:
.LBB2766:
	.loc 16 76 0
	mov	rax, QWORD PTR [rdi]
	.loc 16 77 0
	mov	rdx, rax
	shr	rdx, 16
	cmp	dx, ax
	jne	.L409
	.loc 16 79 0
	mov	QWORD PTR [rsp-16], rax
	.loc 16 80 0
	movzx	edx, WORD PTR [rsp-14]
	add	edx, 1
	and	edx, 15
	mov	WORD PTR [rsp-14], dx
	.loc 16 81 0
	movzx	ecx, WORD PTR [rsp-14]
	mov	rdx, rax
	shr	rdx, 48
	cmp	cx, dx
	je	.L409
	movzx	ecx, WORD PTR [rsp-14]
	mov	rdx, rax
	shr	rdx, 32
	cmp	cx, dx
	je	.L409
	.loc 16 84 0
	mov	rdx, QWORD PTR [rsp-16]
	lock cmpxchg	QWORD PTR [rdi], rdx
	jne	.L409
	.loc 16 88 0
	movzx	eax, WORD PTR [rsp-14]
	movzx	eax, ax
	mov	QWORD PTR [rdi+8+rax*8], rsi
	.loc 16 90 0
	movzx	eax, WORD PTR [rsp-14]
	mov	WORD PTR [rdi], ax
.LBE2766:
	.loc 16 91 0
	ret

.L409:

.LBB2766:

.loc 16 76 0

mov rax, QWORD PTR [rdi]

.loc 16 77 0

mov rdx, rax

shr rdx, 16

cmp dx, ax

jne .L409

.loc 16 79 0

mov QWORD PTR [rsp-16], rax

.loc 16 80 0

movzx edx, WORD PTR [rsp-14]

add edx, 1

and edx, 15

mov WORD PTR [rsp-14], dx

.loc 16 81 0

movzx ecx, WORD PTR [rsp-14]

mov rdx, rax

shr rdx, 48

cmp cx, dx

je .L409

movzx ecx, WORD PTR [rsp-14]

mov rdx, rax

shr rdx, 32

cmp cx, dx

je .L409

.loc 16 84 0

mov rdx, QWORD PTR [rsp-16]

lock cmpxchg QWORD PTR [rdi], rdx

jne .L409

.loc 16 88 0

movzx eax, WORD PTR [rsp-14]

movzx eax, ax

mov QWORD PTR [rdi+8+rax*8], rsi

.loc 16 90 0

movzx eax, WORD PTR [rsp-14]

mov WORD PTR [rdi], ax

.LBE2766:

.loc 16 91 0

ret

As you can see from the comments in the first version, we started interpreting the assembly from the top. That was a mistake. If you look at the last few lines, you can see the culprit. The line mov QWORD PTR [rdi+8+rax*8], rsi corresponds to wb->items[newstate.top_update] = wi; . In the non-volatile version, gcc decides to move that line below the unlocking of the data structure. This is a perfectly valid transformation, since there are no dependencies between the two lines (gcc is obviously unaware of any parallelism going on).

There are many ways to fix the issue: add a memory barrier ( __sync_synchronize in gcc), do the assignment using an atomic exchange operation, or if you want to stay in pure C: (wb->items[newstate.top_update] = wi) && (wb->state.top_val = newstate.top_update); . Which is admittedly ugly, and only works since wi is never NULL . Sadly, all of these options have a slight performance penalty. If anyone knows any other portable way to enforce the ordering of operations in this case, I’d be happy to hear about it.

And that’s it, more or less. Lessons learned: take care when implementing your own synchronizations. If you think you are taking care, take more care. And when comparing assembly, look at the obvious differences before starting to interpret the code top down.

metaclassofnilblog

Tag Archives: x64

New GeDoSaTo Version, Status Update

DX11 / 64 bit status

Download

Implementing your own synchronisation primitives is tricky