MASM: Bubble Sorting in Assembler lol!
Introduction
This is a scrappy post.
Bubble Sort is an awful algorithm except, apparently, when your list is small and already mostly sorted … Brilliant.
But it is simple, so I decided to try one in assembler and came up with a couple of methods. I also added a C version just to introduce a touch of competition. However, this did drop me down a bit of a rabbit hole.
Method 1: Use Last Element Address as Upper Bounds
Instead of using a register as a counter, I use lea
to calculate the last address of the array and use that to check if we’ve reached the end.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
; ==============================================================================
; NOTES
; ==============================================================================
; Volatile Registers: RAX, RCX, RDX, R8, R9, R10
; ==============================================================================
; Case-sensitivity
;
; There's actually an assembler option to switch this on, but
; will leave here for now.
; It's required when we're mixing assembler with C/C++
OPTION CASEMAP:NONE
; Typedefs
int32_t TYPEDEF SDWORD
.CODE
; ==============================================================================
; bubble_sort(uint32_t *array, uint32_t array_len)
; ==============================================================================
bubble_sort_asm PROC
; Want this all in registers, no vars!
; Storage
; =======
; RAX = Copy of array address used to reset scan (and subsequent return value)
; RBX = Address of last element of the array (must preserve on stack!)
; RCX = Paramater: Contains address of array
; RDX = Parameter: Contains length of array (Int32_t)
; R8D = Store for current value (n) being compared
; R9D = Store for (n+1) so it can be used to compare against n
; R10b = 1/0 based on whether we swapped values during last array scan
; Reserve 32-bytes home space, 1-byte to save RBX
sub rsp, 40
; (R)(E)BX is not volatile so have to preserve it
push rbx
; store array address in rax so, 1) it's ready to return when
; done 2) we can use it again when we're resetting the count
; for each array scan.
mov rax, rcx
; load rbx with address of last element in the array. We'll use
; this value in a `cmp` to check whether we've reached the end
; of the array (rather than use up another register/var with a
; separate counter)
; to avoid array[0]..array[Len-1] shennanigans
dec rdx
; numbers we're sorting are all 4-byte (32-bit) signed ints
lea rbx, [rcx + (rdx*4)]
; set R10b to 'false' (0)
xor r10b, r10b
check: ; have we reached end of the array? Check by comparing the
; address in rcx with the address in rbx (see above)
cmp rcx, rbx
jb compare
; we are at the end of the array, but did we swap any values
; during the previous scan?
cmp r10b, 0
; no, so we must be done sorting!
je done
; yep, so reset to start another scan through the array.
mov rcx, rax
xor r10b, r10b
compare: mov r8d, int32_t ptr [rcx]
mov r9d, int32_t ptr [rcx+4]
cmp r8d, r9d
jng continue
; we need to swap as the next elem is higher than the current
; one.
; already stored rcx value in r8d, so now get the n+1
; value from r9d
mov [rcx+4], r8d
mov [rcx], r9d
; Set r10b to true
mov r10b, 1
continue: add rcx, 4
jmp check
done: pop rbx
add rsp, 40
ret
bubble_sort_asm ENDP
; END of source file
END
Method 2: Use a Counter to Detect End of the Loop
Pretty much just as the title says. Register R8d is loaded with the array length and decremented on each pass.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
; ==============================================================================
; NOTES
; ==============================================================================
; Volatile Registers: RAX, RCX, RDX, R8, R9, R10
; ==============================================================================
; Case-sensitivity
;
; There's actually an assembler option to switch this on, but
; will leave here for now.
; It's required when we're mixing assembler with C/C++
OPTION CASEMAP:NONE
; Typedefs
int32_t TYPEDEF SDWORD
.CODE
externdef clock:proc
; ==============================================================================
; bubble_sort(uint32_t *array, uint32_t array_len)
; ==============================================================================
bubble_sort_asm PROC
; Want this all in registers, no vars!
; Storage
; =======
; RAX = Preserve array address / Return Sorteed Array Pointer
; RCX = Array pointer passed as parameter
; RDX = Array Length passed as parameter
; R8D = Counter
; R9D = Stores n for comparison
; R10D = Stores n+1 for comparison
; R11b = 1/0 on whether we swapped values during previous scan
sub rsp, 32
; store array address in rax so we can use it again when we're
; resetting for each array scan and have it ready for when we're
; all done.
mov rax, rcx
; store length so we can subtract as the counter
sub edx, 1
mov r8d, edx
; set R10b to 'false' (0)
xor r11b, r11b
; have we gone through all the elements? Check by comparing r8d
; (our counter) with 0
check: cmp r8d, 0
jne compare
; Did we exchange any values?
cmp r11b, 0
; Nope, we're done
je done
; 'Fraid so, reset for another loop
mov rcx, rax
mov r8d, edx
xor r11b, r11b
compare: ; We pull rcc[n] and rcx[n+1] into registers for comparison
mov r9d, [rcx]
mov r10d, [rcx+4]
cmp r9d, r10d
jle continue
; Swap
mov [rcx+4], r9d
mov [rcx], r10d
mov r11b, 1
continue: dec r8d
add rcx, 4
jmp check
done: add rsp, 32
ret
bubble_sort_asm ENDP
; END of source file
END
Here’s the Problem …
When I time execution against either of these functions they are consistently two seconds longer than the C counterpart. This is against the same array of integers with the same values generated in the same order. I really thought the assembler version would be quicker. What’s so special about the C compiled version?
I should note that, I call the functions from within a shell C program where they’re referenced as extern
s.
In the end I couldn’t resist pulling the compiled executable into IDA and reverse-engineering the compiled function disassembly to see if the cl.exe
had pulled any sneaky tricks.
To begin with, here’s the C version of the function.
C Version
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
void bubble_sort_c(int32_t *array, const int32_t array_len) {
bool swapped = true;
int32_t temp;
do {
swapped = false;
for(int i=0; i<array_len-1; i++) {
if (array[i] > array[i+1]) {
temp = array[i+1];
array[i+1] = array[i];
array[i] = temp;
swapped = true;
}
}
} while(swapped == true);
}
Disassembly
…And now the function disassembly. The table, below, describes what each register is used for.
Register | Purpose | |
---|---|---|
RDI | Address of array (element 0) | |
ECX | Initially used to store current array value (left-hand side), then loaded with next array value (right-hand side) just before comparison | |
R8d | Loaded with value of current element (left-hand value) | |
R9 | Contains address of current element (used as left-hand value) | |
R10d | Used as loop counter from number of elements in array down to zero | |
R11b | Used for ‘swapped’ flag (1/0) |
Now, the listing with my comments on the right. It’s important to point out that this function was inlined by the compiler whereas the assembler ones were not. This may well have a lot to do with the performance difference.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
mov ebx, eax ; Preserve EAX in EBX
nop word ptr [rax+rax+00h] ; Byte-Alignment (not a real instruction)
loc_1400010D0:
mov ecx, [rdi] ; RDI points to our array. Here we move the first value into ECX.
xor r11b, r11b ; Set R11b to 0
mov r9, rdi ; Copy address in RDI to R9 (first element of our array)
mov r10d, 3E7h ; Move 1000 (number of elements) into R10d, for upper bounds check
xchg ax, ax ; Byte-Alignment (not a real instruction, same op-code as NOP)
loc_1400010E0:
mov r8d, ecx ; Copy value in ECX into R8d (left-hand value)
mov ecx, [r9+4] ; Copy next array value (right-hand value) into ECX
cmp r8d, ecx ; Compare both values
jle short loc_1400010F9 ; If value in R8d (left-hand) is less than value in ECX (right-hand), no sorting required so jump to loc_1400010F9
mov [r9], ecx ; If not, then put value in ECX (right-hand value) into address pointed to by R9 (which is address of left-hand value)
mov r11b, 1 ; Set R11b to 1 to indicate we have swapped values in the array
mov ecx, r8d ; Move value in R8d (left-hand value) into ECX
mov [r9+4], r8d ; Move value in R8d (left-hand value) into address of right-hand value (we've basically swapped)
loc_1400010F9:
add r9, 4 ; Move to next element of array
sub r10, 1 ; Subtract 1 from the length of array
jnz short loc_1400010E0 ; If the above operation did not result in 0 then jump to loc_1400010E0 (top of loop)
cmp r11b, 1 ; Check if we swapped in last array scan
jz short loc_1400010D0 ; If yes, then jump back to top of function
Nothing miraculous that I can see. On an array with 100,000 elements this is consistently 2 seconds faster that my own assembler implementations(s), despite it being pretty similar and seemingly not using any clever or esoteric instructions. I suspect it has a lot to do with the fact that the function was inlined and probably related to the two lines used for byte-alignment. I’m not going to worry too much about it, though. It’s actually quite pleasing to see a compiler putting the effort into squeezing performance out of even one of Computer Science’s shittier algorithms. No job is too small!