I am writing some small programs for a semester project that demonstrates the benefits of inlining assembly code in higher-level languages (specifically C++). I wrote a C++ program that uses brute force to find prime numbers up to an upper bound. With the bound set to 1000 (note that the loop below runs until 1000 primes have been stored, not until the candidate value reaches 1000), straightforward C++ took 41 seconds. With inlined assembly it took 15 seconds.
Here is where I get confused. I wrote the same program in Java and it only took 12 seconds! How is this possible? Because Java is still actively being optimized while C++ is old school?
Here are the algorithms; I can provide the full project code on request. The C++ and Java code are identical, yet the C++ version takes more than three times as long to run (41 seconds vs. 12 seconds) on my Windows 7 x64, Core i7, 16 GB DDR3 system.
Globals: int[] primes, int uBound=1000
C++ (41 seconds):
void primesUpToC() {
int value = 2;
int index = 0;
bool prime;
while(index<uBound) {
prime = true;
for(int j=2; j<=value-1; j++) {
for(int k=2; k<=value-1; k++) {
if(j*k==value) {
prime = false;
break;
}
}
if(!prime)
break;
}
if(prime)
primes[index++] = value;
value++;
}
}
C++ w/ASM (15 seconds):
// Brute-force prime search written as an inline x86 `_asm` block.
// Stores each prime as a 4-byte value through edi, starting from the address
// loaded from `primes`, and stops once `index` is no longer below uBound.
// A candidate is rejected when i * k equals it for some i, k that each run
// from 2 up to (but not including) candidate - 1.
//
// Register roles inside the asm block:
//   edi     - write cursor into the output buffer (advanced by 4 per prime)
//   esi     - current candidate; the C++ local `value` is only read once to
//             seed it and is never written back
//   ecx     - uBound at label L1; candidate - 1 (the loop bound) inside L2/L3
//   ebx     - inner-loop factor k
//   eax/edx - written by `mul`; only eax (low 32 bits) is compared
// The outer factor i and the stored-prime count `index` are kept in memory.
// NOTE(review): `mov edi,primes` assumes `primes` is a pointer variable whose
// value is the buffer address; if it is declared as an array this would need
// `lea`/`offset` instead - confirm against the declaration.
void primesUpToASM() {
int i;
int value = 2;
int index = 0;
_asm {
mov edi,primes
mov ecx,uBound
mov esi,value
// L1: per-candidate loop; exit once index >= uBound (signed compare).
L1:
cmp index,ecx
jge W1
// Save uBound; ecx is reused as the factor-loop bound below.
// This push is matched by the pop just before `jmp L1`.
push ecx
mov i,2d
mov ecx,esi
dec ecx
// L2: outer factor loop over i; exit to W2 (prime) once i >= candidate - 1.
L2:
cmp i,ecx
jge W2
// Save the outer bound, then rebuild the same bound (candidate - 1) for k.
push ecx
mov ebx,2d
mov ecx,esi
dec ecx
// L3: inner factor loop over k (ebx); exit to W3 once k >= candidate - 1.
L3:
cmp ebx,ecx
jge W3
// eax = i * k (unsigned multiply; the high half lands in edx and is ignored).
mov eax,i
mul ebx
cmp eax,esi
jne NOT_EQUAL
// Factorisation found: drop the inner-loop save so that the pop at
// NOT_PRIME restores uBound.
pop ecx
jmp NOT_PRIME
NOT_EQUAL:
inc ebx
jmp L3
// W3: inner loop finished without a match; restore outer bound, next i.
W3:
pop ecx
inc i
jmp L2
// W2 / PRIME: no factor pair matched; store the candidate and count it.
W2:
PRIME:
mov [edi],esi
add edi,4d
inc index
// NOT_PRIME: shared tail; advance the candidate and restore uBound into ecx.
NOT_PRIME:
inc esi
pop ecx
jmp L1
// W1: the requested number of primes has been stored.
W1:
}
}
Java (12 seconds):
/**
 * Brute-force prime search. Writes primes into the {@code primes} array, in
 * increasing order starting from 2, until {@code uBound} of them have been
 * stored. A candidate is rejected as soon as some pair of factors, each
 * taken from [2, candidate - 1], multiplies to exactly the candidate.
 */
public void primesUpToJava() {
    int stored = 0;
    for (int candidate = 2; stored < uBound; ++candidate) {
        boolean isPrime = true;
        // The outer factor loop stops as soon as a factorisation has been found.
        for (int a = 2; isPrime && a < candidate; ++a) {
            for (int b = 2; b < candidate; ++b) {
                if (a * b == candidate) {
                    isPrime = false;
                    break;
                }
            }
        }
        if (isPrime) {
            primes[stored] = candidate;
            ++stored;
        }
    }
}
PS I realize there are better ways to compute primes, but I wanted to show the efficiency of the inlined assembly.