Here comes ...
' revision 3+, it stands for Multiplication of INtegers in C using strings.
In the ZIP archive both Intel v12.1 and Microsoft v16 Windows executables are given along with source.
Revision 3+ has optimized main loop, as a side effect now a new CPU clock pseudo-measure has emerged:
MokujINs - it stands for number of cycles of main loop of MUL function made per second.
My 'Bonboniera' gave 77 MegaMokujINs i.e. 77+ million iterations per second.
At each iteration/cycle a digit vs digit multiplication is made i.e. the two operands are a single digit long.
In fact this console tool is a tiny-finy benchmarker which produces 315,653 digits long number.
Its performance depends on CPU clock and L2 cache speed mostly.
Interestingly, Microsoft v16 outspeeds Intel v12.1 on both 'DEFAULT' and 'TURBO' tests.
Code:
// MokujIN, a string multiplier, written by Kaze, 2012-Nov-07, revision 3+.
// Free download at: www.sanmayce.com/Downloads/MokujIN.zip
// Revision 3:
// Computing 2^^1048576 took 0,454 seconds with '/TURBO' with Intel v12.1 on T7500 2200MHz.
// Computing 2^^1048576 took 1,856 seconds without '/TURBO' with Intel v12.1 on T7500 2200MHz.
// Computing 2^^1048576 took 0,426 seconds with '/TURBO' with Microsoft v16 on T7500 2200MHz.
// Computing 2^^1048576 took 1,678 seconds without '/TURBO' with Microsoft v16 on T7500 2200MHz.
// On August 23rd, 2008, a UCLA computer in the GIMPS PrimeNet network discovered the 45th (biggest so far (2010-July)) known Mersenne prime,
// 2^43,112,609-1, a mammoth 12,978,189 digit number!
// Now, who can wait for 'MokujIN.exe 2 43112609 /turbo' to complete?!
/*
D:\_KAZE\MokujIN>MokujIN_Microsoft_32-bit_Version_16.exe /help
MokujIN, Multiplication of INtegers, written by Kaze, 2012-Nov-06, revision 3.
Usage: MokujIN [Number Power [/turbo]]
Note1: Power is signed 32bit integer i.e. up to 2^31-1=2,147,483,647.
Note2: Multiplicand or/and Multiplier cannot exceed 12978189 digits.
Note3: TURBO regime is several times (only) faster.
Example1: D:\_KAZE\MokujIN>MokujIN.exe 18446744073709551616 2
340282366920938463463374607431768211456
Example2: D:\_KAZE\MokujIN>MokujIN.exe
Multiplicand: 18446744073709551616
Multiplier : 18446744073709551616
Result : 340282366920938463463374607431768211456
D:\_KAZE\MokujIN>
*/
#include <stdio.h>
#include <time.h>
#define memory_size 12978189 // equals MAX(Multiplicand,Multiplier)
unsigned char* MUL(unsigned char* Result, unsigned char* Multiplicand, unsigned char* Multiplier)
{
/*
'QuickBASIC was/is an excellent environment/compiler, eh.
FUNCTION MUL$ (Multiplicand$, Multiplier$)
'DimSum = CandLength + ErLength
'REDIM Result%(1 TO DimSum)
FOR SF = 1 TO CandLength
FOR QB = 1 TO ErLength
CarryFlag = 0
Cycle = QB - 1 + SF
Tiller = Cand%(SF) * Er%(QB)
Result%(Cycle) = Tiller MOD 10 + Result%(Cycle)
IF Result%(Cycle) >= 10 THEN
Result%(Cycle) = Result%(Cycle) - 10
CarryFlag = 1
END IF
NextNumPos = Cycle + 1
Result%(NextNumPos) = Result%(NextNumPos) + CarryFlag + Tiller \ 10
DO WHILE Result%(NextNumPos) >= 10
Result%(NextNumPos) = Result%(NextNumPos) - 10
NextNumPos = NextNumPos + 1
Result%(NextNumPos) = Result%(NextNumPos) + 1
LOOP
NEXT QB
NEXT SF
END FUNCTION
*/
// 'unsigned long long' makes the computation ugly with 32bit code, so going down to 'unsigned long':
// 8:14:04.78
// 8:25:46.22
// vs
// 1:41:43.84
// 1:52:33.59
// Or almost a minute ugliness!
unsigned long SF, QB, Cycle, NextNumPos;
unsigned long ResLength, CandLength, ErLength;
unsigned char CarryFlag, TillerMostSignificantDigit, TillerLeastSignificantDigit;
// 0..9 * 0..9
unsigned char MSDarray[10][10] = {
0,0,0,0,0,0,0,0,0,0, // 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
0,0,0,0,0,0,0,0,0,0, // 1x0, 1x1, 1x2, 1x3, 1x4, 1x5, 1x6, 1x7, 1x8, 1x9,
0,0,0,0,0,1,1,1,1,1, // 2x0, 2x1, 2x2, 2x3, 2x4, 2x5, 2x6, 2x7, 2x8, 2x9,
0,0,0,0,1,1,1,2,2,2, // 3x0, 3x1, 3x2, 3x3, 3x4, 3x5, 3x6, 3x7, 3x8, 3x9,
0,0,0,1,1,2,2,2,3,3, // 4x0, 4x1, 4x2, 4x3, 4x4, 4x5, 4x6, 4x7, 4x8, 4x9,
0,0,1,1,2,2,3,3,4,4, // 5x0, 5x1, 5x2, 5x3, 5x4, 5x5, 5x6, 5x7, 5x8, 5x9,
0,0,1,1,2,3,3,4,4,5, // 6x0, 6x1, 6x2, 6x3, 6x4, 6x5, 6x6, 6x7, 6x8, 6x9,
0,0,1,2,2,3,4,4,5,6, // 7x0, 7x1, 7x2, 7x3, 7x4, 7x5, 7x6, 7x7, 7x8, 7x9,
0,0,1,2,3,4,4,5,6,7, // 8x0, 8x1, 8x2, 8x3, 8x4, 8x5, 8x6, 8x7, 8x8, 8x9,
0,0,1,2,3,4,5,6,7,8 // 9x0, 9x1, 9x2, 9x3, 9x4, 9x5, 9x6, 9x7, 9x8, 9x9
};
unsigned char LSDarray[10][10] = {
0,0,0,0,0,0,0,0,0,0, // 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9,
0,1,2,3,4,5,6,7,8,9, // 1x0, 1x1, 1x2, 1x3, 1x4, 1x5, 1x6, 1x7, 1x8, 1x9,
0,2,4,6,8,0,2,4,6,8, // 2x0, 2x1, 2x2, 2x3, 2x4, 2x5, 2x6, 2x7, 2x8, 2x9,
0,3,6,9,2,5,8,1,4,7, // 3x0, 3x1, 3x2, 3x3, 3x4, 3x5, 3x6, 3x7, 3x8, 3x9,
0,4,8,2,6,0,4,8,2,6, // 4x0, 4x1, 4x2, 4x3, 4x4, 4x5, 4x6, 4x7, 4x8, 4x9,
0,5,0,5,0,5,0,5,0,5, // 5x0, 5x1, 5x2, 5x3, 5x4, 5x5, 5x6, 5x7, 5x8, 5x9,
0,6,2,8,4,0,6,2,8,4, // 6x0, 6x1, 6x2, 6x3, 6x4, 6x5, 6x6, 6x7, 6x8, 6x9,
0,7,4,1,8,5,2,9,6,3, // 7x0, 7x1, 7x2, 7x3, 7x4, 7x5, 7x6, 7x7, 7x8, 7x9,
0,8,6,4,2,0,8,6,4,2, // 8x0, 8x1, 8x2, 8x3, 8x4, 8x5, 8x6, 8x7, 8x8, 8x9,
0,9,8,7,6,5,4,3,2,1 // 9x0, 9x1, 9x2, 9x3, 9x4, 9x5, 9x6, 9x7, 9x8, 9x9
};
// With above look-up tables (they replace all nasty *MULs, %MODs) we have a boost:
// 1:41:43.84
// 1:52:33.59 or 52*60+33-(41*60+43)= 650s
// vs
// 3:00:38.26
// 3:08:11.38 or 08*60+11-(00*60+38)= 453s
// Or 3+ minutes ugliness!
CandLength =strlen(Multiplicand);
ErLength =strlen(Multiplier);
memset(Result, 0, (CandLength+ErLength)+1);
if ( (ErLength == 1 && Multiplier[0] == 0+'0') || (CandLength == 1 && Multiplicand[0] == 0+'0') ) {
Result[0] = 0+'0';
return Result;
}
// In C the offset starts from 0 whereas in QuickBasic from 1, therefore '<' becomes '<='.
for (SF=1; SF<=CandLength; SF++) {
for (QB=1; QB<=ErLength; QB++) {
CarryFlag = 0;
Cycle = QB - 1 + SF;
// In C the offset starts from 0 whereas in QuickBasic from 1, therefore '[SF]' becomes '[SF-1]'.
// Here the strings are not reversed as in QB arrays, so [CandLength-SF+1] becomes [(CandLength-SF+1)-1].
//Tiller = (Multiplicand[(CandLength-SF+1)-1]-'0') * (Multiplier[(ErLength-QB+1)-1]-'0');
// LOOK-UP TABLE BOOST: Above COMMENTED line becomes next two:
TillerLeastSignificantDigit = LSDarray[ Multiplicand[(CandLength-SF+1)-1]-'0' ][ Multiplier[(ErLength-QB+1)-1]-'0' ];
TillerMostSignificantDigit = MSDarray[ Multiplicand[(CandLength-SF+1)-1]-'0' ][ Multiplier[(ErLength-QB+1)-1]-'0' ];
//Result[Cycle-1] = (Tiller%10) + Result[Cycle-1];
// LOOK-UP TABLE BOOST: Above COMMENTED line becomes next one:
Result[Cycle-1] = TillerLeastSignificantDigit + Result[Cycle-1];
if ( Result[Cycle-1] >= 10 ) {
Result[Cycle-1] = Result[Cycle-1] - 10;
CarryFlag = 1;
}
NextNumPos = Cycle + 1;
//Result[NextNumPos-1] = Result[NextNumPos-1] + CarryFlag + (unsigned char)(Tiller/10);
// LOOK-UP TABLE BOOST: Above COMMENTED line becomes next one:
Result[NextNumPos-1] = Result[NextNumPos-1] + CarryFlag + TillerMostSignificantDigit;
while (Result[NextNumPos-1] >= 10) {
Result[NextNumPos-1] = Result[NextNumPos-1] - 10;
NextNumPos = NextNumPos + 1;
Result[NextNumPos-1] = Result[NextNumPos-1] + 1;
}
}
}
//Here we have the result (REVERSED) in ASCII codes i.e. '0' stands for ASCII 000.
//ResLength is either (CandLength+ErLength) or (CandLength+ErLength-1).
ResLength=(CandLength+ErLength);
if (Result[ResLength-1] == 0x00) ResLength--;
//The last thing to be done: the in-place reversal:
for (SF=1; SF<=ResLength/2; SF++) {
CarryFlag = Result[SF-1];
Result[SF-1] = Result[(ResLength-SF+1)-1]+'0';
Result[(ResLength-SF+1)-1] = CarryFlag+'0';
}
if (ResLength%2 != 0)
Result[(ResLength/2+1)-1] = Result[(ResLength/2+1)-1]+'0';
//Terminate it:
Result[ResLength] = 0;
return Result;
}
int main(int argc, char *argv[])
{
signed int n, p, p31;
unsigned char* pointerflushUNALIGN;
unsigned char* Multiplicand;
unsigned char* Multiplier;
time_t t1, t2;
unsigned long long DPS;
FILE *fp_out;
if (argc != 3 && argc != 4) {
printf("MokujIN, Multiplication of INtegers, written by Kaze, 2012-Nov-07, revision 3+.\n");
printf("Usage: MokujIN [Number Power [/turbo|/stats]]\n");
printf("Note0: With no parameters given, interactive multiplication regime is on.\n");
printf("Note1: Power is signed 32bit integer i.e. up to 2^31-1=2,147,483,647.\n");
printf("Note2: Multiplicand or/and Multiplier cannot exceed %d digits.\n", memory_size);
printf("Note3: '/turbo' regime is several times (only) faster.\n");
printf("Note4: '/stats' is '/turbo' too, also dumps the result to 'MokujIN.txt'.\n");
printf("Note5: When power is big enough (as in Example3) MokujINs (or the prosaic DPS) is an useful CPU clock pseudo-measure.\n");
printf("Example1: D:\\_KAZE\\MokujIN>MokujIN.exe 18446744073709551616 2\n");
printf("340282366920938463463374607431768211456\n");
printf("Example2: D:\\_KAZE\\MokujIN>MokujIN.exe\n");
printf("Multiplicand: 18446744073709551616\n");
printf("Multiplier : 18446744073709551616\n");
printf("Result : 340282366920938463463374607431768211456\n");
printf("Example3: D:\\_KAZE\\MokujIN>MokujIN.exe 2 1048576 /stats\n");
printf("...\n");
printf("Multiplying performance for operands 39457 digits long: 77842742 MokujINs i.e. digits per second.\n");
printf("Multiplying performance for operands 78914 digits long: 77842742 MokujINs i.e. digits per second.\n");
printf("Multiplying performance for operands 157827 digits long: 77358266 MokujINs i.e. digits per second.\n");
printf("Dumping the result to 'MokujIN.txt' ... OK\n");
}
pointerflushUNALIGN = (unsigned char*)malloc( memory_size*2+1 ); //+1 because we need sentinel
if( pointerflushUNALIGN == NULL ) {
printf("MokujIN: Needed memory allocation denied!\n");
return( 1 );
}
Multiplicand = (unsigned char*)malloc( memory_size*2+1 ); //+1 because we need sentinel
if( Multiplicand == NULL ) {
printf("MokujIN: Needed memory allocation denied!\n");
return( 1 );
}
Multiplier = (unsigned char*)malloc( memory_size*2+1 ); //+1 because we need sentinel
if( Multiplier == NULL ) {
printf("MokujIN: Needed memory allocation denied!\n");
return( 1 );
}
if (argc == 3 || argc == 4) {
p = atoi(argv[2]); //_atoi64(argv[2]);
if (p < 2) {
printf("MokujIN: Power should be 2 or greater!\n");
free(pointerflushUNALIGN); free(Multiplicand); free(Multiplier);
return( 2 );
}
memcpy(Multiplicand, argv[1], strlen(argv[1])); Multiplicand[strlen(argv[1])]=0; // Not assuming the allocated pool is ZEROed!
memcpy(Multiplier, argv[1], strlen(argv[1])); Multiplier[strlen(argv[1])]=0;
if (argc == 4) {
p31 = 0;
while (p>>1) {
p31++;
p = p>>1;
}
p = atoi(argv[2]);
p = p - (1<<p31);
while (p31--) {
(void) time(&t1);
MUL(pointerflushUNALIGN, Multiplicand, Multiplier);
(void) time(&t2);
if (t2 <= t1) {t2 = t1; t2++;}
if ( strcmp("/stats\0",argv[3]) == 0 ) {
DPS = (unsigned long long)strlen(Multiplier);
DPS *= DPS;
DPS = DPS/(unsigned long long)(t2-t1);
printf("Multiplying performance for operands %lu digits long: %lu MokujINs i.e. digits per second.\n", strlen(Multiplier), DPS);
}
memcpy(Multiplicand, pointerflushUNALIGN, strlen(pointerflushUNALIGN)); Multiplicand[strlen(pointerflushUNALIGN)]=0;
memcpy(Multiplier, pointerflushUNALIGN, strlen(pointerflushUNALIGN)); Multiplier[strlen(pointerflushUNALIGN)]=0;
}
memcpy(Multiplier, argv[1], strlen(argv[1])); Multiplier[strlen(argv[1])]=0;
while (p--) {
MUL(pointerflushUNALIGN, Multiplicand, Multiplier);
memcpy(Multiplicand, pointerflushUNALIGN, strlen(pointerflushUNALIGN)); Multiplicand[strlen(pointerflushUNALIGN)]=0;
}
} else {
while (p-->1) {
MUL(pointerflushUNALIGN, Multiplicand, Multiplier);
memcpy(Multiplicand, pointerflushUNALIGN, strlen(pointerflushUNALIGN)); Multiplicand[strlen(pointerflushUNALIGN)]=0;
}
}
if ( argc == 4 && strcmp("/stats\0",argv[3]) == 0 ) {
if( ( fp_out = fopen( "MokujIN.txt", "wb+" ) ) == NULL )
{ printf( "MokujIN: Can't create file 'MokujIN.txt'.\n" ); return( 4 ); }
printf("Dumping the result to 'MokujIN.txt' ... ");
fprintf( fp_out, "%s\r\n", Multiplicand );
fclose(fp_out);
printf("OK\n");
} else
printf("%s\n", Multiplicand);
free(pointerflushUNALIGN); free(Multiplicand); free(Multiplier);
return 0;
} else if (argc == 1) {
printf("\nMultiplicand: ");
scanf("%s", Multiplicand);
printf("Multiplier : ");
scanf("%s", Multiplier);
printf("Result : ");
printf("%s\n", MUL(pointerflushUNALIGN, Multiplicand, Multiplier));
free(pointerflushUNALIGN); free(Multiplicand); free(Multiplier);
return 0;
} else {
free(pointerflushUNALIGN); free(Multiplicand); free(Multiplier);
return 3;
}
}
On my 'Bonboniera' laptop it took 54 minutes i.e. 3000+ seconds - should be 3- seconds.
I expect at least a week of crunching.
But our final number (12+ million digits) needs two ten times longer numbers than above ones, so:
Ouch, it is 128 hours or one week.