#include #include #include #include #include #include #include /* matrix multiply tests -- C language, version 1.0, May 1993 * * compile with -DN= * * I usually run a script file * time.script 500 >500.times * where the script file contains * cc -O -DN=$1 mm.c * a.out -n (I suggest at least two runs per method to * a.out -n alert you to variations. Five or ten runs * a.out -t each, giving avg. and std dev. of times is * a.out -t best.) * ... * * Contact Mark Smotherman (mark@cs.clemson.edu) for questions, comments, * and to report results showing wide variations. E.g., a wide variation * appeared on an IBM RS/6000 Model 320 with "cc -O -DN=500 mm.c" (xlc * compiler): * 500x500 mm - normal algorithm utime 230.81 secs * 500x500 mm - normal algorithm utime 230.72 secs * 500x500 mm - temporary variable in loop utime 231.00 secs * 500x500 mm - temporary variable in loop utime 230.79 secs * 500x500 mm - unrolled inner loop, factor of 8 utime 232.09 secs * 500x500 mm - unrolled inner loop, factor of 8 utime 231.84 secs * 500x500 mm - pointers used to access matrices utime 230.74 secs * 500x500 mm - pointers used to access matrices utime 230.45 secs * 500x500 mm - blocking, factor of 32 utime 60.40 secs * 500x500 mm - blocking, factor of 32 utime 60.57 secs * 500x500 mm - interchanged inner loops utime 27.36 secs * 500x500 mm - interchanged inner loops utime 27.40 secs * 500x500 mm - 20x20 subarray (from D. Warner) utime 9.49 secs * 500x500 mm - 20x20 subarray (from D. Warner) utime 9.50 secs * 500x500 mm - 20x20 subarray (from T. Maeno) utime 9.10 secs * 500x500 mm - 20x20 subarray (from T. Maeno) utime 9.05 secs * * The algorithms can also be sensitive to TLB thrashing. On a 600x600 * test an IBM RS/6000 Model 30 showed variations depending on relative * location of the matrices. (The model 30 has 64 TLB entries organized * as 2-way set associative.) * * 600x600 mm - 20x20 subarray (from T. Maeno) utime 19.12 secs * 600x600 mm - 20x20 subarray (from T. Maeno) utime 19.23 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 18.87 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 18.64 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.70 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 17.76 secs * * Changing the declaration to include 10000 dummy entries between the * b and c matrices (suggested by T. Maeno), i.e., * * double a[N][N],b[N][N],dummy[10000],c[N][N],d[N][N],bt[N][N]; * * 600x600 mm - 20x20 subarray (from T. Maeno) utime 16.41 secs * 600x600 mm - 20x20 subarray (from T. Maeno) utime 16.40 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 16.68 secs * 600x600 mm - 20x20 subarray (from D. Warner) utime 16.67 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.97 secs * 600x600 mm - 20x20 btranspose (Warner/Smotherman) utime 16.98 secs * * I hope to add other algorithms (e.g., Strassen-Winograd) in the near * future. */ // allocate a double matrix with subscript range m[0..nrow-1][0..ncol-1] double **matrix(int nrow, int ncol) { int i; double **m; // allocate pointers to rows m=(double **) malloc((size_t)(nrow*sizeof(double*))); if (!m) exit(22); // allocate rows and set pointers to them m[0]=(double *) malloc((size_t)((nrow*ncol)*sizeof(double))); if (!m[0]) exit(33); for(i=1;i1){ switch(argv[i][0]){ case '-': alg_choice = argv[i][1]; break; case '+': sscanf(argv[i],"%d",&N); break; case '%': sscanf(argv[i],"%d",&TIMES); break; default: sscanf(argv[i],"%d",&parm); } argc--; i++; } switch(alg_choice){ case 'n': printf("%3dx%3d mm - normal algorithm ",N,N); break; case 'v': printf("%3dx%3d mm - temporary variable in loop ",N,N); break; case 'p': printf("%3dx%3d mm - pointers used to access matrices ",N,N); break; case 't': printf("%3dx%3d mm - transposed b matrix ",N,N); break; case 'i': printf("%3dx%3d mm - interchanged inner loops ",N,N); break; case 'u': if((parm!=4)&&(parm!=8)&&(parm!=16)){ printf("%s: unrolling factor of %d not implemented\n",argv[0],parm); printf(" usage: %s -