当使用mallocs和frees实现一些矩阵操作时,我遇到了一些问题。
Var声明:
double **a, **b, *c; //in
double **d; //out
A必须是NxK_MAX矩阵,B K_MAXxN,c是K_MAX长度的向量,d是NxN矩阵
Malloc:
N = atoi (argv[1]);
a = (double **) malloc (N*sizeof (double *));
b = (double **) malloc (K_MAX*sizeof (double *));
d = (double **) malloc (N*sizeof (double *));
c = (double *) malloc (K_MAX * sizeof (double));
ind = (int *) malloc (N * sizeof (int));
for (int i=0; i<N; i++){
a[i] = (double *) malloc (K_MAX*sizeof (double));
d[i] = (double *) malloc (N*sizeof (double));
}
for (int i = 0; i < K_MAX; i++){
b[i] = (double *) malloc (N * sizeof (double));
}
自由:
for (int x=0; x<N; x++){
free (a[x]);
free (d[x]);
}
for (int x= 0; x<K_MAX; x++){
free (b[x]);
}
free (a);
free (b);
free (d);
free (ind);
而且,我也不明白为什么我总是得到正确的结果。也许,这是一件很愚蠢的事,但我看不出来。
完整代码:
/*
MEJORAS IMPLEMENTADAS:
-Blocking -> 5
-Loop unrolling -> 2
-Reordenación de procedimientos
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pmmintrin.h>
#define K_MAX 8
#define CLS 64
#define block_size CLS / sizeof (double)
double _random (double min, double max);
void start_counter();
double get_counter();
double mhz();
/* Initialize the cycle counter */
static unsigned cyc_hi = 0;
static unsigned cyc_lo = 0;
/* Set *hi and *lo to the high and low order bits of the cycle counter.
Implementation requires assembly code to use the rdtsc instruction. */
void access_counter(unsigned *hi, unsigned *lo)
{
asm("rdtsc; movl %%edx,%0; movl %%eax,%1" /* Read cycle counter */
: "=r" (*hi), "=r" (*lo) /* and move results to */
: /* No input */ /* the two outputs */
: "%edx", "%eax");
}
/* Record the current value of the cycle counter. */
void start_counter()
{
access_counter(&cyc_hi, &cyc_lo);
}
/* Return the number of cycles since the last call to start_counter. */
double get_counter()
{
unsigned ncyc_hi, ncyc_lo;
unsigned hi, lo, borrow;
double result;
/* Get cycle counter */
access_counter(&ncyc_hi, &ncyc_lo);
/* Do double precision subtraction */
lo = ncyc_lo - cyc_lo;
borrow = lo > ncyc_lo;
hi = ncyc_hi - cyc_hi - borrow;
result = (double) hi * (1 << 30) * 4 + lo;
if (result < 0) {
fprintf(stderr, "Error: counter returns neg value: %.0f\n", result);
}
return result;
}
double mhz(int verbose, int sleeptime)
{
double rate;
start_counter();
sleep(sleeptime);
rate = get_counter() / (1e6*sleeptime);
if (verbose)
printf("\n Processor clock rate = %.1f MHz\n", rate);
return rate;
}
int main(int argc, char *argv[]){
double ck;
int N, *ind;
double **a, **b, *c; //Valores de entrada
double **d, f; //Variables de salida
FILE *fp;
if (argc < 2){
printf ("Faltan argumentos\n");
exit (1);
}
if (!(fp = fopen ("resultados.txt", "a"))){
printf ("No se pudo abrir archivo");
exit(1);
}
srand (1);
N = atoi (argv[1]);
a = (double **) malloc (N*sizeof (double *));
b = (double **) malloc (K_MAX*sizeof (double *));
d = (double **) malloc (N*sizeof (double *));
c = (double *) malloc (K_MAX * sizeof (double));
ind = (int *) malloc (N * sizeof (int));
for (int i=0; i<N; i++){
a[i] = (double *) malloc (K_MAX*sizeof (double));
d[i] = (double *) malloc (N*sizeof (double));
}
for (int i = 0; i < K_MAX; i++){
b[i] = (double *) malloc (N * sizeof (double));
}
for (int i=0; i<N; i++){
for (int j=0; j<K_MAX; j++){
a[i][j] = _random (-2.0, 2.0);
}
}
for (int i=0; i<K_MAX; i++){
for (int j=0; j<N; j++){
b[i][j] = _random (-2.0, 2.0);
}
c [i] = _random (-2.0, 2.0);
}
start_counter();
/* Poñer aquí o código a medir */
/* for (int i=0; i<N; i+=block_size){ //Blocking
for (int j=0; j<N; j+=block_size){
for (int ii = i; ii < i+block_size; ii++){
for (int jj = j; jj < j+block_size; jj++){
d[ii][jj] = 0.0;
for (int k = 0; k < K_MAX; k++){
d[ii][jj] += 2 * a[ii][k] * ( b[k][jj]- c[k]);
}
}
}
}
} */
int i = 0;
int j = 0;
for ( ; i<N; i+=block_size){ //Blocking + Loop unrolling
for ( ; j<N; j+=block_size){
for (int ii = i; ii < i+block_size; ii+=2){ //El numero de operaciones en cada desenrollo debe ser divisor del block_size
for (int jj = j; jj < j+block_size; jj+=2){
d[ii][jj] = 0.0;
d[ii+1][jj] = 0.0;
d[ii][jj+1] = 0.0;
d[ii+1][jj+1] = 0.0;
//Unrolling
d[ii][jj] += 2 * a[ii][0] * ( b[0][jj]- c[0]);
d[ii+1][jj] += 2 * a[ii+1][0] * ( b[0][jj]- c[0]);
d[ii][jj+1] += 2 * a[ii][0] * ( b[0][jj+1]- c[0]);
d[ii+1][jj+1] += 2 * a[ii+1][0] * ( b[0][jj+1]- c[0]);
d[ii][jj] += 2 * a[ii][1] * ( b[1][jj]- c[1]);
d[ii+1][jj] += 2 * a[ii+1][1] * ( b[1][jj]- c[1]);
d[ii][jj+1] += 2 * a[ii][1] * ( b[1][jj+1]- c[1]);
d[ii+1][jj+1] += 2 * a[ii+1][1] * ( b[1][jj+1]- c[1]);
d[ii][jj] += 2 * a[ii][2] * ( b[2][jj]- c[2]);
d[ii+1][jj] += 2 * a[ii+1][2] * ( b[2][jj]- c[2]);
d[ii][jj+1] += 2 * a[ii][2] * ( b[2][jj+1]- c[2]);
d[ii+1][jj+1] += 2 * a[ii+1][2] * ( b[2][jj+1]- c[2]);
d[ii][jj] += 2 * a[ii][3] * ( b[3][jj]- c[3]);
d[ii+1][jj] += 2 * a[ii+1][3] * ( b[3][jj]- c[3]);
d[ii][jj+1] += 2 * a[ii][3] * ( b[3][jj+1]- c[3]);
d[ii+1][jj+1] += 2 * a[ii+1][3] * ( b[3][jj+1]- c[3]);
d[ii][jj] += 2 * a[ii][4] * ( b[4][jj]- c[4]);
d[ii+1][jj] += 2 * a[ii+1][4] * ( b[4][jj]- c[4]);
d[ii][jj+1] += 2 * a[ii][4] * ( b[4][jj+1]- c[4]);
d[ii+1][jj+1] += 2 * a[ii+1][4] * ( b[4][jj+1]- c[4]);
d[ii][jj] += 2 * a[ii][5] * ( b[5][jj]- c[5]);
d[ii+1][jj] += 2 * a[ii+1][5] * ( b[5][jj]- c[5]);
d[ii][jj+1] += 2 * a[ii][5] * ( b[5][jj+1]- c[5]);
d[ii+1][jj+1] += 2 * a[ii+1][5] * ( b[5][jj+1]- c[5]);
d[ii][jj] += 2 * a[ii][6] * ( b[6][jj]- c[6]);
d[ii+1][jj] += 2 * a[ii+1][6] * ( b[6][jj]- c[6]);
d[ii][jj+1] += 2 * a[ii][6] * ( b[6][jj+1]- c[6]);
d[ii+1][jj+1] += 2 * a[ii+1][6] * ( b[6][jj+1]- c[6]);
d[ii][jj] += 2 * a[ii][7] * ( b[7][jj]- c[7]);
d[ii+1][jj] += 2 * a[ii+1][7] * ( b[7][jj]- c[7]);
d[ii][jj+1] += 2 * a[ii][7] * ( b[7][jj+1]- c[7]);
d[ii+1][jj+1] += 2 * a[ii+1][7] * ( b[7][jj+1]- c[7]);
}
}
}
}
for (; i<N; i++){
for (; j<N; j++){
d[i][j] = 0.0;
d[i][j] += 2 * a[i][0] * ( b[0][j]- c[0]);
d[i][j] += 2 * a[i][1] * ( b[1][j]- c[1]);
d[i][j] += 2 * a[i][2] * ( b[2][j]- c[2]);
d[i][j] += 2 * a[i][3] * ( b[3][j]- c[3]);
d[i][j] += 2 * a[i][4] * ( b[4][j]- c[4]);
d[i][j] += 2 * a[i][5] * ( b[5][j]- c[5]);
d[i][j] += 2 * a[i][6] * ( b[6][j]- c[6]);
d[i][j] += 2 * a[i][7] * ( b[7][j]- c[7]);
}
}
/* for (int i=0; i<N; i+=block_size){ //Blocking + Loop unrolling
for (int j=0; j<N; j+=block_size){
for (int ii = i; ii < i+block_size; ii+=5){ //El numero de operaciones en cada desenrollo debe ser divisor del block_size
for (int jj = j; jj < j+block_size; jj+=5){
d[ii][jj] = 0.0;
d[ii+1][jj] = 0.0;
d[ii+2][jj] = 0.0;
d[ii+3][jj] = 0.0;
d[ii+4][jj] = 0.0;
d[ii][jj+1] = 0.0;
d[ii][jj+2] = 0.0;
d[ii][jj+3] = 0.0;
d[ii][jj+4] = 0.0;
d[ii+1][jj+1] = 0.0;
d[ii+1][jj+2] = 0.0;
d[ii+1][jj+3] = 0.0;
d[ii+1][jj+4] = 0.0;
d[ii+2][jj+1] = 0.0;
d[ii+2][jj+2] = 0.0;
d[ii+2][jj+3] = 0.0;
d[ii+2][jj+4] = 0.0;
d[ii+3][jj+1] = 0.0;
d[ii+3][jj+2] = 0.0;
d[ii+3][jj+3] = 0.0;
d[ii+3][jj+4] = 0.0;
d[ii+4][jj+1] = 0.0;
d[ii+4][jj+2] = 0.0;
d[ii+4][jj+3] = 0.0;
d[ii+4][jj+4] = 0.0;
for (int k = 0; k < K_MAX; k++){
d[ii][jj] += 2 * a[ii][k] * ( b[k][jj]- c[k]);
d[ii+1][jj] += 2 * a[ii+1][k] * ( b[k][jj]- c[k]);
d[ii+2][jj] += 2 * a[ii+2][k] * ( b[k][jj]- c[k]);
d[ii][jj+1] += 2 * a[ii][k] * ( b[k][jj+1]- c[k]);
d[ii+1][jj+1] += 2 * a[ii+1][k] * ( b[k][jj+1]- c[k]);
d[ii+2][jj+1] += 2 * a[ii+2][k] * ( b[k][jj+1]- c[k]);
d[ii][jj+2] += 2 * a[ii][k] * ( b[k][jj+2]- c[k]);
d[ii+1][jj+2] += 2 * a[ii+1][k] * ( b[k][jj+2]- c[k]);
d[ii+2][jj+2] += 2 * a[ii+2][k] * ( b[k][jj+2]- c[k]);
d[ii][jj+3] += 2 * a[ii][k] * ( b[k][jj+3]- c[k]);
d[ii+1][jj+3] += 2 * a[ii+1][k] * ( b[k][jj+3]- c[k]);
d[ii+2][jj+3] += 2 * a[ii+2][k] * ( b[k][jj+3]- c[k]);
d[ii+3][jj] += 2 * a[ii+3][k] * ( b[k][jj]- c[k]);
d[ii+3][jj+1] += 2 * a[ii+3][k] * ( b[k][jj+1]- c[k]);
d[ii+3][jj+2] += 2 * a[ii+3][k] * ( b[k][jj+2]- c[k]);
d[ii+3][jj+3] += 2 * a[ii+3][k] * ( b[k][jj+3]- c[k]);
d[ii+4][jj] += 2 * a[ii+4][k] * ( b[k][jj]- c[k]);
d[ii+4][jj+1] += 2 * a[ii+4][k] * ( b[k][jj+1]- c[k]);
d[ii+4][jj+2] += 2 * a[ii+4][k] * ( b[k][jj+2]- c[k]);
d[ii+4][jj+3] += 2 * a[ii+4][k] * ( b[k][jj+3]- c[k]);
d[ii][jj+4] += 2 * a[ii][k] * ( b[k][jj+4]- c[k]);
d[ii+1][jj+4] += 2 * a[ii+1][k] * ( b[k][jj+4]- c[k]);
d[ii+2][jj+4] += 2 * a[ii+2][k] * ( b[k][jj+4]- c[k]);
d[ii+3][jj+4] += 2 * a[ii+3][k] * ( b[k][jj+4]- c[k]);
d[ii+4][jj+4] += 2 * a[ii+4][k] * ( b[k][jj+4]- c[k]);
}
}
}
}
} */
/* for (int i=0; i<N; i+=2){
for (int j=0; j<N; j+=2){
d[i][j] = 0.0;
for (int k = 0; k < K_MAX; k++){
d[i][j] += 2 * a[i][k] * ( b[k][j]- c[k]);
d[i+1][j] += 2 * a[i+1][k] * ( b[k][j]- c[k]);
d[i][j+1] += 2 * a[i][k] * ( b[k][j+1]- c[k]);
d[i+1][j+1] += 2 * a[i+1][k] * ( b[k][j+1]- c[k]);
}
}
} */
/* for (int i=0; i<N; i+=5){
for (int j=0; j<N; j+=5){
d[i][j] = 0.0;
for (int k = 0; k < K_MAX; k++){
d[i][j] += 2 * a[i][k] * ( b[k][j]- c[k]);
d[i+1][j] += 2 * a[i+1][k] * ( b[k][j]- c[k]);
d[i][j+1] += 2 * a[i][k] * ( b[k][j+1]- c[k]);
d[i+1][j+1] += 2 * a[i+1][k] * ( b[k][j+1]- c[k]);
d[i][j+2] += 2 * a[i][k] * ( b[k][j+2]- c[k]);
d[i+1][j+2] += 2 * a[i+1][k] * ( b[k][j+2]- c[k]);
d[i][j+3] += 2 * a[i][k] * ( b[k][j+3]- c[k]);
d[i+1][j+3] += 2 * a[i+1][k] * ( b[k][j+3]- c[k]);
d[i+2][j] += 2 * a[i+2][k] * ( b[k][j]- c[k]);
d[i+2][j+1] += 2 * a[i+2][k] * ( b[k][j+1]- c[k]);
d[i+2][j+2] += 2 * a[i+2][k] * ( b[k][j+2]- c[k]);
d[i+2][j+3] += 2 * a[i+2][k] * ( b[k][j+3]- c[k]);
d[i+3][j] += 2 * a[i+3][k] * ( b[k][j]- c[k]);
d[i+3][j+1] += 2 * a[i+3][k] * ( b[k][j+1]- c[k]);
d[i+3][j+2] += 2 * a[i+3][k] * ( b[k][j+2]- c[k]);
d[i+3][j+3] += 2 * a[i+3][k] * ( b[k][j+3]- c[k]);
d[i+4][j] += 2 * a[i+4][k] * ( b[k][j]- c[k]);
d[i+4][j+1] += 2 * a[i+4][k] * ( b[k][j+1]- c[k]);
d[i+4][j+2] += 2 * a[i+4][k] * ( b[k][j+2]- c[k]);
d[i+4][j+3] += 2 * a[i+4][k] * ( b[k][j+3]- c[k]);
d[i][j+4] += 2 * a[i][k] * ( b[k][j+4]- c[k]);
d[i+1][j+4] += 2 * a[i+1][k] * ( b[k][j+4]- c[k]);
d[i+2][j+4] += 2 * a[i+2][k] * ( b[k][j+4]- c[k]);
d[i+3][j+4] += 2 * a[i+3][k] * ( b[k][j+4]- c[k]);
d[i+4][j+4] += 2 * a[i+4][k] * ( b[k][j+4]- c[k]);
}
}
} */
f = 0.0;
for (int i=0; i<N; i++){
f+= d[ind[i]][ind[i]]/2;
}
/*Fin codigo a medir*/
ck=get_counter();
printf ("f=%lf\n", f);
fprintf (fp, "%lf, ", ck);
fclose (fp);
printf("\n Clocks=%1.10lf \n",ck);
for (int x=0; x<N; x++){
free (a[x]);
free (d[x]);
}
for (int x= 0; x<K_MAX; x++){
free (b[x]);
}
free (a);
free (b);
free (d);
free (c);
free (ind);
/* Esta rutina imprime a frecuencia de reloxo estimada coas rutinas start_counter/get_counter */
mhz(1,1);
return 0;
}
double _random (double min, double max){ //Funcion que genera un double aleatorio cuyo valor absoluto esta entre min y max
double r = min + ((double)rand()/((double)RAND_MAX /(max - min)));
while (abs(r) < 1 || abs (r) >= 2)
r = min + ((double)rand()/((double)RAND_MAX /(max - min)));
return r;
}
发布于 2022-05-01 17:55:34
valgrind显示您的代码正在各地执行。
==1520==
==1520== Invalid write of size 8
==1520== at 0x1097EB: main (sh.cpp:158)
==1520== Address 0x4a4a5c0 is 0 bytes after a block of size 80 alloc'd
==1520== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==1520== by 0x1095E1: main (sh.cpp:111)
==1520==
==1520== Invalid write of size 8
==1520== at 0x10981B: main (sh.cpp:159)
==1520== Address 0x4a4a6d0 is 0 bytes after a block of size 80 alloc'd
==1520== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==1520== by 0x1095E1: main (sh.cpp:111)
==1520==
==1520== Invalid write of size 8
==1520== at 0x10984B: main (sh.cpp:160)
==1520== Address 0x4a4a5c8 is 8 bytes after a block of size 80 alloc'd
==1520== at 0x483B7F3: malloc (in /usr/lib/x86_64-linux-gnu/valgrind/vgpreload_memcheck-amd64-linux.so)
==1520== by 0x1095E1: main (sh.cpp:111)
很多人中的前几个,我建议你自己去办。
https://stackoverflow.com/questions/72078742
复制相似问题