Is it possible to speed up an array operation in C on a computer with a multi-core AMD CPU? I list the code below.
UDP packets arrive every 512ns, and the computation in the array below (where the data is accumulated in else{} loop) takes more than 512ns. This loop keep up when the packets arrive at one microsecond interval. My question: can one multi-thread the array accumulation and hence speed up the computation? Currently the program 'top' shows that the code uses 100% CPU when packets arrive at 512ns interval. Thanks in advance for any inputs/suggestions.
#define NC 64
#define NP 4
void* get_next_buf(mystr_t* str, uint64_t* size)
{
char buf0[8500];
long long pktnum, *ptr;
int i,j,J, offset ;
ssize_t recsize;
socklen_t fromlen;
int pktdiff;
recsize = recvfrom(str->sock, (void *)buf0, 8224, 0, (struct sockaddr *)&str->sa, &fromlen);
if (recsize < 0) {
fprintf(stderr,"error reading udp packet\n");
return 0;
}
/* clear the array for accumulate*/
memset(str->data, 0, 2*NCHAN*NPOL*sizeof(short));
/* swap bytes to extract packet counter*/
ptr = (long long *)buf0;
pktnum=BSWAP_64( *ptr ) & 0x00ffffffffffffff;
// got one packet of data. If a pakcet was missed, return array with zeroes
pktdiff = pktnum - str->prev_pkt_cnt;
if ( pktdiff != 1){
str->bad_pkt_cnt++;
fprintf (stderr,"%d+",pktdiff);
str->prev_pkt_cnt = pktnum;
*size = 2*sizeof(short)*NC*NP;
return (void*) str->data;
}
//packet arrived in correct order, accumulate and return the array
else {
J = 8192/(NC*NP);
for (i=0;i<J;i++){
for (j=0;j<NC;j=j++){
offset = i*NC*NP;
((short *)str->data)[j] += (short)(buf0[8+j+offset]);
((short *)str->data)[j+64] += (short)(buf0[8+64+j+offset]);
((short *)str->data)[j+128] += (short)(buf0[8+128+j+offset]);
((short *)str->data)[j+192] += (short)(buf0[8+192+j+offset]);
}
}
*size = sizeof(short)*NC*NP;
str->prev_pkt_cnt = pktnum;
/*return the acquired data buffer */
return (void*) str->data;
}
}
offsetcalculation can go on the outer level out of the nested loops (but this should have been noticed by the compiler already if you used suitable optimisation settings). The same with NC * NP (can be precalculated)j=j++, it should just be++j. Probably won't help, but won't hurt either. Just do it.NCHAN, NPOLandNC, NP?