Hi!
"Alexey" == Alexey Botchkov <holyfoot@askmonty.org> writes:
Alexey> Hi, Monty. Alexey> I've looked at that part of the code, and tried to make the improved Alexey> versions of uint6korr and mi_uint6korr. Alexey> The small program attached that benchmarks these. Alexey> Depending on enabled macros it loops uint6korr, hf_uint6korr, Alexey> mi_uint6korr and hf_mi_uint6korr respectively. It performs 2 loops on Alexey> each functions first runs it once, the second - twice, so we can Alexey> calculate how much time was spent on the operation itself. Alexey> The results i got so far are: Alexey> elapsed 103 seconds on korr6-1 Alexey> elapsed 190 seconds on korr6-2 Alexey> elapsed 50 seconds on hf_korr6-1 Alexey> elapsed 79 seconds on hf_korr6-2 Alexey> elapsed 106 seconds on mi6-1 Alexey> elapsed 195 seconds on mi6-2 Alexey> elapsed 56 seconds on hf_mi6B-1 Alexey> elapsed 88 seconds on hf_mi6-2 Alexey> So the Alexey> hf_uint6korr is 3 times faster than uint6korr. Alexey> hf_mi_uint6korr is 2.8 times faster than mi_uint6korr. Alexey> You're welcome to check the code out. Thanks. What is important is to get fast versions of mi_uint3korr (Used a lot in ma_dynrec.c and gis) mi_uint4korr mi_uint5korr mi_uint6korr mi_uint7korr mi_uint8korr (Used for some variables) uint5korr uint6korr (Used a lot in Aria) uint8korr (I am including the full test so that anyone can comment upon this) ---------------------------------------------------------------------- #include <stdio.h> #include <time.h> #include <malloc.h> #define TEST_KORR6 #define TEST_HF_KORR6 #define TEST_MI6 #define TEST_HF_MI6 #define uint6korr(A) ((ulonglong)(((uint32) ((uchar) (A)[0])) + \ (((uint32) ((uchar) (A)[1])) << 8) + \ (((uint32) ((uchar) (A)[2])) << 16) + \ (((uint32) ((uchar) (A)[3])) << 24)) + \ (((ulonglong) ((uchar) (A)[4])) << 32) + \ (((ulonglong) ((uchar) (A)[5])) << 40)) #define mi_uint6korr(A) ((ulonglong)(((uint32) (((const uchar*) (A))[5])) +\ (((uint32) (((const uchar*) (A))[4])) << 8) +\ (((uint32) (((const uchar*) (A))[3])) << 16) +\ (((uint32) (((const uchar*) (A))[2])) << 24)) +\ (((ulonglong) (((uint32) (((const uchar*) (A))[1])) +\ (((uint32) (((const uchar*) (A))[0]) << 8)))) <<\ 32)) #define hf_uint6korr(A) (((ulonglong) ((uint32 *) (A))[0]) + (((ulonglong) ((uint16 *) (A))[2]) << 32)) #define hf_mi_uint6korr(src, dest) \ __asm__ ( \ "bswapq %1;" \ "mov %1, %0;" \ :"=r"(dest) \ :"r"(hf_uint6korr(src)<<16) \ : \ ) typedef unsigned long long int ulonglong; typedef unsigned int uint32; typedef unsigned char uchar; typedef unsigned short uint16; time_t t0, t1; ulonglong i; #define GM 10000000000LL #define BAS 2000000 int main() { ulonglong *pb, *pb2; char *art, *art2; ulonglong ci; art= malloc(6*BAS); pb= malloc(sizeof(ulonglong)*BAS); for (i=0; i<6*BAS; i++) art[i]= (char)i; art2= malloc(6*BAS); pb2= malloc(sizeof(ulonglong)*BAS); for (i=0; i<6*BAS; i++) art2[i]= (char)i; #ifdef TEST_KORR6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= uint6korr(art+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on korr6-1\n", t1 - t0); ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= uint6korr(art+ci*6); pb2[ci]= uint6korr(art2+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on korr6-2\n", t1 - t0); #endif /*KORR6*/ #ifdef TEST_HF_KORR6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= hf_uint6korr(art+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on hf_korr6-1\n", t1 - t0); ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= hf_uint6korr(art+ci*6); pb2[ci]= hf_uint6korr(art2+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on hf_korr6-2\n", t1 - t0); #endif /*HF_KORR6*/ #ifdef TEST_MI6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= mi_uint6korr(art+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on mi6-1\n", t1 - t0); ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= mi_uint6korr(art+ci*6); pb2[ci]= mi_uint6korr(art2+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on mi6-2\n", t1 - t0); #endif /*MI6*/ #ifdef TEST_HF_MI6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; hf_mi_uint6korr(art+ci*6, pb[ci]); ci++; } t1= time(0); printf("elapsed %d seconds on hf_mi6B-1\n", t1 - t0); ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; hf_mi_uint6korr(art+ci*6, pb[ci]); hf_mi_uint6korr(art2+ci*6, pb2[ci]); ci++; } t1= time(0); printf("elapsed %d seconds on hf_mi6-2\n", t1 - t0); #endif /*HF_MI6*/ return 0; } Looks ok. Did you try with: #define hf_uint6korr(A) (*((uint64 *) A) & 0xffffffffffffLL) That may be faster, with the extra cost problem of reading 2 bytes extra, so this only safe to use if one knows that there is a few extra bytes in the buffer. Can you do a patch to my_global.h and myisampack and add the new functions there. What to do: - All improvements for now only for x64 architecture - Replace uint6korr(A) and any other function that can be improved without reading extra bytes with a faster version. - Improve with faster variants: mi_uint3korr mi_uint4korr mi_uint5korr mi_uint6korr mi_uint7korr mi_uint8korr uint5korr uint6korr uint8korr If the above suggested version of uint6korr() that reads extra bytes is notable faster, add a function: uint6korr_unsafe() Which of course should map to uint6korr for other architectures. ok? Regards, Monty