Hi all. Only question i have with this is if there's a possibility to make the hf_mi_uint6korr(dest,src) macro a function. So we can write as usual dest= hf_mi_uint6korr(src) Didn't find how that can nicely be done with the assembler code. Best regards. HF 23.01.2014 16:20, Michael Widenius wrote:
Hi!
"Alexey" == Alexey Botchkov <holyfoot@askmonty.org> writes: Alexey> Hi, Monty. Alexey> I've looked at that part of the code, and tried to make the improved Alexey> versions of uint6korr and mi_uint6korr.
Alexey> The small program attached that benchmarks these. Alexey> Depending on enabled macros it loops uint6korr, hf_uint6korr, Alexey> mi_uint6korr and hf_mi_uint6korr respectively. It performs 2 loops on Alexey> each functions first runs it once, the second - twice, so we can Alexey> calculate how much time was spent on the operation itself.
Alexey> The results i got so far are: Alexey> elapsed 103 seconds on korr6-1 Alexey> elapsed 190 seconds on korr6-2 Alexey> elapsed 50 seconds on hf_korr6-1 Alexey> elapsed 79 seconds on hf_korr6-2 Alexey> elapsed 106 seconds on mi6-1 Alexey> elapsed 195 seconds on mi6-2 Alexey> elapsed 56 seconds on hf_mi6B-1 Alexey> elapsed 88 seconds on hf_mi6-2
Alexey> So the Alexey> hf_uint6korr is 3 times faster than uint6korr. Alexey> hf_mi_uint6korr is 2.8 times faster than mi_uint6korr.
Alexey> You're welcome to check the code out.
Thanks.
What is important is to get fast versions of
mi_uint3korr (Used a lot in ma_dynrec.c and gis) mi_uint4korr mi_uint5korr mi_uint6korr mi_uint7korr mi_uint8korr (Used for some variables)
uint5korr uint6korr (Used a lot in Aria) uint8korr
(I am including the full test so that anyone can comment upon this)
---------------------------------------------------------------------- #include <stdio.h> #include <time.h> #include <malloc.h>
#define TEST_KORR6 #define TEST_HF_KORR6 #define TEST_MI6 #define TEST_HF_MI6
#define uint6korr(A) ((ulonglong)(((uint32) ((uchar) (A)[0])) + \ (((uint32) ((uchar) (A)[1])) << 8) + \ (((uint32) ((uchar) (A)[2])) << 16) + \ (((uint32) ((uchar) (A)[3])) << 24)) + \ (((ulonglong) ((uchar) (A)[4])) << 32) + \ (((ulonglong) ((uchar) (A)[5])) << 40)) #define mi_uint6korr(A) ((ulonglong)(((uint32) (((const uchar*) (A))[5])) +\ (((uint32) (((const uchar*) (A))[4])) << 8) +\ (((uint32) (((const uchar*) (A))[3])) << 16) +\ (((uint32) (((const uchar*) (A))[2])) << 24)) +\ (((ulonglong) (((uint32) (((const uchar*) (A))[1])) +\ (((uint32) (((const uchar*) (A))[0]) << 8)))) <<\ 32))
#define hf_uint6korr(A) (((ulonglong) ((uint32 *) (A))[0]) + (((ulonglong) ((uint16 *) (A))[2]) << 32)) #define hf_mi_uint6korr(src, dest) \ __asm__ ( \ "bswapq %1;" \ "mov %1, %0;" \ :"=r"(dest) \ :"r"(hf_uint6korr(src)<<16) \ : \ )
typedef unsigned long long int ulonglong; typedef unsigned int uint32; typedef unsigned char uchar; typedef unsigned short uint16;
time_t t0, t1; ulonglong i;
#define GM 10000000000LL #define BAS 2000000
int main() { ulonglong *pb, *pb2; char *art, *art2; ulonglong ci;
art= malloc(6*BAS); pb= malloc(sizeof(ulonglong)*BAS); for (i=0; i<6*BAS; i++) art[i]= (char)i;
art2= malloc(6*BAS); pb2= malloc(sizeof(ulonglong)*BAS); for (i=0; i<6*BAS; i++) art2[i]= (char)i;
#ifdef TEST_KORR6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0;
pb[ci]= uint6korr(art+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on korr6-1\n", t1 - t0);
ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= uint6korr(art+ci*6); pb2[ci]= uint6korr(art2+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on korr6-2\n", t1 - t0); #endif /*KORR6*/
#ifdef TEST_HF_KORR6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0;
pb[ci]= hf_uint6korr(art+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on hf_korr6-1\n", t1 - t0);
ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= hf_uint6korr(art+ci*6); pb2[ci]= hf_uint6korr(art2+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on hf_korr6-2\n", t1 - t0); #endif /*HF_KORR6*/
#ifdef TEST_MI6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0;
pb[ci]= mi_uint6korr(art+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on mi6-1\n", t1 - t0);
ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; pb[ci]= mi_uint6korr(art+ci*6); pb2[ci]= mi_uint6korr(art2+ci*6); ci++; } t1= time(0); printf("elapsed %d seconds on mi6-2\n", t1 - t0); #endif /*MI6*/
#ifdef TEST_HF_MI6 ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; hf_mi_uint6korr(art+ci*6, pb[ci]); ci++; } t1= time(0); printf("elapsed %d seconds on hf_mi6B-1\n", t1 - t0);
ci= 0; t0= time(0); for (i=0; i<GM; i++) { if (ci >= BAS) ci= 0; hf_mi_uint6korr(art+ci*6, pb[ci]); hf_mi_uint6korr(art2+ci*6, pb2[ci]); ci++; } t1= time(0); printf("elapsed %d seconds on hf_mi6-2\n", t1 - t0); #endif /*HF_MI6*/
return 0; }
Looks ok.
Did you try with:
#define hf_uint6korr(A) (*((uint64 *) A) & 0xffffffffffffLL)
That may be faster, with the extra cost problem of reading 2 bytes extra, so this only safe to use if one knows that there is a few extra bytes in the buffer.
Can you do a patch to my_global.h and myisampack and add the new functions there.
What to do:
- All improvements for now only for x64 architecture
- Replace uint6korr(A) and any other function that can be improved without reading extra bytes with a faster version.
- Improve with faster variants: mi_uint3korr mi_uint4korr mi_uint5korr mi_uint6korr mi_uint7korr mi_uint8korr uint5korr uint6korr uint8korr
If the above suggested version of uint6korr() that reads extra bytes is notable faster, add a function:
uint6korr_unsafe()
Which of course should map to uint6korr for other architectures.
ok?
Regards, Monty