Hi!
>>>>> "Alexey" == Alexey Botchkov <holyfoot(a)askmonty.org> writes:
Alexey> Hi, Monty.
Alexey> I've looked at that part of the code, and tried to make the improved
Alexey> versions of uint6korr and mi_uint6korr.
Alexey> The attached small program benchmarks these.
Alexey> Depending on enabled macros it loops uint6korr, hf_uint6korr,
Alexey> mi_uint6korr and hf_mi_uint6korr respectively. It performs 2 loops on
Alexey> each function: the first runs it once per iteration, the second twice,
Alexey> so we can calculate how much time was spent on the operation itself.
Alexey> The results I got so far are:
Alexey> elapsed 103 seconds on korr6-1
Alexey> elapsed 190 seconds on korr6-2
Alexey> elapsed 50 seconds on hf_korr6-1
Alexey> elapsed 79 seconds on hf_korr6-2
Alexey> elapsed 106 seconds on mi6-1
Alexey> elapsed 195 seconds on mi6-2
Alexey> elapsed 56 seconds on hf_mi6B-1
Alexey> elapsed 88 seconds on hf_mi6-2
Alexey> So the
Alexey> hf_uint6korr is 3 times faster than uint6korr.
Alexey> hf_mi_uint6korr is 2.8 times faster than mi_uint6korr.
Alexey> You're welcome to check the code out.
Thanks.
What is important is to get fast versions of
mi_uint3korr (Used a lot in ma_dynrec.c and gis)
mi_uint4korr
mi_uint5korr
mi_uint6korr
mi_uint7korr
mi_uint8korr (Used for some variables)
uint5korr
uint6korr (Used a lot in Aria)
uint8korr
(I am including the full test so that anyone can comment upon this)
----------------------------------------------------------------------
#include <malloc.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
/*
  Benchmark selectors: each TEST_* macro enables the matching #ifdef
  section of main(). Remove a line to skip that pair of timing loops.
*/
#define TEST_KORR6
#define TEST_HF_KORR6
#define TEST_MI6
#define TEST_HF_MI6
/*
  Decode the 6-byte unsigned integer stored low byte first at A and yield
  it as a ulonglong. Works byte by byte, so it needs no particular
  alignment. A is expanded six times and must be free of side effects.
*/
#define uint6korr(A) \
((((ulonglong) ((uchar) (A)[5])) << 40) | \
 (((ulonglong) ((uchar) (A)[4])) << 32) | \
 ((ulonglong) ((((uint32) ((uchar) (A)[3])) << 24) | \
               (((uint32) ((uchar) (A)[2])) << 16) | \
               (((uint32) ((uchar) (A)[1])) << 8) | \
               ((uint32) ((uchar) (A)[0])))))
/*
  Decode the 6-byte unsigned integer stored high byte first at A and yield
  it as a ulonglong. Works byte by byte, so it needs no particular
  alignment. A is expanded six times and must be free of side effects.
*/
#define mi_uint6korr(A) \
((((ulonglong) ((((uint32) (((const uchar *) (A))[0])) << 8) | \
                ((uint32) (((const uchar *) (A))[1])))) << 32) | \
 ((ulonglong) ((((uint32) (((const uchar *) (A))[2])) << 24) | \
               (((uint32) (((const uchar *) (A))[3])) << 16) | \
               (((uint32) (((const uchar *) (A))[4])) << 8) | \
               ((uint32) (((const uchar *) (A))[5])))))
/*
  Fast 6-byte low-byte-first load for little-endian hosts (the benchmark
  targets x64 only).

  The six bytes are copied into a zeroed 64-bit value with memcpy() instead
  of being read through casted uint32 and uint16 pointers: the source is a
  char buffer and A is in general not 4-byte aligned, so the casted reads
  broke the C aliasing and alignment rules. Exactly 6 bytes are read.

  Builtin type names are used because the ulonglong typedef is only
  declared further down in this file.
*/
static inline unsigned long long hf_uint6korr_load(const void *ptr)
{
  unsigned long long value= 0;
  memcpy(&value, ptr, 6);
  return value;
}
#define hf_uint6korr(A) hf_uint6korr_load(A)
/*
  Store into dest the 6-byte high-byte-first integer found at src.
  GCC-style inline asm for x86-64 only; dest must be a 64-bit lvalue.

  hf_uint6korr() fetches the six bytes low byte first. Shifting left by 16
  moves them into the top six bytes of the register, and bswapq then
  reverses all eight bytes, leaving the high-byte-first value in the low
  48 bits with zeros above.

  bswapq overwrites its operand, so the input is tied to the output
  register with the "0" constraint; GCC does not allow an asm to modify
  an input-only operand.
*/
#define hf_mi_uint6korr(src, dest) \
__asm__ ( \
"bswapq %0" \
:"=r"(dest) \
:"0"(hf_uint6korr(src)<<16) \
)
/* Short integer aliases used by the korr macros and by main(). */
typedef unsigned char uchar;
typedef unsigned short uint16;
typedef unsigned int uint32;
typedef unsigned long long ulonglong;

/* Wall-clock stamps taken before and after each timed loop. */
time_t t0;
time_t t1;

/* Loop counter shared by every loop in main(). */
ulonglong i;

/* Iterations performed by each timed loop. */
#define GM 10000000000LL

/* Number of 6-byte records in each input buffer and of slots in each
   output array. */
#define BAS 2000000
/*
  Benchmark driver.

  Fills two input buffers of 6*BAS bytes with a repeating byte pattern and,
  for every enabled TEST_* section, times two loops of GM iterations each:
  the first decodes one 6-byte value per iteration, the second decodes two
  (one from each buffer). The cost of a single decode can be estimated from
  the difference between the two timings.

  Timings are taken with time(), so they have one-second resolution.

  Returns 0 on success, 1 if the buffers cannot be allocated.
*/
int main(void)
{
  ulonglong *pb, *pb2;
  char *art, *art2;
  ulonglong ci;

  art= malloc(6*BAS);
  pb= malloc(sizeof(ulonglong)*BAS);
  art2= malloc(6*BAS);
  pb2= malloc(sizeof(ulonglong)*BAS);
  if (!art || !pb || !art2 || !pb2)
  {
    fprintf(stderr, "out of memory\n");
    free(art);
    free(pb);
    free(art2);
    free(pb2);
    return 1;
  }

  /* Both inputs get the same pattern: byte k holds the low 8 bits of k. */
  for (i=0; i<6*BAS; i++)
    art[i]= (char)i;
  for (i=0; i<6*BAS; i++)
    art2[i]= (char)i;

  /*
    In every loop below ci walks the BAS records and wraps back to 0, so
    the GM iterations keep cycling over the same buffers.
    time_t has no matching printf conversion, so the elapsed time is cast
    to long and printed with %ld.
  */

#ifdef TEST_KORR6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= uint6korr(art+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on korr6-1\n", (long) (t1 - t0));

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= uint6korr(art+ci*6);
    pb2[ci]= uint6korr(art2+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on korr6-2\n", (long) (t1 - t0));
#endif /*KORR6*/

#ifdef TEST_HF_KORR6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= hf_uint6korr(art+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on hf_korr6-1\n", (long) (t1 - t0));

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= hf_uint6korr(art+ci*6);
    pb2[ci]= hf_uint6korr(art2+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on hf_korr6-2\n", (long) (t1 - t0));
#endif /*HF_KORR6*/

#ifdef TEST_MI6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= mi_uint6korr(art+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on mi6-1\n", (long) (t1 - t0));

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    pb[ci]= mi_uint6korr(art+ci*6);
    pb2[ci]= mi_uint6korr(art2+ci*6);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on mi6-2\n", (long) (t1 - t0));
#endif /*MI6*/

#ifdef TEST_HF_MI6
  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    hf_mi_uint6korr(art+ci*6, pb[ci]);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on hf_mi6B-1\n", (long) (t1 - t0));

  ci= 0;
  t0= time(0);
  for (i=0; i<GM; i++)
  {
    if (ci >= BAS)
      ci= 0;
    hf_mi_uint6korr(art+ci*6, pb[ci]);
    hf_mi_uint6korr(art2+ci*6, pb2[ci]);
    ci++;
  }
  t1= time(0);
  printf("elapsed %ld seconds on hf_mi6-2\n", (long) (t1 - t0));
#endif /*HF_MI6*/

  free(art);
  free(pb);
  free(art2);
  free(pb2);
  return 0;
}
Looks ok.
Did you try with:
/*
  Variant that loads 8 bytes at A and masks off the upper two.
  A is parenthesized so that an argument such as art+ci*6 is cast as a
  whole; without the parentheses the cast would bind to art alone and the
  offset would be scaled by sizeof(uint64).
*/
#define hf_uint6korr(A) (*((const uint64 *) (A)) & 0xffffffffffffULL)
That may be faster, at the cost of reading 2 extra bytes, so this is
only safe to use if one knows that there are a few extra bytes in the
buffer.
Can you do a patch to my_global.h and myisampack and add the new
functions there?
What to do:
- All improvements for now only for x64 architecture
- Replace uint6korr(A) and any other function that can be improved
without reading extra bytes with a faster version.
- Improve with faster variants:
mi_uint3korr
mi_uint4korr
mi_uint5korr
mi_uint6korr
mi_uint7korr
mi_uint8korr
uint5korr
uint6korr
uint8korr
If the above suggested version of uint6korr() that reads extra bytes
is notably faster, add a function:
uint6korr_unsafe()
Which of course should map to uint6korr for other architectures.
ok?
Regards,
Monty