本文介绍了一种在C ++中进行基本的128位整数计算的有效方法?的处理方法,对大家解决问题具有一定的参考价值,需要的朋友们下面随着小编来一起学习吧!

问题描述

几年前,我需要一种在 CUDA 上进行基本 128 位整数运算的方法(见问题"128 bit integer on cuda?")。

现在我有同样的问题,但这次我需要在32位嵌入式系统(Intel Edison)上运行一些基本的128位算术(总和,位移和乘法),不支持128位任何类型。但是,有直接支持的64位整数(unsigned long long int)。



我曾天真地尝试在 CPU 上直接使用上次别人回答我的 asm 代码,但得到了一堆错误。我对 asm 确实没有经验,所以:在有 64 位整数可用的前提下,实现 128 位的加法、乘法和位移,最高效的方式是什么?



非常感谢。

解决方案

更新:由于 OP 还没有接受任何答案 <hint><hint>,我又补充了一些代码。



使用上面讨论的库可能是个好主意。虽然你今天可能只需要几个函数,但最终你可能会发现还需要再加一个,之后又要再加一个,直到你最终在编写、调试和维护自己的 128 位数学库。这是在浪费你的时间和精力。



话虽如此,如果你执意要自己动手实现:



1)你先前问过的cuda问题已经有c代码用于乘法。有没有一些问题吗?



2)位移操作大概不会从使用 asm 中受益,所以在我看来这里用 C 实现同样合理。也许可以采用类似这样的方法?

  /**
 * Portable 128-bit logical left shift (plain C, no inline assembly).
 *
 * The value is held as four 32-bit limbs, x being the least significant
 * and w the most significant.
 *
 * @param a  value to shift
 * @param b  shift count in bits; a count outside 0..127 (including a
 *           negative one) produces an all-zero result
 * @return   a shifted left by b bits, truncated to 128 bits
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
my_uint128_t lshift_uint128(const my_uint128_t a, int b)
{
   my_uint128_t res;
   unsigned int in[4];
   unsigned int out[4] = { 0, 0, 0, 0 };
   /* A negative count becomes a huge unsigned value, so it is handled
      like any other count >= 128: nothing is copied into out[]. */
   const unsigned int count = (unsigned int) b;
   const unsigned int limbs = count / 32;   /* whole limbs to move up */
   const unsigned int bits  = count % 32;   /* residual shift inside a limb */
   unsigned int i;

   in[0] = a.x;
   in[1] = a.y;
   in[2] = a.z;
   in[3] = a.w;

   for (i = limbs; i < 4; i++) {
      out[i] = in[i - limbs] << bits;
      /* Skip the carry-in when bits == 0: shifting a 32-bit value right
         by 32 is undefined behavior. */
      if (bits != 0 && i > limbs) {
         out[i] |= in[i - limbs - 1] >> (32 - bits);
      }
   }

   res.x = out[0];
   res.y = out[1];
   res.z = out[2];
   res.w = out[3];

   return res;
}

更新:由于 Edison 看起来可能支持 SHLD / SHRD,下面给出一个替代方案,它可能比上面的 C 代码性能更好。与所有声称更快的代码一样,你应该自己测试一下。

  /**
 * Double-precision left shift of a 32-bit word (x86 SHLD semantics).
 *
 * @param into  word being shifted; supplies the high part of the result
 * @param from  word whose most significant bits are shifted in
 * @param c     shift count; only the low 5 bits are significant
 * @return      (into << c) | (from >> (32 - c)); `into` when c % 32 == 0
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
static inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   /* Use the instruction unless everything is a compile-time constant,
      in which case the C expression below folds away entirely. */
   if (!(__builtin_constant_p(into) &&
         __builtin_constant_p(from) &&
         __builtin_constant_p(c)))
   {
      __asm__("shld %b3, %2, %0"
              : "=rm" (res)
              : "0" (into), "r" (from), "ic" (c)
              : "cc");
      return res;
   }
#endif

   /* Mirror the hardware: count is taken modulo 32, and a zero count
      must not evaluate from >> 32 (undefined behavior in C). */
   c &= 31;
   if (c == 0) {
      return into;
   }
   res = (into << c) | (from >> (32 - c));

   return res;
}

/**
 * Double-precision right shift of a 32-bit word (x86 SHRD semantics).
 *
 * @param into  word being shifted; supplies the low part of the result
 * @param from  word whose least significant bits are shifted in at the top
 * @param c     shift count; only the low 5 bits are significant
 * @return      (into >> c) | (from << (32 - c)); `into` when c % 32 == 0
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
static inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   /* Use the instruction unless everything is a compile-time constant,
      in which case the C expression below folds away entirely. */
   if (!(__builtin_constant_p(into) &&
         __builtin_constant_p(from) &&
         __builtin_constant_p(c)))
   {
      __asm__("shrd %b3, %2, %0"
              : "=rm" (res)
              : "0" (into), "r" (from), "ic" (c)
              : "cc");
      return res;
   }
#endif

   /* Mirror the hardware: count is taken modulo 32, and a zero count
      must not evaluate from << 32 (undefined behavior in C). */
   c &= 31;
   if (c == 0) {
      return into;
   }
   res = (into >> c) | (from << (32 - c));

   return res;
}

my_uint128_t lshift_uint128(const my_uint128_t a,unsigned int b)
{
my_uint128_t res;

if(b res.x = a.x<< b;
res.y = __shld(a.y,a.x,b);
res.z = __shld(a.z,a.y,b);
res.w = __shld(a.w,a.z,b);
} else if(b res.x = 0;
res.y = a.x<< (b-32);
res.z = __shld(a.y,a.x,b - 32);
res.w = __shld(a.z,a.y,b - 32);
} else if(b res.x = 0;
res.y = 0;
res.z = a.x< (b-64);
res.w = __shld(a.y,a.x,b - 64);
} else if(b res.x = 0;
res.y = 0;
res.z = 0;
res.w = a.x<< (b-96);
} else {
memset(& res,0,sizeof(res));
}

return res;
}

my_uint128_t rshift_uint128(const my_uint128_t a,unsigned int b)
{
my_uint128_t res;

if(b res.x = __shrd(a.x,a.y,b);
res.y = __shrd(a.y,a.z,b);
res.z = __shrd(a.z,a.w,b);
res.w = a.w>> b;
} else if(b< 64){
res.x = __shrd(a.y,a.z,b - 32);
res.y = __shrd(a.z,a.w,b - 32);
res.z = a.w>> (b-32);
res.w = 0;
} else if(b res.x = __shrd(a.z,a.w,b-64);
res.y = a.w>> (b-64);
res.z = 0;
res.w = 0;
} else if(b res.x = a.w> (b-96);
res.y = 0;
res.z = 0;
res.w = 0;
} else {
memset(& res,0,sizeof(res));
}

return res;
}

3)添加可能会受益于asm。您可以尝试这样:

  /**
 * 128-bit unsigned integer stored as four limbs.
 *
 * x is the least significant limb and w the most significant: the add
 * routines start their carry chain at x, and the left shift moves bits
 * from x up into y, z and w.
 *
 * The typedef makes `my_uint128_t` usable as a type name in C as well as
 * in C++.
 *
 * NOTE(review): the arithmetic in this file assumes unsigned int is
 * 32 bits wide.
 */
typedef struct my_uint128_t
{
   unsigned int x;   /* lowest limb */
   unsigned int y;
   unsigned int z;
   unsigned int w;   /* highest limb */
} my_uint128_t;

my_uint128_t add_uint128(const my_uint128_t a,const my_uint128_t b)
{
my_uint128_t res;

asm(addl%5,%[resx] \\\
\t
adcl%7,%[resy] \\\
\t
adcl%9,%[resz] \\\
\t
adcl%11,%[resw] \\\
\t
:[resx]=& (res.x),[resy]=& r(res.y),
[resz]=& r(res.z),[resw]=& r
:%0(ax),irm(bx),
%1(ay),irm (az),irm(bz),
%3(aw),irm(bw)
:cc

return res;
}

这段代码是我匆忙写出来的,所以使用风险自负。我没有 Edison,但它在 x86 上可以工作。



更新:如果你只是做累加(即 to += from,而不是上面代码那样的 c = a + b),下面这段代码可能更适合你:

  /**
 * In-place 128-bit accumulation: *to += from, modulo 2^128.
 *
 * @param to    accumulator, read and updated in place
 * @param from  value to add
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
static inline
void addto_uint128(my_uint128_t *to, const my_uint128_t from)
{
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   /* "+&r": each limb of *to is both read and written, and must not
      share a register with any `from` operand. */
   __asm__ ("addl %[fromx], %[tox]\n\t"
            "adcl %[fromy], %[toy]\n\t"
            "adcl %[fromz], %[toz]\n\t"
            "adcl %[fromw], %[tow]\n\t"
            : [tox] "+&r" (to->x), [toy] "+&r" (to->y),
              [toz] "+&r" (to->z), [tow] "+&r" (to->w)
            : [fromx] "irm" (from.x), [fromy] "irm" (from.y),
              [fromz] "irm" (from.z), [fromw] "irm" (from.w)
            : "cc");
#else
   unsigned long long acc;

   acc = (unsigned long long) to->x + from.x;
   to->x = (unsigned int) acc;
   acc = (acc >> 32) + to->y + from.y;
   to->y = (unsigned int) acc;
   acc = (acc >> 32) + to->z + from.z;
   to->z = (unsigned int) acc;
   acc = (acc >> 32) + to->w + from.w;
   to->w = (unsigned int) acc;
#endif
}


Some years ago I needed a way to do some basic 128 bit integer math with Cuda (see "128 bit integer on cuda?"). Now I am having the same problem, but this time I need to run some basic 128 bit arithmetic (sums, bitshifts and multiplications) on a 32 bit embedded system (Intel Edison) that does not support 128 bits of any kind. There are, however, 64 bit integers supported directly (unsigned long long int).

I tried naively to use the asm code that was answered to me last time on the CPU, but I got a bunch of errors. I am really not experienced with asm, so: what is the most efficient way, having 64 bit integers, to implement additions, multiplications and bit shifting in 128 bits?

Thank you very much

解决方案

Update: Since the OP hasn't accepted an answer yet <hint><hint>, I've attached a bit more code.

Using the libraries discussed above is probably a good idea. While you might only need a few functions today, eventually you may find that you need one more. Then one more after that. Until eventually you end up writing, debugging and maintaining your own 128bit math library. Which is a waste of your time and effort.

That said. If you are determined to roll your own:

1) The cuda question you asked previously already has c code for multiplication. Was there some problem with it?

2) The shift probably won't benefit from using asm, so a c solution makes sense to me here as well. Maybe an approach like this?

/**
 * Portable 128-bit logical left shift (plain C, no inline assembly).
 *
 * The value is held as four 32-bit limbs, x being the least significant
 * and w the most significant.
 *
 * @param a  value to shift
 * @param b  shift count in bits; a count outside 0..127 (including a
 *           negative one) produces an all-zero result
 * @return   a shifted left by b bits, truncated to 128 bits
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
my_uint128_t lshift_uint128 (const my_uint128_t a, int b)
{
   my_uint128_t res;
   unsigned int in[4];
   unsigned int out[4] = { 0, 0, 0, 0 };
   /* A negative count becomes a huge unsigned value, so it is handled
      like any other count >= 128: nothing is copied into out[]. */
   const unsigned int count = (unsigned int) b;
   const unsigned int limbs = count / 32;   /* whole limbs to move up */
   const unsigned int bits  = count % 32;   /* residual shift inside a limb */
   unsigned int i;

   in[0] = a.x;
   in[1] = a.y;
   in[2] = a.z;
   in[3] = a.w;

   for (i = limbs; i < 4; i++) {
      out[i] = in[i - limbs] << bits;
      /* Skip the carry-in when bits == 0: shifting a 32-bit value right
         by 32 is undefined behavior. */
      if (bits != 0 && i > limbs) {
         out[i] |= in[i - limbs - 1] >> (32 - bits);
      }
   }

   res.x = out[0];
   res.y = out[1];
   res.z = out[2];
   res.w = out[3];

   return res;
}

Update: Since it appears that the Edison may support SHLD/SHRD, here's an alternative which might be more performant than the 'c' code above. As with all code purporting to be faster, you should test it.

/**
 * Double-precision left shift of a 32-bit word (x86 SHLD semantics).
 *
 * Shifts `into` left by `c` bits and fills the vacated low bits with the
 * top `c` bits of `from`.
 *
 * @param into  word being shifted; supplies the high part of the result
 * @param from  word whose most significant bits are shifted in
 * @param c     shift count; only the low 5 bits are significant
 * @return      (into << c) | (from >> (32 - c)); `into` when c % 32 == 0
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
static inline
unsigned int __shld(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   /* Use the instruction unless everything is a compile-time constant,
      in which case the C expression below folds away entirely. */
   if (!(__builtin_constant_p(into) &&
         __builtin_constant_p(from) &&
         __builtin_constant_p(c)))
   {
      __asm__("shld %b3, %2, %0"
              : "=rm" (res)
              : "0" (into), "r" (from), "ic" (c)
              : "cc");
      return res;
   }
#endif

   /* Mirror the hardware: count is taken modulo 32, and a zero count
      must not evaluate from >> 32 (undefined behavior in C). */
   c &= 31;
   if (c == 0) {
      return into;
   }
   res = (into << c) | (from >> (32 - c));

   return res;
}

/**
 * Double-precision right shift of a 32-bit word (x86 SHRD semantics).
 *
 * Shifts `into` right by `c` bits and fills the vacated high bits with
 * the bottom `c` bits of `from`.
 *
 * @param into  word being shifted; supplies the low part of the result
 * @param from  word whose least significant bits are shifted in at the top
 * @param c     shift count; only the low 5 bits are significant
 * @return      (into >> c) | (from << (32 - c)); `into` when c % 32 == 0
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
static inline
unsigned int __shrd(unsigned int into, unsigned int from, unsigned int c)
{
   unsigned int res;

#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   /* Use the instruction unless everything is a compile-time constant,
      in which case the C expression below folds away entirely. */
   if (!(__builtin_constant_p(into) &&
         __builtin_constant_p(from) &&
         __builtin_constant_p(c)))
   {
      __asm__("shrd %b3, %2, %0"
              : "=rm" (res)
              : "0" (into), "r" (from), "ic" (c)
              : "cc");
      return res;
   }
#endif

   /* Mirror the hardware: count is taken modulo 32, and a zero count
      must not evaluate from << 32 (undefined behavior in C). */
   c &= 31;
   if (c == 0) {
      return into;
   }
   res = (into >> c) | (from << (32 - c));

   return res;
}

/**
 * 128-bit logical left shift built on __shld.
 *
 * Limb x is the least significant and w the most significant. The count
 * is split into a number of whole limbs to skip and a residual bit shift;
 * the lowest surviving limb is a plain shift and each higher limb takes
 * its carry-in bits from the limb below through __shld.
 *
 * @param a  value to shift
 * @param b  shift count in bits; 128 or more yields zero
 * @return   a shifted left by b bits, truncated to 128 bits
 */
my_uint128_t lshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t shifted;
   const unsigned int skipped_limbs = b / 32;
   const unsigned int bit_count     = b % 32;

   /* Start from zero; each case below fills only the limbs that
      actually receive bits. */
   shifted.x = 0;
   shifted.y = 0;
   shifted.z = 0;
   shifted.w = 0;

   switch (skipped_limbs) {
   case 0:
      shifted.x = a.x << bit_count;
      shifted.y = __shld(a.y, a.x, bit_count);
      shifted.z = __shld(a.z, a.y, bit_count);
      shifted.w = __shld(a.w, a.z, bit_count);
      break;
   case 1:
      shifted.y = a.x << bit_count;
      shifted.z = __shld(a.y, a.x, bit_count);
      shifted.w = __shld(a.z, a.y, bit_count);
      break;
   case 2:
      shifted.z = a.x << bit_count;
      shifted.w = __shld(a.y, a.x, bit_count);
      break;
   case 3:
      shifted.w = a.x << bit_count;
      break;
   default:
      /* b >= 128: every bit has been shifted out. */
      break;
   }

   return shifted;
}

/**
 * 128-bit logical right shift built on __shrd.
 *
 * Limb x is the least significant and w the most significant. The count
 * is split into a number of whole limbs to drop and a residual bit shift;
 * the highest surviving limb is a plain shift and each lower limb takes
 * its carry-in bits from the limb above through __shrd.
 *
 * @param a  value to shift
 * @param b  shift count in bits; 128 or more yields zero
 * @return   a shifted right by b bits, zero-filled from the top
 */
my_uint128_t rshift_uint128 (const my_uint128_t a, unsigned int b)
{
   my_uint128_t shifted;
   const unsigned int dropped_limbs = b / 32;
   const unsigned int bit_count     = b % 32;

   /* Start from zero; each case below fills only the limbs that
      actually receive bits. */
   shifted.x = 0;
   shifted.y = 0;
   shifted.z = 0;
   shifted.w = 0;

   switch (dropped_limbs) {
   case 0:
      shifted.x = __shrd(a.x, a.y, bit_count);
      shifted.y = __shrd(a.y, a.z, bit_count);
      shifted.z = __shrd(a.z, a.w, bit_count);
      shifted.w = a.w >> bit_count;
      break;
   case 1:
      shifted.x = __shrd(a.y, a.z, bit_count);
      shifted.y = __shrd(a.z, a.w, bit_count);
      shifted.z = a.w >> bit_count;
      break;
   case 2:
      shifted.x = __shrd(a.z, a.w, bit_count);
      shifted.y = a.w >> bit_count;
      break;
   case 3:
      shifted.x = a.w >> bit_count;
      break;
   default:
      /* b >= 128: every bit has been shifted out. */
      break;
   }

   return shifted;
}

3) The addition may benefit from asm. You could try this:

/**
 * 128-bit unsigned integer stored as four limbs.
 *
 * x is the least significant limb and w the most significant: the add
 * routines start their carry chain at x, and the left shift moves bits
 * from x up into y, z and w.
 *
 * The typedef makes `my_uint128_t` usable as a type name in C as well as
 * in C++.
 *
 * NOTE(review): the arithmetic in this file assumes unsigned int is
 * 32 bits wide.
 */
typedef struct my_uint128_t
{
   unsigned int x;   /* lowest limb */
   unsigned int y;
   unsigned int z;
   unsigned int w;   /* highest limb */
} my_uint128_t;

my_uint128_t add_uint128 (const my_uint128_t a, const my_uint128_t b)
{
   my_uint128_t res;

    asm ("addl %5, %[resx]\n\t"
         "adcl %7, %[resy]\n\t"
         "adcl %9, %[resz]\n\t"
         "adcl %11, %[resw]\n\t"
         : [resx] "=&r" (res.x), [resy] "=&r" (res.y), 
           [resz] "=&r" (res.z), [resw] "=&r" (res.w)
         : "%0"(a.x), "irm"(b.x), 
           "%1"(a.y), "irm"(b.y), 
           "%2"(a.z), "irm"(b.z), 
           "%3"(a.w), "irm"(b.w)
         : "cc");

   return res;
}

I just dashed this off, so use at your own risk. I don't have an Edison, but this works with x86.

Update: If you are just doing accumulation (think to += from instead of the code above which is c = a + b), this code might serve you better:

/**
 * In-place 128-bit accumulation: *to += from, modulo 2^128.
 *
 * On x86 with a GNU-compatible compiler this is one ADD followed by three
 * ADC instructions operating directly on the limbs of *to. Elsewhere a
 * portable 64-bit accumulator carries between limbs.
 *
 * @param to    accumulator, read and updated in place
 * @param from  value to add
 *
 * NOTE(review): assumes unsigned int is 32 bits wide.
 */
static inline
void addto_uint128 (my_uint128_t *to, const my_uint128_t from)
{
#if defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
   /* "+&r": each limb of *to is both read and written, and must not
      share a register with any `from` operand. */
   __asm__ ("addl %[fromx], %[tox]\n\t"
            "adcl %[fromy], %[toy]\n\t"
            "adcl %[fromz], %[toz]\n\t"
            "adcl %[fromw], %[tow]\n\t"
            : [tox] "+&r" (to->x), [toy] "+&r" (to->y),
              [toz] "+&r" (to->z), [tow] "+&r" (to->w)
            : [fromx] "irm" (from.x), [fromy] "irm" (from.y),
              [fromz] "irm" (from.z), [fromw] "irm" (from.w)
            : "cc");
#else
   unsigned long long acc;

   acc = (unsigned long long) to->x + from.x;
   to->x = (unsigned int) acc;
   acc = (acc >> 32) + to->y + from.y;
   to->y = (unsigned int) acc;
   acc = (acc >> 32) + to->z + from.z;
   to->z = (unsigned int) acc;
   acc = (acc >> 32) + to->w + from.w;
   to->w = (unsigned int) acc;
#endif
}

这篇关于一种在C ++中进行基本的128位整数计算的有效方法?的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!

10-28 21:54