代码之家  ›  专栏  ›  技术社区  ›  Pavel P

如何清除 NEON 寄存器中除第一个非零通道(lane)以外的所有通道?

  •  1
  • Pavel P  · 技术社区  · 7 年前

    我在一个 uint32x4_t 类型的 NEON 寄存器里有一个掩码。在这个掩码中,4 个整数里至少有 1 个被置位(例如 0xffffffff),但寄存器中也可能有多个元素被置位。如何确保只保留其中一个被置位的元素?

    在C伪代码中:

    /* Reference semantics: keep the first non-zero lane of m and zero every
     * lane that follows it.  Lanes before the first non-zero lane are already
     * zero, and lane 3 has nothing after it, so only lanes 0..2 are tested. */
    uint32x4_t clearmask(uint32x4_t m)
    {
        if (m[0]) {
            m[3] = 0;
            m[2] = 0;
            m[1] = 0;
            return m;
        }
        if (m[1]) {
            m[3] = 0;
            m[2] = 0;
            return m;
        }
        if (m[2]) {
            m[3] = 0;
        }
        return m;
    }
    

    基本上,我想清除除一个已置位通道以外的所有已置位通道。一个显而易见的、直接的 NEON 实现可以是:

    /*
     * Keep only the first non-zero lane of m and zero every later lane.
     *
     * Each step broadcasts an "is this lane zero?" mask for one lane, forces
     * that lane's own slot to all-ones so the lane itself is left alone, and
     * ANDs the result into m.  The zero test is a compare (vceqq_u32) rather
     * than a bitwise NOT, so any non-zero lane value counts as "set", not only
     * 0xffffffff; for lanes that are exactly 0 or 0xffffffff the two tests
     * give the same mask.
     */
    uint32x4_t cleanmask(uint32x4_t m)
    {
        const uint32x4_t zero = vdupq_n_u32(0);
        uint32x4_t mx;

        /* Lane 0 set -> clear lanes 1..3. */
        mx = vdupq_lane_u32(vget_low_u32(vceqq_u32(m, zero)), 0);
        mx = vsetq_lane_u32(0xffffffff, mx, 0);
        m = vandq_u32(m, mx);

        /* Lane 1 still set (so lane 0 was zero) -> clear lanes 2..3. */
        mx = vdupq_lane_u32(vget_low_u32(vceqq_u32(m, zero)), 1);
        mx = vsetq_lane_u32(0xffffffff, mx, 1);
        m = vandq_u32(m, mx);

        /* Lane 2 still set (so lanes 0 and 1 were zero) -> clear lane 3. */
        mx = vdupq_lane_u32(vget_high_u32(vceqq_u32(m, zero)), 0);
        mx = vsetq_lane_u32(0xffffffff, mx, 2);
        m = vandq_u32(m, mx);

        return m;
    }
    

    如何在Arm Neon中更有效地实现这一点?

    2 回复  |  直到 7 年前
        1
  •  2
  •   Peter Cordes    7 年前

    Very simple

    @ Presumably q0 holds the input mask on entry and the result on exit
    @ (lanes written [a b c d] from lane 0 upward) -- confirm against the caller.
    vceq.u32    q1, q0, #0          @ q1 = per-lane "is zero" mask: [!a !b !c !d]
    vmov.i8     d7, #0xff           @ d7 (the upper half of q3) = all ones
    vext.8      q2, q3, q1, #12     @ q2 = top 4 bytes of q3, then low 12 bytes of q1: [-1 !a !b !c]
    
    vand        q0, q0, q2          @ every lane &= "the lane just before me is zero"
    vand        d1, d1, d2          @ lanes 2,3 (d1 = high half of q0) &= [!a !b] (d2 = low half of q1)
    vand        d1, d1, d4          @ lanes 2,3 &= [-1 !a] (d4 = low half of q2)
    

    aarch64

    // Presumably v0 holds the input mask on entry and the result on exit -- confirm
    // against the caller.  Lane lists below are written high-to-low: v0 = [d c b a].
    cmeq    v1.4s, v0.4s, #0            // v1 = per-lane "is zero" mask: [!d !c !b !a]
    movi    v31.16b, #0xff              // v31 = all ones
    
    ext     v2.16b, v31.16b, v1.16b, #12    // v2 = [!c !b !a -1]  (zero-mask moved up one lane)
    ext     v3.16b, v31.16b, v1.16b, #8     // v3 = [!b !a -1 -1]  (moved up two lanes)
    ext     v4.16b, v31.16b, v1.16b, #4     // v4 = [!a -1 -1 -1]  (moved up three lanes)
    
    and     v0.16b, v0.16b, v2.16b      // lane n &= "lane n-1 is zero"
    and     v0.16b, v0.16b, v3.16b      // lane n &= "lane n-2 is zero"
    and     v0.16b, v0.16b, v4.16b      // lane n &= "lane n-3 is zero"
    

    ext / vext

    v0 = [  d   c   b   a ]
    
    v2 = [ !c  !b  !a  -1 ]
    v3 = [ !b  !a  -1  -1 ]
    v4 = [ !a  -1  -1  -1 ]
    

    d

    c a b


    mvn

        2
  •  1
  •   Peter Cordes    7 年前

    vdupq_lane_u32(vget_low_u32(m), 1); vdup.32 q9, d16[1]

    /*
     * Keep only the first set lane of m, clearing every later lane.
     *
     * XOR-ing a broadcast lane with one of the constants below leaves that
     * lane's own slot unchanged and inverts it in the other three slots, so a
     * single AND then clears the other lanes whenever the chosen lane is set.
     * NOTE: this relies on every lane being exactly 0 or 0xffffffff, as in the
     * question; other non-zero values would leave stray bits behind.
     */
    uint32x4_t cleanmask_xor(uint32x4_t m)
    {
        /*                            lane:    0    1    2    3  */
        const uint32x4_t keep_lane0 = {  0, ~0U, ~0U, ~0U};
        const uint32x4_t keep_lane1 = {~0U,   0, ~0U, ~0U};
        const uint32x4_t keep_lane2 = {~0U, ~0U,   0, ~0U};

        /* [a ~a ~a ~a]: if lane 0 was set, lanes 1..3 become zero. */
        m &= vdupq_lane_u32(vget_low_u32(m), 0) ^ keep_lane0;

        /* Uses the already-updated lane 1, which is zero if lane 0 was set. */
        m &= vdupq_lane_u32(vget_low_u32(m), 1) ^ keep_lane1;

        /* Uses the already-updated lane 2; if it is still set, lane 3 is cleared. */
        m &= vdupq_lane_u32(vget_high_u32(m), 0) ^ keep_lane2;

        return m;
    }
    

    Godbolt

    /* design notes
      [ a   b   c   d ]
      [ a  ~a  ~a  ~a ] 
    
    &:[ a   0   0   0 ]
    or[ 0   b   c   d ]
    
    = [ e   f   g   h  ]
      [ ~f  f   ~f  ~f ]  // not b, because f can be zero when b isn't
    
    = [ i   j   k   l ]
      ...
    */
    

    vmov.32 d1[0], r3 -1 -1U veor vmvn

    #if 1
        // clang sets up the address of each constant separately
        // Three independent vector constants, each with exactly one zero lane
        // (lane a, b and c respectively) and all-ones elsewhere.
        //                 {  a    b    c   d }
        uint32x4_t maska = {  0, ~0U, ~0U, ~0U};
        uint32x4_t maskb = {~0U,   0, ~0U, ~0U};
        uint32x4_t maskc = {~0U, ~0U,   0, ~0U};
    #else
        // Unfinished alternative: a single 6-element table whose overlapping
        // 4-element windows reproduce the three masks above:
        //   maskbuf + 2 -> maska, maskbuf + 1 -> maskb, maskbuf + 0 -> maskc
        static const uint32_t maskbuf[] = 
          { -1U, -1U, 0, -1U, -1U, -1U};
        // unaligned loads.
        // or load one + shuffle?
    #endif