/* msm-swrender.c
 *
 * Copyright (c) 2009, Code Aurora Forum. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the name of Code Aurora nor
 *       the names of its contributors may be used to endorse or promote
 *       products derived from this software without specific prior written
 *       permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NON-INFRINGEMENT ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include "msm-render.h"

// Shared software blit code.
#include "msm.h"
#include "msm-drm.h"
#include "msm-swblits.h"

// Should only be needed for working around a kernel issue that fails to save/restore Neon registers for userspace signal handlers.
#define MASK_SIGNALS (TRUE)
#if (MASK_SIGNALS)
#include "signal.h"  // Needed only for masking signals.
#endif // (MASK_SIGNALS)


/* Return TRUE if the source and destination rectangles of the blit request
 * overlap, i.e. they refer to the same surface and share at least one pixel.
 *
 *   blit - blit request; its src/dst surfaces and srcRect/dstRect are read.
 *   bpp  - unused; kept so that the signature stays the same for callers.
 *
 * Two surfaces are treated as the same one when their priv[0] entries are
 * equal.  Rectangles are treated as half-open ranges [x, x+w) x [y, y+h), so
 * rectangles that only touch along an edge do NOT overlap.  (The comparison
 * used to be inclusive on the source's right/bottom edge only, which made the
 * result depend on which side the rectangles touched.)
 */
static inline BOOL
isOverlap(MSMBlitRec *blit, int bpp)
{
    (void) bpp;

    /* Different underlying surfaces can never overlap. */
    if (blit->src->priv[0] != blit->dst->priv[0])
	return FALSE;

    int src_x1 = blit->srcRect->x;
    int src_x2 = src_x1 + blit->srcRect->w;
    int src_y1 = blit->srcRect->y;
    int src_y2 = src_y1 + blit->srcRect->h;

    int dst_x1 = blit->dstRect->x;
    int dst_x2 = dst_x1 + blit->dstRect->w;
    int dst_y1 = blit->dstRect->y;
    int dst_y2 = dst_y1 + blit->dstRect->h;

    /* x2/y2 are one past the last pixel, so both bounds must be strict. */
    return (src_x2 > dst_x1 && src_x1 < dst_x2)      // 'X' coordinates overlap
	&& (src_y2 > dst_y1 && src_y1 < dst_y2);  // and 'Y' coordinates overlap.
}


/* Decide whether a copy blit may be performed by the MDP.
 *
 * Returns FALSE when:
 *   - MDP_BLIT_REQ_VERSION is below 2 and either surface is flagged
 *     MSM_BLIT_GEM, or
 *   - either surface is flagged MSM_BLIT_GEM and its buffer object reports
 *     memory type MSM_DRM_MEMTYPE_KMEM, or
 *   - isOverlap() reports that the source and destination overlap.
 * Returns TRUE otherwise.
 */
BOOL
isCopyMDPCompatible(MSMBlitRec *blit, int bpp)
{
#if MDP_BLIT_REQ_VERSION < 2
    /* With this blit request version no GEM surface is accepted at all. */
    if (blit->src->flags == MSM_BLIT_GEM)
	return FALSE;
    if (blit->dst->flags == MSM_BLIT_GEM)
	return FALSE;
#endif

    /* Can't use the hardware if the src or the dest surface is in kmem.
     * The source is examined first, then the destination. */
    if ((blit->src->flags == MSM_BLIT_GEM
	 && msm_drm_bo_get_memtype((struct msm_drm_bo *) blit->src->priv[0]) == MSM_DRM_MEMTYPE_KMEM)
	|| (blit->dst->flags == MSM_BLIT_GEM
	    && msm_drm_bo_get_memtype((struct msm_drm_bo *) blit->dst->priv[0]) == MSM_DRM_MEMTYPE_KMEM))
	return FALSE;

    /* What remains is compatible exactly when the rectangles do not overlap. */
    BOOL overlapping = isOverlap(blit, bpp);
    return !overlapping;
}


/* Alignment check macro: evaluates to non-zero when BOTH pointers, taken modulo     */
/* ALIGNMENT_BYTE_SIZE, are equal to REQUIRED_ALIGNMENT (0 means "exactly aligned"). */
/* The arithmetic is done in uintptr_t: converting a pointer to int yields a         */
/* negative value for addresses with the top bit set, and the remainder of a         */
/* negative value is never positive, so a non-zero REQUIRED_ALIGNMENT could not      */
/* match for such addresses.                                                         */
#define SW_CHECK_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,src,REQUIRED_ALIGNMENT) \
   ((((uintptr_t) (dst)) % (uintptr_t) (ALIGNMENT_BYTE_SIZE)) == (uintptr_t) (REQUIRED_ALIGNMENT) \
    && (((uintptr_t) (src)) % (uintptr_t) (ALIGNMENT_BYTE_SIZE)) == (uintptr_t) (REQUIRED_ALIGNMENT))

/* Alignment check macro: evaluates to non-zero when BOTH pointers, taken modulo           */
/* ALIGNMENT_BYTE_SIZE, are equal to REQUIRED_ALIGNMENT and BOTH pitches are multiples of  */
/* ALIGNMENT_BYTE_SIZE (the sign of a pitch is ignored).                                   */
/* (Having the pitches aligned, as well as the pointers, ensures that all pointers when    */
/* incremented by the pitches will still have the same alignment.)                         */
/* The pointer arithmetic is done in uintptr_t: converting a pointer to int yields a       */
/* negative value for addresses with the top bit set, and the remainder of a negative      */
/* value is never positive, so a non-zero REQUIRED_ALIGNMENT could not match for them.     */
#define SW_CHECK_PITCHED_ALIGNMENT(ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,REQUIRED_ALIGNMENT) \
   ((((uintptr_t) (dst)) % (uintptr_t) (ALIGNMENT_BYTE_SIZE)) == (uintptr_t) (REQUIRED_ALIGNMENT) \
    && (((uintptr_t) (src)) % (uintptr_t) (ALIGNMENT_BYTE_SIZE)) == (uintptr_t) (REQUIRED_ALIGNMENT) \
    && (abs(dpitch) % (ALIGNMENT_BYTE_SIZE)) == 0 \
    && (abs(spitch) % (ALIGNMENT_BYTE_SIZE)) == 0)


/* Copy a row of 16bpp pixels, for fixed-size widths.                                    */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */
/*                                                                                       */
/*   dst, src - first pixel of the destination / source row.                             */
/*   w        - width of the row in pixels; only the widths listed below are handled.    */
/*   xdir     - only consulted when a width is split into two pieces: when >= 0 the      */
/*              lower-addressed piece is copied first, otherwise the higher-addressed    */
/*              piece is copied first (presumably so that overlapping rows are copied    */
/*              in a safe order -- NOTE(review): confirm against the callers).           */
/*                                                                                       */
/* Returns TRUE when the row was copied, FALSE when this width/alignment combination is  */
/* not handled here (nothing has been written in that case):                             */
/*   - w = 0..8 and w = 16: always copied, whatever the alignment.                        */
/*   - w = 32: copied only if both pointers pass the QUAD_WORD or DOUBLE_WORD check.     */
/*   - w = 64: copied only if both pointers pass the QUAD_WORD check.                    */
/*   - any other w: FALSE.                                                               */
/*                                                                                       */
/* Every branch that copies directly loads all of its source values into locals before   */
/* it performs the first store.                                                          */
static inline BOOL
swCopy16BppSmallFixedWidths1Row_Unaligned(unsigned char *dst, unsigned char *src, int w, int xdir)
{
   // Try to copy the following pixels using 16-bit alignment, or higher alignments if available.
   // Also, unroll loops as much as possible to prevent stores from interfering with subsequent loads.
   switch(w) {
      // NOTE: Several callers of this code assume that all calls with w<=8 will succeed and return TRUE.
      case  0: return TRUE;
               break;
      case  1: {
                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
                  return TRUE;
               }
               break;
      case  2:  if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
                  return TRUE;}
               else {
                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
                  *(uint16_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
                  return TRUE;
               }
               break;
      // Width 3: one 32-bit plus one 16-bit access, ordered so that the 32-bit access is
      // the one that lands on the word boundary; otherwise three 16-bit accesses.
      case  3: if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32_t src1  = *(uint32_t *) (src+0);
                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0)                     = src1;
                  *(uint16_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
                  return TRUE;
               } else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,2)){
                  uint16_t src1  = *(uint16_t *) (src+0);
                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0)                     = src1;
                  *(uint32_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
                  return TRUE;
               } else {
                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
                  uint16_t src3  = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
                  *(uint16_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
                  *(uint16_t *) (dst+2*BYTES_PER_UINT16_T)  = src3;
                  return TRUE;
               }
               break;
      case  4: if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
                  return TRUE;
               }
               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
                  return TRUE;
               }
               else {
                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
                  uint16_t src3  = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
                  uint16_t src4  = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
                  *(uint16_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
                  *(uint16_t *) (dst+2*BYTES_PER_UINT16_T)  = src3;
                  *(uint16_t *) (dst+3*BYTES_PER_UINT16_T)  = src4;
                  return TRUE;
               }
               break;
      // Widths 5..7 are split into two smaller fixed-width copies; xdir selects which piece goes first.
      // (The results of the nested calls are not checked: widths 1..4 always return TRUE.)
      case  5: if (xdir >= 0) {
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir);
                  return TRUE;
               } else {
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir);
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
                  return TRUE;
               }
               break;
      // Width 6: split as 2+4 when both pointers sit two pixels past a double-word boundary
      // (presumably so that the 4-pixel piece starts on that boundary), otherwise as 4+2.
      case  6: if (xdir >= 0)
               {
                  if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,2 * BYTES_PER_UINT16_T)) {
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 2, xdir);
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir);
                     return TRUE;
                  } else {
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir);
                     return TRUE;
                  }
               } else {
                  if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,2 * BYTES_PER_UINT16_T)) {
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir);
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 2, xdir);
                     return TRUE;
                  } else {
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir);
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
                     return TRUE;
                  }
               }
               break;
      case  7: if (xdir >= 0) {
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
                  return TRUE;
               } else {
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir);
                  swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 4, xdir);
                  return TRUE;
               }
               break;
      // Widths 8 and up: try the widest access first (NEON 128-bit vector, then 64-bit, then 32-bit).
      case  8: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
                  return TRUE;
               }
               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
                  return TRUE;
               }
               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
                  uint32_t src3  = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
                  uint32_t src4  = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
                  *(uint32_t *) (dst+2*BYTES_PER_UINT32_T)  = src3;
                  *(uint32_t *) (dst+3*BYTES_PER_UINT32_T)  = src4;
                  return TRUE;
               }
               else {
                  uint16_t src1  = *(uint16_t *) (src+0*BYTES_PER_UINT16_T);
                  uint16_t src2  = *(uint16_t *) (src+1*BYTES_PER_UINT16_T);
                  uint16_t src3  = *(uint16_t *) (src+2*BYTES_PER_UINT16_T);
                  uint16_t src4  = *(uint16_t *) (src+3*BYTES_PER_UINT16_T);
                  uint16_t src5  = *(uint16_t *) (src+4*BYTES_PER_UINT16_T);
                  uint16_t src6  = *(uint16_t *) (src+5*BYTES_PER_UINT16_T);
                  uint16_t src7  = *(uint16_t *) (src+6*BYTES_PER_UINT16_T);
                  uint16_t src8  = *(uint16_t *) (src+7*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*BYTES_PER_UINT16_T)  = src1;
                  *(uint16_t *) (dst+1*BYTES_PER_UINT16_T)  = src2;
                  *(uint16_t *) (dst+2*BYTES_PER_UINT16_T)  = src3;
                  *(uint16_t *) (dst+3*BYTES_PER_UINT16_T)  = src4;
                  *(uint16_t *) (dst+4*BYTES_PER_UINT16_T)  = src5;
                  *(uint16_t *) (dst+5*BYTES_PER_UINT16_T)  = src6;
                  *(uint16_t *) (dst+6*BYTES_PER_UINT16_T)  = src7;
                  *(uint16_t *) (dst+7*BYTES_PER_UINT16_T)  = src8;
                  return TRUE;
               }
               break;
      case 16: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
                  return TRUE;
               }
               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
                  uint64_t src3  = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
                  uint64_t src4  = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
                  *(uint64_t *) (dst+2*BYTES_PER_UINT64_T)  = src3;
                  *(uint64_t *) (dst+3*BYTES_PER_UINT64_T)  = src4;
                  return TRUE;
               }
               else if (SW_CHECK_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32_t src1  = *(uint32_t *) (src+0*BYTES_PER_UINT32_T);
                  uint32_t src2  = *(uint32_t *) (src+1*BYTES_PER_UINT32_T);
                  uint32_t src3  = *(uint32_t *) (src+2*BYTES_PER_UINT32_T);
                  uint32_t src4  = *(uint32_t *) (src+3*BYTES_PER_UINT32_T);
                  uint32_t src5  = *(uint32_t *) (src+4*BYTES_PER_UINT32_T);
                  uint32_t src6  = *(uint32_t *) (src+5*BYTES_PER_UINT32_T);
                  uint32_t src7  = *(uint32_t *) (src+6*BYTES_PER_UINT32_T);
                  uint32_t src8  = *(uint32_t *) (src+7*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*BYTES_PER_UINT32_T)  = src1;
                  *(uint32_t *) (dst+1*BYTES_PER_UINT32_T)  = src2;
                  *(uint32_t *) (dst+2*BYTES_PER_UINT32_T)  = src3;
                  *(uint32_t *) (dst+3*BYTES_PER_UINT32_T)  = src4;
                  *(uint32_t *) (dst+4*BYTES_PER_UINT32_T)  = src5;
                  *(uint32_t *) (dst+5*BYTES_PER_UINT32_T)  = src6;
                  *(uint32_t *) (dst+6*BYTES_PER_UINT32_T)  = src7;
                  *(uint32_t *) (dst+7*BYTES_PER_UINT32_T)  = src8;
                  return TRUE;
               }
               else {
                  // Don't bother unrolling loops here, since that won't help for more than around 8 operations.
                  // Instead, just call multiple fixed functions.
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
                  } else {
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir);
                     swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir);
                  }
                  return TRUE;
               }
               break;
      // Width 32 has no word-aligned or 16-bit fallback: anything below double-word alignment
      // leaves the switch and returns FALSE.
      case 32: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
                  vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
                  vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
                  return TRUE;
               }
               else if (SW_CHECK_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint64_t src1  = *(uint64_t *) (src+0*BYTES_PER_UINT64_T);
                  uint64_t src2  = *(uint64_t *) (src+1*BYTES_PER_UINT64_T);
                  uint64_t src3  = *(uint64_t *) (src+2*BYTES_PER_UINT64_T);
                  uint64_t src4  = *(uint64_t *) (src+3*BYTES_PER_UINT64_T);
                  uint64_t src5  = *(uint64_t *) (src+4*BYTES_PER_UINT64_T);
                  uint64_t src6  = *(uint64_t *) (src+5*BYTES_PER_UINT64_T);
                  uint64_t src7  = *(uint64_t *) (src+6*BYTES_PER_UINT64_T);
                  uint64_t src8  = *(uint64_t *) (src+7*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*BYTES_PER_UINT64_T)  = src1;
                  *(uint64_t *) (dst+1*BYTES_PER_UINT64_T)  = src2;
                  *(uint64_t *) (dst+2*BYTES_PER_UINT64_T)  = src3;
                  *(uint64_t *) (dst+3*BYTES_PER_UINT64_T)  = src4;
                  *(uint64_t *) (dst+4*BYTES_PER_UINT64_T)  = src5;
                  *(uint64_t *) (dst+5*BYTES_PER_UINT64_T)  = src6;
                  *(uint64_t *) (dst+6*BYTES_PER_UINT64_T)  = src7;
                  *(uint64_t *) (dst+7*BYTES_PER_UINT64_T)  = src8;
                  return TRUE;
               }
               break;
      // Width 64 is only handled with quad-word alignment; otherwise the function returns FALSE.
      case 64: if (SW_CHECK_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,0)) {
                  uint32x4_t src1 = vld1q_u32((uint32_t *)(src+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2 = vld1q_u32((uint32_t *)(src+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3 = vld1q_u32((uint32_t *)(src+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4 = vld1q_u32((uint32_t *)(src+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src5 = vld1q_u32((uint32_t *)(src+4*BYTES_PER_UINT32X4_T));
                  uint32x4_t src6 = vld1q_u32((uint32_t *)(src+5*BYTES_PER_UINT32X4_T));
                  uint32x4_t src7 = vld1q_u32((uint32_t *)(src+6*BYTES_PER_UINT32X4_T));
                  uint32x4_t src8 = vld1q_u32((uint32_t *)(src+7*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*BYTES_PER_UINT32X4_T),src1);
                  vst1q_u32((uint32_t *)(dst+1*BYTES_PER_UINT32X4_T),src2);
                  vst1q_u32((uint32_t *)(dst+2*BYTES_PER_UINT32X4_T),src3);
                  vst1q_u32((uint32_t *)(dst+3*BYTES_PER_UINT32X4_T),src4);
                  vst1q_u32((uint32_t *)(dst+4*BYTES_PER_UINT32X4_T),src5);
                  vst1q_u32((uint32_t *)(dst+5*BYTES_PER_UINT32X4_T),src6);
                  vst1q_u32((uint32_t *)(dst+6*BYTES_PER_UINT32X4_T),src7);
                  vst1q_u32((uint32_t *)(dst+7*BYTES_PER_UINT32X4_T),src8);
                  return TRUE;
               }
               break;
   }

   // Unsupported width, or a 32/64-pixel row whose alignment matched none of the branches above.
   return FALSE;
}



/* Copy two rows of 16bpp pixels, for fixed-size widths. */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */
static inline BOOL
swCopy16BppSmallFixedWidths2Rows_Unaligned(unsigned char *dst, unsigned char *src, int w, int xdir, int dpitch, int spitch)
{
   // Try to copy the following pixels using 16-bit alignment, or higher alignments if available.
   // Also, unroll loops as much as possible to prevent stores from interfering with subsequent loads.
   switch(w) {
      // NOTE: Several callers of this code assume that all calls with w<=8 will succeed and return TRUE.
      case  0: return TRUE;
               break;
      case  1: {
                  uint16_t src1a  = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src1b  = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  return TRUE;
               }
               break;
      case  2: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a  = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src1b  = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  return TRUE;
               }
               else {
                  uint16_t src1a  = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a  = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1b  = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b  = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  return TRUE;
               }
               break;
      case  3: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0)                     = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+1*dpitch+0)                     = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0)                     = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+1*dpitch+0)                     = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  return TRUE;
               }
               else {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
                  return TRUE;
               }
               break;
      case  4: if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  return TRUE;
               }
               else {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
                  return TRUE;
               }
               break;
      case  5: if (xdir >= 0) {
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch);
                  return TRUE;
               } else {
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  return TRUE;
               }
               break;
      case  6: if (xdir >= 0)
               {
                  if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch);
                  }
               } else {
                  if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  }
               }
               return TRUE;
               break;
      case  7: if (xdir >= 0) {
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
               } else {
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
               }
               return TRUE;
               break;
      case  8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
                  return TRUE;
               }
               else {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
                  uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
                  uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
                  uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
                  uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
                  uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
                  uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
                  *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T)  = src5a;
                  *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T)  = src6a;
                  *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T)  = src7a;
                  *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T)  = src8a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
                  *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T)  = src5b;
                  *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T)  = src6b;
                  *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T)  = src7b;
                  *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T)  = src8b;
                  return TRUE;
               }
               break;
      case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT32_T)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
                  uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
                  uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
                  uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
                  uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
                  *(uint32_t *) (dst+0*dpitch+0)                                        = src1a;
                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3a;
                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5a;
                  *(uint32_t *) (dst+1*dpitch+0)                                        = src1b;
                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2b;
                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3b;
                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
                  uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
                  uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
                  uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
                  uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
                  uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
                  uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
                  *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T)  = src5a;
                  *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T)  = src6a;
                  *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T)  = src7a;
                  *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T)  = src8a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
                  *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T)  = src5b;
                  *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T)  = src6b;
                  *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T)  = src7b;
                  *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T)  = src8b;
                  return TRUE;
               }
               else {
                  // Don't bother unrolling loops, since that won't help for more than around 8 operations.
                  // Instead, just call multiple fixed functions.
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               break;
      case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                           src + 0,                            4, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T,      src + (4)*BYTES_PER_UINT16_T,      16, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T,   src + (16+4)*BYTES_PER_UINT16_T,    8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T,  4, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+16+4)*BYTES_PER_UINT16_T, src + (8+16+4)*BYTES_PER_UINT16_T,  4, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (16+4)*BYTES_PER_UINT16_T,   src + (16+4)*BYTES_PER_UINT16_T,    8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4)*BYTES_PER_UINT16_T,      src + (4)*BYTES_PER_UINT16_T,      16, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                           src + 0,                            4, xdir, dpitch, spitch);
                   }
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
                  uint64_t src5a = *(uint64_t *) (src+0*spitch+4*BYTES_PER_UINT64_T);
                  uint64_t src6a = *(uint64_t *) (src+0*spitch+5*BYTES_PER_UINT64_T);
                  uint64_t src7a = *(uint64_t *) (src+0*spitch+6*BYTES_PER_UINT64_T);
                  uint64_t src8a = *(uint64_t *) (src+0*spitch+7*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
                  uint64_t src5b = *(uint64_t *) (src+1*spitch+4*BYTES_PER_UINT64_T);
                  uint64_t src6b = *(uint64_t *) (src+1*spitch+5*BYTES_PER_UINT64_T);
                  uint64_t src7b = *(uint64_t *) (src+1*spitch+6*BYTES_PER_UINT64_T);
                  uint64_t src8b = *(uint64_t *) (src+1*spitch+7*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
                  *(uint64_t *) (dst+0*dpitch+4*BYTES_PER_UINT64_T)  = src5a;
                  *(uint64_t *) (dst+0*dpitch+5*BYTES_PER_UINT64_T)  = src6a;
                  *(uint64_t *) (dst+0*dpitch+6*BYTES_PER_UINT64_T)  = src7a;
                  *(uint64_t *) (dst+0*dpitch+7*BYTES_PER_UINT64_T)  = src8a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
                  *(uint64_t *) (dst+1*dpitch+4*BYTES_PER_UINT64_T)  = src5b;
                  *(uint64_t *) (dst+1*dpitch+5*BYTES_PER_UINT64_T)  = src6b;
                  *(uint64_t *) (dst+1*dpitch+6*BYTES_PER_UINT64_T)  = src7b;
                  *(uint64_t *) (dst+1*dpitch+7*BYTES_PER_UINT64_T)  = src8b;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               else {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 3*8*BYTES_PER_UINT16_T, src + 3*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 2*8*BYTES_PER_UINT16_T, src + 2*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 1*8*BYTES_PER_UINT16_T, src + 1*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0*8*BYTES_PER_UINT16_T, src + 0*8*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               break;
      case 64: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src5a = vld1q_u32((uint32_t *)(src+0*spitch+4*BYTES_PER_UINT32X4_T));
                  uint32x4_t src6a = vld1q_u32((uint32_t *)(src+0*spitch+5*BYTES_PER_UINT32X4_T));
                  uint32x4_t src7a = vld1q_u32((uint32_t *)(src+0*spitch+6*BYTES_PER_UINT32X4_T));
                  uint32x4_t src8a = vld1q_u32((uint32_t *)(src+0*spitch+7*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src5b = vld1q_u32((uint32_t *)(src+1*spitch+4*BYTES_PER_UINT32X4_T));
                  uint32x4_t src6b = vld1q_u32((uint32_t *)(src+1*spitch+5*BYTES_PER_UINT32X4_T));
                  uint32x4_t src7b = vld1q_u32((uint32_t *)(src+1*spitch+6*BYTES_PER_UINT32X4_T));
                  uint32x4_t src8b = vld1q_u32((uint32_t *)(src+1*spitch+7*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+4*BYTES_PER_UINT32X4_T),src5a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+5*BYTES_PER_UINT32X4_T),src6a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+6*BYTES_PER_UINT32X4_T),src7a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+7*BYTES_PER_UINT32X4_T),src8a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+4*BYTES_PER_UINT32X4_T),src5b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+5*BYTES_PER_UINT32X4_T),src6b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+6*BYTES_PER_UINT32X4_T),src7b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+7*BYTES_PER_UINT32X4_T),src8b);
                  return TRUE;
               }//HERE
               else if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,4*BYTES_PER_UINT16_T)) {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                             src + 0,                             4,    xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T,   src + (0*16+4)*BYTES_PER_UINT16_T,   2*16, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T,   src + (2*16+4)*BYTES_PER_UINT16_T,   16,   xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T,   src + (3*16+4)*BYTES_PER_UINT16_T,   8,    xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4,    xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (8+3*16+4)*BYTES_PER_UINT16_T, src + (8+3*16+4)*BYTES_PER_UINT16_T, 4,    xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*16+4)*BYTES_PER_UINT16_T,   src + (3*16+4)*BYTES_PER_UINT16_T,   8,    xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*16+4)*BYTES_PER_UINT16_T,   src + (2*16+4)*BYTES_PER_UINT16_T,   16,   xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*16+4)*BYTES_PER_UINT16_T,   src + (0*16+4)*BYTES_PER_UINT16_T,   2*16, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                             src + 0,                             4,    xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+2)*BYTES_PER_UINT16_T, src + (7*8+2)*BYTES_PER_UINT16_T, 6, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+2)*BYTES_PER_UINT16_T, src + (6*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+2)*BYTES_PER_UINT16_T, src + (5*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+2)*BYTES_PER_UINT16_T, src + (4*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+2)*BYTES_PER_UINT16_T, src + (3*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+2)*BYTES_PER_UINT16_T, src + (2*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+2)*BYTES_PER_UINT16_T, src + (1*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+2)*BYTES_PER_UINT16_T, src + (0*8+2)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 2, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (7*8+1)*BYTES_PER_UINT16_T, src + (7*8+1)*BYTES_PER_UINT16_T, 7, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (6*8+1)*BYTES_PER_UINT16_T, src + (6*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (5*8+1)*BYTES_PER_UINT16_T, src + (5*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (4*8+1)*BYTES_PER_UINT16_T, src + (4*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (3*8+1)*BYTES_PER_UINT16_T, src + (3*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (2*8+1)*BYTES_PER_UINT16_T, src + (2*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (1*8+1)*BYTES_PER_UINT16_T, src + (1*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + (0*8+1)*BYTES_PER_UINT16_T, src + (0*8+1)*BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 0,                          src + 0                         , 1, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               break;
   }

   return FALSE;
}


/* Copy four rows of 16bpp pixels, for fixed-size widths.                                */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */
static inline BOOL
swCopy16BppSmallFixedWidths4Rows_Unaligned(unsigned char *dst, unsigned char *src, int w, int xdir, int dpitch, int spitch)
{
   // Try to copy the following pixels using 16-bit alignment, or higher alignments if available.
   // Also, unroll loops as much as possible to prevent stores interfering with subsequent loads.
   switch(w) {
      // NOTE: Several callers of this code assume that all calls with w<=8 will succeed and return TRUE.
      case  0: return TRUE;
               break;
      case  1: {
                  uint16_t src1a  = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src1b  = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src1c  = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src1d  = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
                  return TRUE;
               }
               break;
      case  2: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a  = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src1b  = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src1c  = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src1d  = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
                  return TRUE;
               }
               else {
                  uint16_t src1a  = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a  = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1b  = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b  = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1c  = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2c  = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1d  = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2d  = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
                  return TRUE;
               }
               break;
      case  3: if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
                  uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
                  uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0)                     = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+1*dpitch+0)                     = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+2*dpitch+0)                     = src1c;
                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
                  *(uint32_t *) (dst+3*dpitch+0)                     = src1d;
                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1c = *(uint16_t *) (src+2*spitch);
                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src1d = *(uint16_t *) (src+3*spitch);
                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0)                     = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+1*dpitch+0)                     = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+2*dpitch+0)                     = src1c;
                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
                  *(uint16_t *) (dst+3*dpitch+0)                     = src1d;
                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
                  return TRUE;
               }
               else {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
                  *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T)  = src3c;
                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
                  *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T)  = src3d;
                  return TRUE;
               }
               break;
      case  4: if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
                  return TRUE;
               }
               else {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
                  *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T)  = src3c;
                  *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T)  = src4c;
                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
                  *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T)  = src3d;
                  *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T)  = src4d;
                  return TRUE;
               }
               break;
      case  5: if (xdir >= 0) {
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch);
               } else {
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 1, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
               }
               return TRUE;
               break;
     case  6: if (xdir >= 0)
               {
                  if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) {
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch);
                  }
               } else {
                  if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2 * BYTES_PER_UINT16_T)) {
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 2 * BYTES_PER_UINT16_T, src + 2 * BYTES_PER_UINT16_T, 4, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 2, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 2, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  }
               }
               return TRUE;
               break;
      case  7: if (xdir >= 0) {
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
               } else {
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 4 * BYTES_PER_UINT16_T, src + 4 * BYTES_PER_UINT16_T, 3, xdir, dpitch, spitch);
                  swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 4, xdir, dpitch, spitch);
               }
               return TRUE;
               break;
      // TODO: Add more alignment checks for 8 pixel-wide cases for performance reasons?
      //       For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,DOUBLE_WORD_ALIGNMENT_BYTE_SIZE/2)) and related half-aligned cases...
      case  8: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
                  *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T)  = src2c;
                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
                  *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T)  = src2d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
                  *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T)  = src3c;
                  *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T)  = src4c;
                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
                  *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T)  = src3d;
                  *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T)  = src4d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint16_t src5a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint16_t src5b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
                  uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint16_t src5c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
                  uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint16_t src5d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
                  *(uint16_t *) (dst+2*dpitch+0)                                        = src1c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4c;
                  *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5c;
                  *(uint16_t *) (dst+3*dpitch+0)                                        = src1d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4d;
                  *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5d;
                  return TRUE;
               }
               else {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2a = *(uint16_t *) (src+0*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3a = *(uint16_t *) (src+0*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4a = *(uint16_t *) (src+0*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src5a = *(uint16_t *) (src+0*spitch+4*BYTES_PER_UINT16_T);
                  uint16_t src6a = *(uint16_t *) (src+0*spitch+5*BYTES_PER_UINT16_T);
                  uint16_t src7a = *(uint16_t *) (src+0*spitch+6*BYTES_PER_UINT16_T);
                  uint16_t src8a = *(uint16_t *) (src+0*spitch+7*BYTES_PER_UINT16_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2b = *(uint16_t *) (src+1*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3b = *(uint16_t *) (src+1*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4b = *(uint16_t *) (src+1*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src5b = *(uint16_t *) (src+1*spitch+4*BYTES_PER_UINT16_T);
                  uint16_t src6b = *(uint16_t *) (src+1*spitch+5*BYTES_PER_UINT16_T);
                  uint16_t src7b = *(uint16_t *) (src+1*spitch+6*BYTES_PER_UINT16_T);
                  uint16_t src8b = *(uint16_t *) (src+1*spitch+7*BYTES_PER_UINT16_T);
                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2c = *(uint16_t *) (src+2*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3c = *(uint16_t *) (src+2*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4c = *(uint16_t *) (src+2*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src5c = *(uint16_t *) (src+2*spitch+4*BYTES_PER_UINT16_T);
                  uint16_t src6c = *(uint16_t *) (src+2*spitch+5*BYTES_PER_UINT16_T);
                  uint16_t src7c = *(uint16_t *) (src+2*spitch+6*BYTES_PER_UINT16_T);
                  uint16_t src8c = *(uint16_t *) (src+2*spitch+7*BYTES_PER_UINT16_T);
                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0*BYTES_PER_UINT16_T);
                  uint16_t src2d = *(uint16_t *) (src+3*spitch+1*BYTES_PER_UINT16_T);
                  uint16_t src3d = *(uint16_t *) (src+3*spitch+2*BYTES_PER_UINT16_T);
                  uint16_t src4d = *(uint16_t *) (src+3*spitch+3*BYTES_PER_UINT16_T);
                  uint16_t src5d = *(uint16_t *) (src+3*spitch+4*BYTES_PER_UINT16_T);
                  uint16_t src6d = *(uint16_t *) (src+3*spitch+5*BYTES_PER_UINT16_T);
                  uint16_t src7d = *(uint16_t *) (src+3*spitch+6*BYTES_PER_UINT16_T);
                  uint16_t src8d = *(uint16_t *) (src+3*spitch+7*BYTES_PER_UINT16_T);
                  *(uint16_t *) (dst+0*dpitch+0*BYTES_PER_UINT16_T)  = src1a;
                  *(uint16_t *) (dst+0*dpitch+1*BYTES_PER_UINT16_T)  = src2a;
                  *(uint16_t *) (dst+0*dpitch+2*BYTES_PER_UINT16_T)  = src3a;
                  *(uint16_t *) (dst+0*dpitch+3*BYTES_PER_UINT16_T)  = src4a;
                  *(uint16_t *) (dst+0*dpitch+4*BYTES_PER_UINT16_T)  = src5a;
                  *(uint16_t *) (dst+0*dpitch+5*BYTES_PER_UINT16_T)  = src6a;
                  *(uint16_t *) (dst+0*dpitch+6*BYTES_PER_UINT16_T)  = src7a;
                  *(uint16_t *) (dst+0*dpitch+7*BYTES_PER_UINT16_T)  = src8a;
                  *(uint16_t *) (dst+1*dpitch+0*BYTES_PER_UINT16_T)  = src1b;
                  *(uint16_t *) (dst+1*dpitch+1*BYTES_PER_UINT16_T)  = src2b;
                  *(uint16_t *) (dst+1*dpitch+2*BYTES_PER_UINT16_T)  = src3b;
                  *(uint16_t *) (dst+1*dpitch+3*BYTES_PER_UINT16_T)  = src4b;
                  *(uint16_t *) (dst+1*dpitch+4*BYTES_PER_UINT16_T)  = src5b;
                  *(uint16_t *) (dst+1*dpitch+5*BYTES_PER_UINT16_T)  = src6b;
                  *(uint16_t *) (dst+1*dpitch+6*BYTES_PER_UINT16_T)  = src7b;
                  *(uint16_t *) (dst+1*dpitch+7*BYTES_PER_UINT16_T)  = src8b;
                  *(uint16_t *) (dst+2*dpitch+0*BYTES_PER_UINT16_T)  = src1c;
                  *(uint16_t *) (dst+2*dpitch+1*BYTES_PER_UINT16_T)  = src2c;
                  *(uint16_t *) (dst+2*dpitch+2*BYTES_PER_UINT16_T)  = src3c;
                  *(uint16_t *) (dst+2*dpitch+3*BYTES_PER_UINT16_T)  = src4c;
                  *(uint16_t *) (dst+2*dpitch+4*BYTES_PER_UINT16_T)  = src5c;
                  *(uint16_t *) (dst+2*dpitch+5*BYTES_PER_UINT16_T)  = src6c;
                  *(uint16_t *) (dst+2*dpitch+6*BYTES_PER_UINT16_T)  = src7c;
                  *(uint16_t *) (dst+2*dpitch+7*BYTES_PER_UINT16_T)  = src8c;
                  *(uint16_t *) (dst+3*dpitch+0*BYTES_PER_UINT16_T)  = src1d;
                  *(uint16_t *) (dst+3*dpitch+1*BYTES_PER_UINT16_T)  = src2d;
                  *(uint16_t *) (dst+3*dpitch+2*BYTES_PER_UINT16_T)  = src3d;
                  *(uint16_t *) (dst+3*dpitch+3*BYTES_PER_UINT16_T)  = src4d;
                  *(uint16_t *) (dst+3*dpitch+4*BYTES_PER_UINT16_T)  = src5d;
                  *(uint16_t *) (dst+3*dpitch+5*BYTES_PER_UINT16_T)  = src6d;
                  *(uint16_t *) (dst+3*dpitch+6*BYTES_PER_UINT16_T)  = src7d;
                  *(uint16_t *) (dst+3*dpitch+7*BYTES_PER_UINT16_T)  = src8d;
                  return TRUE;
               }
               break;
      case 16: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint64_t src1a = *(uint64_t *) (src+0*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3a = *(uint64_t *) (src+0*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4a = *(uint64_t *) (src+0*spitch+3*BYTES_PER_UINT64_T);
                  uint64_t src1b = *(uint64_t *) (src+1*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3b = *(uint64_t *) (src+1*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4b = *(uint64_t *) (src+1*spitch+3*BYTES_PER_UINT64_T);
                  uint64_t src1c = *(uint64_t *) (src+2*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2c = *(uint64_t *) (src+2*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3c = *(uint64_t *) (src+2*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4c = *(uint64_t *) (src+2*spitch+3*BYTES_PER_UINT64_T);
                  uint64_t src1d = *(uint64_t *) (src+3*spitch+0*BYTES_PER_UINT64_T);
                  uint64_t src2d = *(uint64_t *) (src+3*spitch+1*BYTES_PER_UINT64_T);
                  uint64_t src3d = *(uint64_t *) (src+3*spitch+2*BYTES_PER_UINT64_T);
                  uint64_t src4d = *(uint64_t *) (src+3*spitch+3*BYTES_PER_UINT64_T);
                  *(uint64_t *) (dst+0*dpitch+0*BYTES_PER_UINT64_T)  = src1a;
                  *(uint64_t *) (dst+0*dpitch+1*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+0*dpitch+2*BYTES_PER_UINT64_T)  = src3a;
                  *(uint64_t *) (dst+0*dpitch+3*BYTES_PER_UINT64_T)  = src4a;
                  *(uint64_t *) (dst+1*dpitch+0*BYTES_PER_UINT64_T)  = src1b;
                  *(uint64_t *) (dst+1*dpitch+1*BYTES_PER_UINT64_T)  = src2b;
                  *(uint64_t *) (dst+1*dpitch+2*BYTES_PER_UINT64_T)  = src3b;
                  *(uint64_t *) (dst+1*dpitch+3*BYTES_PER_UINT64_T)  = src4b;
                  *(uint64_t *) (dst+2*dpitch+0*BYTES_PER_UINT64_T)  = src1c;
                  *(uint64_t *) (dst+2*dpitch+1*BYTES_PER_UINT64_T)  = src2c;
                  *(uint64_t *) (dst+2*dpitch+2*BYTES_PER_UINT64_T)  = src3c;
                  *(uint64_t *) (dst+2*dpitch+3*BYTES_PER_UINT64_T)  = src4c;
                  *(uint64_t *) (dst+3*dpitch+0*BYTES_PER_UINT64_T)  = src1d;
                  *(uint64_t *) (dst+3*dpitch+1*BYTES_PER_UINT64_T)  = src2d;
                  *(uint64_t *) (dst+3*dpitch+2*BYTES_PER_UINT64_T)  = src3d;
                  *(uint64_t *) (dst+3*dpitch+3*BYTES_PER_UINT64_T)  = src4d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,2*BYTES_PER_UINT16_T)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0);
                  uint64_t src2a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
                  uint64_t src3a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
                  uint64_t src4a = *(uint64_t *) (src+0*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0);
                  uint64_t src2b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
                  uint64_t src3b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
                  uint64_t src4b = *(uint64_t *) (src+1*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0);
                  uint64_t src2c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
                  uint64_t src3c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
                  uint64_t src4c = *(uint64_t *) (src+2*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
                  uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0);
                  uint64_t src2d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T);
                  uint64_t src3d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T);
                  uint64_t src4d = *(uint64_t *) (src+3*spitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T);
                  uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T);
                  *(uint32_t *) (dst+0*dpitch+0)                                        = src1a;
                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2a;
                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3a;
                  *(uint64_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5a;
                  *(uint32_t *) (dst+1*dpitch+0)                                        = src1b;
                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2b;
                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3b;
                  *(uint64_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5b;
                  *(uint32_t *) (dst+2*dpitch+0)                                        = src1c;
                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2c;
                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3c;
                  *(uint64_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5c;
                  *(uint32_t *) (dst+3*dpitch+0)                                        = src1d;
                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+0*BYTES_PER_UINT64_T)  = src2d;
                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+1*BYTES_PER_UINT64_T)  = src3d;
                  *(uint64_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+2*BYTES_PER_UINT64_T)  = src4d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT32_T+3*BYTES_PER_UINT64_T)  = src5d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32_t src1a = *(uint32_t *) (src+0*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src5a = *(uint32_t *) (src+0*spitch+4*BYTES_PER_UINT32_T);
                  uint32_t src6a = *(uint32_t *) (src+0*spitch+5*BYTES_PER_UINT32_T);
                  uint32_t src7a = *(uint32_t *) (src+0*spitch+6*BYTES_PER_UINT32_T);
                  uint32_t src8a = *(uint32_t *) (src+0*spitch+7*BYTES_PER_UINT32_T);
                  uint32_t src1b = *(uint32_t *) (src+1*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src5b = *(uint32_t *) (src+1*spitch+4*BYTES_PER_UINT32_T);
                  uint32_t src6b = *(uint32_t *) (src+1*spitch+5*BYTES_PER_UINT32_T);
                  uint32_t src7b = *(uint32_t *) (src+1*spitch+6*BYTES_PER_UINT32_T);
                  uint32_t src8b = *(uint32_t *) (src+1*spitch+7*BYTES_PER_UINT32_T);
                  uint32_t src1c = *(uint32_t *) (src+2*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2c = *(uint32_t *) (src+2*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3c = *(uint32_t *) (src+2*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4c = *(uint32_t *) (src+2*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src5c = *(uint32_t *) (src+2*spitch+4*BYTES_PER_UINT32_T);
                  uint32_t src6c = *(uint32_t *) (src+2*spitch+5*BYTES_PER_UINT32_T);
                  uint32_t src7c = *(uint32_t *) (src+2*spitch+6*BYTES_PER_UINT32_T);
                  uint32_t src8c = *(uint32_t *) (src+2*spitch+7*BYTES_PER_UINT32_T);
                  uint32_t src1d = *(uint32_t *) (src+3*spitch+0*BYTES_PER_UINT32_T);
                  uint32_t src2d = *(uint32_t *) (src+3*spitch+1*BYTES_PER_UINT32_T);
                  uint32_t src3d = *(uint32_t *) (src+3*spitch+2*BYTES_PER_UINT32_T);
                  uint32_t src4d = *(uint32_t *) (src+3*spitch+3*BYTES_PER_UINT32_T);
                  uint32_t src5d = *(uint32_t *) (src+3*spitch+4*BYTES_PER_UINT32_T);
                  uint32_t src6d = *(uint32_t *) (src+3*spitch+5*BYTES_PER_UINT32_T);
                  uint32_t src7d = *(uint32_t *) (src+3*spitch+6*BYTES_PER_UINT32_T);
                  uint32_t src8d = *(uint32_t *) (src+3*spitch+7*BYTES_PER_UINT32_T);
                  *(uint32_t *) (dst+0*dpitch+0*BYTES_PER_UINT32_T)  = src1a;
                  *(uint32_t *) (dst+0*dpitch+1*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+2*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+3*BYTES_PER_UINT32_T)  = src4a;
                  *(uint32_t *) (dst+0*dpitch+4*BYTES_PER_UINT32_T)  = src5a;
                  *(uint32_t *) (dst+0*dpitch+5*BYTES_PER_UINT32_T)  = src6a;
                  *(uint32_t *) (dst+0*dpitch+6*BYTES_PER_UINT32_T)  = src7a;
                  *(uint32_t *) (dst+0*dpitch+7*BYTES_PER_UINT32_T)  = src8a;
                  *(uint32_t *) (dst+1*dpitch+0*BYTES_PER_UINT32_T)  = src1b;
                  *(uint32_t *) (dst+1*dpitch+1*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+2*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+3*BYTES_PER_UINT32_T)  = src4b;
                  *(uint32_t *) (dst+1*dpitch+4*BYTES_PER_UINT32_T)  = src5b;
                  *(uint32_t *) (dst+1*dpitch+5*BYTES_PER_UINT32_T)  = src6b;
                  *(uint32_t *) (dst+1*dpitch+6*BYTES_PER_UINT32_T)  = src7b;
                  *(uint32_t *) (dst+1*dpitch+7*BYTES_PER_UINT32_T)  = src8b;
                  *(uint32_t *) (dst+2*dpitch+0*BYTES_PER_UINT32_T)  = src1c;
                  *(uint32_t *) (dst+2*dpitch+1*BYTES_PER_UINT32_T)  = src2c;
                  *(uint32_t *) (dst+2*dpitch+2*BYTES_PER_UINT32_T)  = src3c;
                  *(uint32_t *) (dst+2*dpitch+3*BYTES_PER_UINT32_T)  = src4c;
                  *(uint32_t *) (dst+2*dpitch+4*BYTES_PER_UINT32_T)  = src5c;
                  *(uint32_t *) (dst+2*dpitch+5*BYTES_PER_UINT32_T)  = src6c;
                  *(uint32_t *) (dst+2*dpitch+6*BYTES_PER_UINT32_T)  = src7c;
                  *(uint32_t *) (dst+2*dpitch+7*BYTES_PER_UINT32_T)  = src8c;
                  *(uint32_t *) (dst+3*dpitch+0*BYTES_PER_UINT32_T)  = src1d;
                  *(uint32_t *) (dst+3*dpitch+1*BYTES_PER_UINT32_T)  = src2d;
                  *(uint32_t *) (dst+3*dpitch+2*BYTES_PER_UINT32_T)  = src3d;
                  *(uint32_t *) (dst+3*dpitch+3*BYTES_PER_UINT32_T)  = src4d;
                  *(uint32_t *) (dst+3*dpitch+4*BYTES_PER_UINT32_T)  = src5d;
                  *(uint32_t *) (dst+3*dpitch+5*BYTES_PER_UINT32_T)  = src6d;
                  *(uint32_t *) (dst+3*dpitch+6*BYTES_PER_UINT32_T)  = src7d;
                  *(uint32_t *) (dst+3*dpitch+7*BYTES_PER_UINT32_T)  = src8d;
                  return TRUE;
               }
               else if (SW_CHECK_PITCHED_ALIGNMENT(WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,BYTES_PER_UINT16_T)) {
                  uint16_t src1a = *(uint16_t *) (src+0*spitch+0);
                  uint32_t src2a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint32_t src5a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint32_t src6a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
                  uint32_t src7a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
                  uint32_t src8a = *(uint32_t *) (src+0*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
                  uint16_t src9a = *(uint16_t *) (src+0*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
                  uint16_t src1b = *(uint16_t *) (src+1*spitch+0);
                  uint32_t src2b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint32_t src5b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint32_t src6b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
                  uint32_t src7b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
                  uint32_t src8b = *(uint32_t *) (src+1*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
                  uint16_t src9b = *(uint16_t *) (src+1*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
                  uint16_t src1c = *(uint16_t *) (src+2*spitch+0);
                  uint32_t src2c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint32_t src5c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint32_t src6c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
                  uint32_t src7c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
                  uint32_t src8c = *(uint32_t *) (src+2*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
                  uint16_t src9c = *(uint16_t *) (src+2*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
                  uint16_t src1d = *(uint16_t *) (src+3*spitch+0);
                  uint32_t src2d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T);
                  uint32_t src3d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T);
                  uint32_t src4d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T);
                  uint32_t src5d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T);
                  uint32_t src6d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T);
                  uint32_t src7d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T);
                  uint32_t src8d = *(uint32_t *) (src+3*spitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T);
                  uint16_t src9d = *(uint16_t *) (src+3*spitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T);
                  *(uint16_t *) (dst+0*dpitch+0)                                        = src1a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7a;
                  *(uint32_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8a;
                  *(uint16_t *) (dst+0*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9a;
                  *(uint16_t *) (dst+1*dpitch+0)                                        = src1b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7b;
                  *(uint32_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8b;
                  *(uint16_t *) (dst+1*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9b;
                  *(uint16_t *) (dst+2*dpitch+0)                                        = src1c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7c;
                  *(uint32_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8c;
                  *(uint16_t *) (dst+2*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9c;
                  *(uint16_t *) (dst+3*dpitch+0)                                        = src1d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+0*BYTES_PER_UINT32_T)  = src2d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+1*BYTES_PER_UINT32_T)  = src3d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+2*BYTES_PER_UINT32_T)  = src4d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+3*BYTES_PER_UINT32_T)  = src5d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+4*BYTES_PER_UINT32_T)  = src6d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+5*BYTES_PER_UINT32_T)  = src7d;
                  *(uint32_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+6*BYTES_PER_UINT32_T)  = src8d;
                  *(uint16_t *) (dst+3*dpitch+BYTES_PER_UINT16_T+7*BYTES_PER_UINT32_T)  = src9d;
                  return TRUE;
               }
               else {
                  // Don't bother unrolling loops, since that won't help for more than around 8 operations.
                  // Instead, just call multiple fixed functions.
                  if (xdir >= 0) {
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                  } else {
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, 8, xdir, dpitch, spitch);
                     swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch);
                  }
                  return TRUE;
               }
               break;
      // TODO: Add more alignment checks for 32 pixel-wide cases for performance reasons?
      //       For example, handling (SW_CHECK_PITCHED_ALIGNMENT(DOUBLE_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,XXX)) and related cases could make a big difference here...
      case 32: if (SW_CHECK_PITCHED_ALIGNMENT(QUAD_WORD_ALIGNMENT_BYTE_SIZE,dst,src,dpitch,spitch,0)) {
                  uint32x4_t src1a = vld1q_u32((uint32_t *)(src+0*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2a = vld1q_u32((uint32_t *)(src+0*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3a = vld1q_u32((uint32_t *)(src+0*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4a = vld1q_u32((uint32_t *)(src+0*spitch+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1b = vld1q_u32((uint32_t *)(src+1*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2b = vld1q_u32((uint32_t *)(src+1*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3b = vld1q_u32((uint32_t *)(src+1*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4b = vld1q_u32((uint32_t *)(src+1*spitch+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1c = vld1q_u32((uint32_t *)(src+2*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2c = vld1q_u32((uint32_t *)(src+2*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3c = vld1q_u32((uint32_t *)(src+2*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4c = vld1q_u32((uint32_t *)(src+2*spitch+3*BYTES_PER_UINT32X4_T));
                  uint32x4_t src1d = vld1q_u32((uint32_t *)(src+3*spitch+0*BYTES_PER_UINT32X4_T));
                  uint32x4_t src2d = vld1q_u32((uint32_t *)(src+3*spitch+1*BYTES_PER_UINT32X4_T));
                  uint32x4_t src3d = vld1q_u32((uint32_t *)(src+3*spitch+2*BYTES_PER_UINT32X4_T));
                  uint32x4_t src4d = vld1q_u32((uint32_t *)(src+3*spitch+3*BYTES_PER_UINT32X4_T));
                  vst1q_u32((uint32_t *)(dst+0*dpitch+0*BYTES_PER_UINT32X4_T),src1a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+1*BYTES_PER_UINT32X4_T),src2a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+2*BYTES_PER_UINT32X4_T),src3a);
                  vst1q_u32((uint32_t *)(dst+0*dpitch+3*BYTES_PER_UINT32X4_T),src4a);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+0*BYTES_PER_UINT32X4_T),src1b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+1*BYTES_PER_UINT32X4_T),src2b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+2*BYTES_PER_UINT32X4_T),src3b);
                  vst1q_u32((uint32_t *)(dst+1*dpitch+3*BYTES_PER_UINT32X4_T),src4b);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+0*BYTES_PER_UINT32X4_T),src1c);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+1*BYTES_PER_UINT32X4_T),src2c);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+2*BYTES_PER_UINT32X4_T),src3c);
                  vst1q_u32((uint32_t *)(dst+2*dpitch+3*BYTES_PER_UINT32X4_T),src4c);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+0*BYTES_PER_UINT32X4_T),src1d);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+1*BYTES_PER_UINT32X4_T),src2d);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+2*BYTES_PER_UINT32X4_T),src3d);
                  vst1q_u32((uint32_t *)(dst+3*dpitch+3*BYTES_PER_UINT32X4_T),src4d);
                  return TRUE;
               }
               break;
   }

   return FALSE;
}


/*
 * Copy a run of rows using a single caller-selected row-copy routine for
 * every row; no narrow fixed-width fast paths are attempted.
 *
 * The macro expands in a scope that must provide:
 *    h               - rows still to copy; decremented as rows are handled and
 *                      left below ONE_ROW when the macro finishes.
 *    dst, src        - current destination/source row pointers; advanced past
 *                      every row that was copied.
 *    w               - row width in 16-bit units (each row copies
 *                      w * BYTES_PER_UINT16_T bytes).
 *    dpitch, spitch  - amount added to dst/src to step one row.
 *
 * Macro arguments:
 *    COPY_ROW(d, s, nbytes) - copies one row.
 *    BLOCK_SIGNALS()        - invoked before a batch of copies (intended to
 *                             mask signals ahead of VFP operations).
 *    UNBLOCK_SIGNALS()      - invoked after a batch of copies.
 *
 * Each four-row group is bracketed by its own BLOCK/UNBLOCK pair; the
 * trailing two-row and one-row loops share a single pair between them.
 */
#define DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(COPY_ROW,BLOCK_SIGNALS,UNBLOCK_SIGNALS) \
do { \
   /* Four-row groups: signals are blocked only for the duration of one group. */ \
   for (; h >= FOUR_ROWS; dst += FOUR_ROWS * dpitch, src += FOUR_ROWS * spitch) { \
      h -= FOUR_ROWS; \
\
      BLOCK_SIGNALS(); \
      COPY_ROW(dst,              src,              w * BYTES_PER_UINT16_T); \
      COPY_ROW(dst + dpitch,     src + spitch,     w * BYTES_PER_UINT16_T); \
      COPY_ROW(dst + 2 * dpitch, src + 2 * spitch, w * BYTES_PER_UINT16_T); \
      COPY_ROW(dst + 3 * dpitch, src + 3 * spitch, w * BYTES_PER_UINT16_T); \
      UNBLOCK_SIGNALS(); \
   } \
\
   /* The leftover rows are all copied under one blocked section. */ \
   BLOCK_SIGNALS(); \
\
   /* Two-row groups. */ \
   for (; h >= TWO_ROWS; dst += TWO_ROWS * dpitch, src += TWO_ROWS * spitch) { \
      h -= TWO_ROWS; \
\
      COPY_ROW(dst,          src,          w * BYTES_PER_UINT16_T); \
      COPY_ROW(dst + dpitch, src + spitch, w * BYTES_PER_UINT16_T); \
   } \
\
   /* Single remaining rows. */ \
   for (; h >= ONE_ROW; dst += ONE_ROW * dpitch, src += ONE_ROW * spitch) { \
      h -= ONE_ROW; \
\
      COPY_ROW(dst, src, w * BYTES_PER_UINT16_T); \
   } \
\
   UNBLOCK_SIGNALS(); \
\
} while (0)


/* Draw multiple rows with a specific memory copy. */
#define DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(MEMCPY_FUNCTION,BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \
do { \
   /* Draw four rows at a time, in the most efficient way. */ \
   while (h >= FOUR_ROWS) { \
      h -= FOUR_ROWS; \
\
      BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for four rows at a time. */ \
      if (swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \
         ; \
      else \
      { \
         MEMCPY_FUNCTION(dst + 0*dpitch, src + 0*spitch, w * BYTES_PER_UINT16_T); \
         MEMCPY_FUNCTION(dst + 1*dpitch, src + 1*spitch, w * BYTES_PER_UINT16_T); \
         MEMCPY_FUNCTION(dst + 2*dpitch, src + 2*spitch, w * BYTES_PER_UINT16_T); \
         MEMCPY_FUNCTION(dst + 3*dpitch, src + 3*spitch, w * BYTES_PER_UINT16_T); \
      } \
\
     UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
      dst += FOUR_ROWS * dpitch; \
      src += FOUR_ROWS * spitch; \
   } \
\
   BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
   /* Draw two rows at a time, in the most efficient way. */ \
   while (h >= TWO_ROWS) { \
      h -= TWO_ROWS; \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \
      if (swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \
         ; \
      else \
      { \
         MEMCPY_FUNCTION(dst + 0*dpitch, src + 0*spitch, w * BYTES_PER_UINT16_T); \
         MEMCPY_FUNCTION(dst + 1*dpitch, src + 1*spitch, w * BYTES_PER_UINT16_T); \
      } \
\
      dst += TWO_ROWS * dpitch; \
      src += TWO_ROWS * spitch; \
   } \
\
   /* Draw one row at a time, in the most efficient way. */ \
   while (h >= ONE_ROW) { \
      h -= ONE_ROW; \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \
      if (swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, w, xdir)) \
         ; \
      else \
      { \
         MEMCPY_FUNCTION(dst, src, w * BYTES_PER_UINT16_T); \
      } \
\
      dst += ONE_ROW * dpitch; \
      src += ONE_ROW * spitch; \
   } \
\
   UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
} while (0)


/* Draw multiple rows with small fixed width functions in a positive X direction. */
#define DRAW_MULTIPLE_ROWS_WITH_POSITIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \
do { \
   /* Draw four rows at a time, in the most efficient way. */ \
   while (h >= FOUR_ROWS) { \
      h -= FOUR_ROWS; \
\
      BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for four rows at a time. */ \
      if (swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \
         ; \
      else \
      { \
         /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \
         swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \
         swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \
      } \
\
      UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
      dst += FOUR_ROWS * dpitch; \
      src += FOUR_ROWS * spitch; \
   } \
\
   BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
   /* Draw two rows at a time, in the most efficient way. */ \
   while (h >= TWO_ROWS) { \
      h -= TWO_ROWS; \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \
      if (swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \
         ; \
      else \
      { \
         /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \
         swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \
         swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \
      } \
\
      dst += TWO_ROWS * dpitch; \
      src += TWO_ROWS * spitch; \
   } \
\
   /* Draw one row at a time, in the most efficient way. */ \
   while (h >= ONE_ROW) { \
      h -= ONE_ROW; \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \
      if (swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, w, xdir)) \
         ; \
      else \
      { \
         /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \
         swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); \
         swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir); \
      } \
\
      dst += ONE_ROW * dpitch; \
      src += ONE_ROW * spitch; \
   } \
\
   UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
} while (0)


/* Draw multiple rows with small fixed width functions in a negative X direction. */
#define DRAW_MULTIPLE_ROWS_WITH_NEGATIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS,UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS) \
do { \
   /* Draw four rows at a time, in the most efficient way. */ \
   while (h >= FOUR_ROWS) { \
      h -= FOUR_ROWS; \
\
      BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for four rows at a time. */ \
      if (swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \
         ; \
      else \
      { \
         /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \
         swCopy16BppSmallFixedWidths4Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \
         swCopy16BppSmallFixedWidths4Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \
      } \
\
      UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
      dst += FOUR_ROWS * dpitch; \
      src += FOUR_ROWS * spitch; \
   } \
\
   BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS(); \
\
   /* Draw two rows at a time, in the most efficient way. */ \
   while (h >= TWO_ROWS) { \
      h -= TWO_ROWS; \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \
      if (swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, w, xdir, dpitch, spitch)) \
         ; \
      else \
      { \
         /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \
         swCopy16BppSmallFixedWidths2Rows_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir, dpitch, spitch); \
         swCopy16BppSmallFixedWidths2Rows_Unaligned(dst, src, 8, xdir, dpitch, spitch); \
      } \
\
      dst += TWO_ROWS * dpitch; \
      src += TWO_ROWS * spitch; \
   } \
\
   /* Draw one row at a time, in the most efficient way. */ \
   while (h >= ONE_ROW) { \
      h -= ONE_ROW; \
\
      /* First, check if the blit can be done using unaligned fixed-size operations for two rows at a time. */ \
      if (swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, w, xdir)) \
         ; \
      else \
      { \
         /* NOTE: Assumes that all copies of less than 8 pixels succeed, so return values are not checked. */ \
         swCopy16BppSmallFixedWidths1Row_Unaligned(dst + 8 * BYTES_PER_UINT16_T, src + 8 * BYTES_PER_UINT16_T, w % 8, xdir); \
         swCopy16BppSmallFixedWidths1Row_Unaligned(dst, src, 8, xdir); \
      } \
\
      dst += ONE_ROW * dpitch; \
      src += ONE_ROW * spitch; \
   } \
\
   UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS(); \
\
} while (0)



/* Copy a rectangle of 16bpp pixels, choosing a row-copy strategy from the width.         */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.)  */
/*                                                                                        */
/*   w < 16  : small fixed-width copy functions, ordered by xdir.                         */
/*   w < 64  : memcpy()/memmove(), after first trying the small fixed-width functions.    */
/*   w == 64 : neon_memcpy()/neon_memmove(), after first trying the fixed-width functions.*/
/*   w > 64  : neon_memcpy()/neon_memmove() only.                                         */
/*                                                                                        */
/* 'w' is in 16bpp pixels, 'h' in rows; 'dpitch'/'spitch' are the signed byte strides.    */
/* 'ydir' is not read by the code visible in this function.                               */

static inline void
swCopyRect16BppFixedWidth_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP)
{
   // Ordering requirements that every path below has to respect:
   // (1) The memcpy() family (memcpy, neon_memcpy) may only be used when 'rowsOverlap' is false.
   //     (It is usually the faster family, so it is chosen whenever that is allowed.)
   // (2) The memmove() family (memmove, neon_memmove) *MUST* be used when 'rowsOverlap' is true.
   // (3) Per-row copies of a multi-row blit must proceed in ydir order.  Because the signs of spitch and dpitch
   //     already encode the direction, stepping the addresses by spitch/dpitch is enough to get that order.
   // (4) A single call to any of the swCopy16BppSmallFixedWidths{1Row,2Rows,4Rows}_Unaligned() functions is valid
   //     for every combination of xdir and ydir.
   // (5) Notwithstanding (4), when one row is covered by several such calls, they must be issued
   //     left-to-right for xdir=1 and right-to-left for xdir=-1.
   // Breaking these rules may not produce visibly wrong output, but obscure corner cases of full compliance tests will likely fail.

   // Row-batch sizes referenced by name from inside the DRAW_MULTIPLE_ROWS_* macros.
   const int ONE_ROW = 1;
   const int TWO_ROWS = 2;
   const int FOUR_ROWS = 4;

   // Narrow rectangles -- including reverse (buffered) copies -- go straight to the fixed-width functions.
   if (w < 16) {
      if (xdir < 0) {
         DRAW_MULTIPLE_ROWS_WITH_NEGATIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
      }
      else {
         DRAW_MULTIPLE_ROWS_WITH_POSITIVE_XDIR(BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
      }
      return;
   }

   // Wide rectangles: only 'rowsOverlap' matters for picking between the memmove() and memcpy() families.
   // NOTE: memcpy() is generally faster than neon_memcpy() up to about 128 bytes and specialized draw functions are faster for up to 32 bytes.
   if (w > 64) {
      if (rowsOverlap) {
         DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
      }
      else {
         DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY_NO_NARROW_COPIES(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
      }
   }
   else if (w == 64) {
      if (rowsOverlap) {
         DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memmove, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
      }
      else {
         DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(neon_memcpy, BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS, UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS);
      }
   }
   else {
      // Plain libc copies: no signal blocking is requested around them.
      if (rowsOverlap) {
         DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memmove, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
      }
      else {
         DRAW_MULTIPLE_ROWS_WITH_MEMORY_COPY(memcpy, SIGNAL_BLOCK_NOOP, SIGNAL_BLOCK_NOOP);
      }
   }
}


/* Copy a rectangle of 8bpp pixels, one row per iteration. (No alignment assumed.) */
/* NOTE: This is probably only needed for 12bpp planar video copies.               */
/*                                                                                 */
/*   dst, src       : first byte of the first row to write / read.                 */
/*   w, h           : width in pixels (== bytes) and number of rows.               */
/*   dpitch, spitch : signed byte strides added to dst/src after each row.         */
/*   rowsOverlap    : TRUE when a source row and its destination row share memory; */
/*                    the memmove() family is then mandatory.                      */
/*   xdir, ydir     : not needed here; memmove() handles either X direction and    */
/*                    the Y direction is already encoded in the pitch signs.       */
/*   blockSignalsForVFP : not read directly by this function.                      */
/*                    NOTE(review): presumably consumed by the BLOCK_SIGNALS_*     */
/*                    macros -- confirm against their definition.                  */

static inline void
swCopyRect8Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP)
{
   const int ONE_ROW = 1;

   // Nothing to copy; also keeps a negative width from being converted to a huge byte count.
   if (w <= 0)
      return;

   while (h > 0) {
      h -= ONE_ROW;

      // NOTE: memcpy() is generally faster than neon_memcpy() up to about 128 bytes.
      if (!rowsOverlap) {
         // Source and destination rows are disjoint, so the (faster) memcpy() family is safe.
         if (w >= 128) {
            BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
            neon_memcpy(dst, src, w);
            UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
         }
         else {
            memcpy(dst, src, w);
         }
      }
      else {
         // Rows overlap in memory: memcpy() would be undefined behaviour regardless of xdir,
         // so always use the memmove() family here.
         if (w >= 128) {
            BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
            neon_memmove(dst, src, w);
            UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
         }
         else {
            memmove(dst, src, w);
         }
      }

      dst += dpitch;
      src += spitch;
   }
}


/* Copy a rectangle of 16bpp pixels.                                                     */
/* (Pointers are assumed to be half-word-aligned, which should be guaranteed for 16bpp.) */
/* A one-pixel-wide column is routed to a dedicated helper; every other width goes to    */
/* swCopyRect16BppFixedWidth_Unaligned() with all arguments passed through unchanged.    */

static inline void
swCopyRect16Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP)
{
   if (w != 1) {
      // General case: width-driven strategy selection.
      swCopyRect16BppFixedWidth_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
   }
   else {
      // Single-pixel-wide column.  The helper only needs half-word alignment, which is
      // already guaranteed here, so it can be called without any further checks.
      swCopy2ByteWideRectangle_HalfWordAligned(dst, src, h, dpitch, spitch);
   }
}


/* Copy a rectangle of 24bpp pixels, one row per iteration.                        */
/* (Pointers are not assumed to be aligned for 24bpp.)                             */
/*                                                                                 */
/*   dst, src       : first byte of the first row to write / read.                 */
/*   w, h           : width in pixels and number of rows; each row copies          */
/*                    w * BYTES_PER_24BPP_PIXEL bytes.                             */
/*   dpitch, spitch : signed byte strides added to dst/src after each row.         */
/*   rowsOverlap    : TRUE when a source row and its destination row share memory; */
/*                    the memmove() family is then mandatory.                      */
/*   xdir, ydir     : not needed here; memmove() handles either X direction and    */
/*                    the Y direction is already encoded in the pitch signs.       */
/*   blockSignalsForVFP : not read directly by this function.                      */
/*                    NOTE(review): presumably consumed by the BLOCK_SIGNALS_*     */
/*                    macros -- confirm against their definition.                  */

static inline void
swCopyRect24Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP)
{
   // TODO: Make optimized draws for fixed sizes?  Performance for this case is likely to be pretty poor.
   const int ONE_ROW = 1;

   // Nothing to copy; also keeps a negative width from being converted to a huge byte count.
   if (w <= 0)
      return;

   while (h > 0) {
      h -= ONE_ROW;

      // NOTE: memcpy() is generally faster than neon_memcpy() up to about 128 bytes.
      // (The 42-pixel threshold is presumably that limit expressed in 3-byte pixels: 42 * 3 = 126.)
      if (!rowsOverlap) {
         // Source and destination rows are disjoint, so the (faster) memcpy() family is safe.
         if (w >= 42) {
            BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
            neon_memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
            UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
         }
         else {
            memcpy(dst, src, w * BYTES_PER_24BPP_PIXEL);
         }
      }
      else {
         // Rows overlap in memory: memcpy() would be undefined behaviour regardless of xdir,
         // so always use the memmove() family here.
         if (w >= 42) {
            BLOCK_SIGNALS_BEFORE_VFP_OPERATIONS();
            neon_memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
            UNBLOCK_SIGNALS_AFTER_VFP_OPERATIONS();
         }
         else {
            memmove(dst, src, w * BYTES_PER_24BPP_PIXEL);
         }
      }

      dst += dpitch;
      src += spitch;
   }
}


/* Copy a rectangle of 32bpp pixels.                                                */
/* (Pointers are assumed to be word-aligned, which should be guaranteed for 32bpp.) */
/* Implemented by re-expressing the width in 16-bit units and delegating to the     */
/* 16bpp rectangle copy; all other arguments are forwarded unchanged.               */

static inline void
swCopyRect32Bpp_Unaligned(unsigned char *dst, unsigned char *src, int w, int h, int xdir, int ydir, int dpitch, int spitch, BOOL rowsOverlap, BOOL blockSignalsForVFP)
{
   // Row width measured in 16-bit units instead of 32-bit pixels.
   const int widthIn16BitUnits = w * BYTES_PER_UINT32_T / BYTES_PER_UINT16_T;

   // As a pretty good first pass at optimization, the 16bpp code draws the 32bpp rectangle.
   swCopyRect16Bpp_Unaligned(dst, src, widthIn16BitUnits, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
}


/* Perform a software blit.
 *
 * Copies the rectangle blit->srcRect of surface blit->src to the position blit->dstRect of surface
 * blit->dst; the copied size is taken from blit->dstRect (w, h).
 *
 *   pMsm  : driver state; pMsm->fbmem is the base address used for surfaces that are not GEM objects.
 *   blit  : source/destination surfaces and rectangles.  For a surface flagged MSM_BLIT_GEM, priv[0] is
 *           used as a 'struct msm_drm_bo *' that is mapped for the duration of the copy; otherwise
 *           priv[0] is used as an offset from pMsm->fbmem.
 *   bpp   : bits per pixel; 8, 16, 24 and 32 are handled, any other value copies nothing.
 *   blockSignalsForVFP : forwarded unchanged to the per-bpp copy functions.
 *
 * When isOverlap() reports overlapping rectangles, the copy direction is chosen so that source pixels are
 * read before they are overwritten.  Empty or negative-sized rectangles copy nothing.  Every buffer object
 * that was mapped is unmapped again on all exit paths.
 */

void swBlit(MSMPtr pMsm, MSMBlitRec *blit, int bpp, BOOL blockSignalsForVFP)
{
   int h = blit->dstRect->h;
   int w = blit->dstRect->w;

   int cpp = bpp / 8;

   int spitch = blit->src->pitch;
   int dpitch = blit->dst->pitch;

   uint8_t *srcLine;
   uint8_t *src;

   uint8_t *dstLine;
   uint8_t *dst;

   int src_x = blit->srcRect->x;
   int dst_x = blit->dstRect->x;

   int xdir = 1;
   int ydir = 1;

   // Set only when the first source and destination lines coincide; selects memmove()-style row copies.
   BOOL rowsOverlap = FALSE;

   // Resolve the base address of the source surface.
   if (blit->src->flags & MSM_BLIT_GEM) {
      struct msm_drm_bo *bo = (struct msm_drm_bo *) blit->src->priv[0];
      msm_drm_bo_map(bo);
      srcLine = (uint8_t *) bo->virt;
   }
   else {
      srcLine = (uint8_t *) pMsm->fbmem + blit->src->priv[0];
   }

   // Resolve the base address of the destination surface.
   if (blit->dst->flags & MSM_BLIT_GEM) {
      struct msm_drm_bo *bo = (struct msm_drm_bo *) blit->dst->priv[0];
      msm_drm_bo_map(bo);
      dstLine = (uint8_t *) bo->virt;
   }
   else {
      dstLine = (uint8_t *) pMsm->fbmem + blit->dst->priv[0];
   }

   // Without both base addresses there is nothing that can be copied.
   if (srcLine == NULL || dstLine == NULL)
      goto unmapbo;

   // An empty rectangle is a no-op.  Rejecting non-positive sizes here also keeps negative values
   // away from the row copiers, where they would be converted into huge byte counts.
   if (w <= 0 || h <= 0)
      goto unmapbo;

   srcLine += (blit->srcRect->y * spitch);
   dstLine += (blit->dstRect->y * dpitch);

   src = srcLine + src_x * cpp;
   dst = dstLine + dst_x * cpp;

   // This trivial one-pixel copy is independent of xdir and ydir, so it can be done before the overlap check.
   // (This makes the 1x1 copy case significantly faster and there is reason to believe this is a common case.)
   if (h == 1 && w == 1) {
      switch (bpp) {
         case 8:  *dst = *src;
                  break;
         case 16: *(uint16_t *)dst = *(uint16_t *)src;
                  break;
         case 24: {
                    // Read all three bytes before writing any of them.
                    uint8_t src1a = *(src+0);
                    uint8_t src1b = *(src+1);
                    uint8_t src1c = *(src+2);
                    *(dst+0) = src1a;
                    *(dst+1) = src1b;
                    *(dst+2) = src1c;
                  }
                  break;
         case 32: *(uint32_t *)dst = *(uint32_t *)src;
                  break;
         default: break;
      }
      goto unmapbo;
   }

   // If starting destination and source lines are the same,
   // and rectangles overlap, then buffering of complete rows or reverse row-copies are required.
   if (isOverlap(blit, bpp))
   {
      // If destination and source rectangles overlap and destination is to right of source, then copies must be done from the right to the left.
      if (dst_x > src_x) {
         xdir = -1;
      }

      // If destination and source rectangles overlap and destination is lower than source (or same position as), then copies must be done from the bottom up.
      // Start at the last row and negate both pitches so that the row copiers walk upwards.
      if (dstLine >= srcLine) {
         ydir = -1;
         src += ((h - 1) * spitch);
         dst += ((h - 1) * dpitch);
         spitch = -spitch;
         dpitch = -dpitch;
      }

      // If initial destination line is equal to the initial source line then each row overlaps (requiring memmove() or equivalent).
      if (dstLine == srcLine)
         rowsOverlap = TRUE;
   }

   // Call BPP-specific code to draw pixels.
   switch (bpp) {
      case 8:  swCopyRect8Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      case 16: swCopyRect16Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      case 24: swCopyRect24Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      case 32: swCopyRect32Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      default: break;
   }

unmapbo:
   // Single cleanup point: release whichever buffer objects were mapped above.
   if (blit->src->flags & MSM_BLIT_GEM)
      msm_drm_bo_unmap((struct msm_drm_bo *) blit->src->priv[0]);

   if (blit->dst->flags & MSM_BLIT_GEM)
      msm_drm_bo_unmap((struct msm_drm_bo *) blit->dst->priv[0]);
}


/* Perform a software blit, but assume no overlap.
 *
 *   dst, src       : first byte of the first destination / source row; declared __restrict__, so the
 *                    caller guarantees the two regions do not alias.
 *   w, h           : width in pixels and number of rows.  Non-positive values copy nothing.
 *   dpitch, spitch : byte strides between consecutive rows of the destination / source.
 *   bpp            : bits per pixel; 8, 16, 24 and 32 are handled, any other value copies nothing.
 *   blockSignalsForVFP : forwarded unchanged to the per-bpp copy functions.
 *
 * The copy always runs left-to-right, top-to-bottom (xdir = ydir = 1) with rowsOverlap = FALSE.
 */

void swBlit_NoOverlap(unsigned char * __restrict__ dst, unsigned char * __restrict__ src, int w, int h, int dpitch, int spitch, int bpp, BOOL blockSignalsForVFP)
{
   // An empty rectangle is a no-op.  Rejecting non-positive sizes here also keeps negative values
   // away from the row copiers, where they would be converted into huge byte counts.
   if (w <= 0 || h <= 0)
      return;

   // Trivial one-pixel copy.
   // (This makes the 1x1 copy case significantly faster and there is reason to believe this is a common case.)
   if (h == 1 && w == 1) {
      switch (bpp) {
         case 8:  *dst = *src;
                  break;
         case 16: *(uint16_t *)dst = *(uint16_t *)src;
                  break;
         case 24: {
                    uint8_t src1a = *(src+0);
                    uint8_t src1b = *(src+1);
                    uint8_t src1c = *(src+2);
                    *(dst+0) = src1a;
                    *(dst+1) = src1b;
                    *(dst+2) = src1c;
                  }
                  break;
         case 32: *(uint32_t *)dst = *(uint32_t *)src;
                  break;
         default: break;
      }
      return;
   }

   // Call BPP-specific code to draw pixels.
   // With no overlap the direction is fixed and the memcpy() family is always permitted.
   const int xdir = 1,
             ydir = 1,
             rowsOverlap = FALSE;
   switch (bpp) {
      case 8:  swCopyRect8Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      case 16: swCopyRect16Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      case 24: swCopyRect24Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      case 32: swCopyRect32Bpp_Unaligned(dst, src, w, h, xdir, ydir, dpitch, spitch, rowsOverlap, blockSignalsForVFP);
               break;
      default: break;
   }
}

