/**
 *
 * @file core_zgemmsp.c
 *
 * PaStiX kernel routines operating on the solver structure.
 *
 * @copyright 2011-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
 *                      Univ. Bordeaux. All rights reserved.
 *
 * @version 6.0.1
 * @author Mathieu Faverge
 * @author Pierre Ramet
 * @author Xavier Lacoste
 * @date 2018-07-16
 * @precisions normal z -> c d s
 *
 **/
#include "common.h"
#include "cblas.h"
#include "blend/solver.h"
#include "kernels_trace.h"
#include "pastix_zcores.h"
#include "pastix_zlrcores.h"

#ifndef DOXYGEN_SHOULD_SKIP_THIS
/*
 * Scalar constants -1, 1 and 0. The kernels of this file pass them through
 * CBLAS_SADDR() as the alpha/beta arguments of cblas_zgemm.
 */
static pastix_complex64_t mzone = -1.0;
static pastix_complex64_t zone  =  1.0;
static pastix_complex64_t zzero =  0.0;
#endif /* DOXYGEN_SHOULD_SKIP_THIS */

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates that are generated by the transposition of one
 * single off-diagonal block.
 *
 * Both cblk involved in the computation are stored with the 1D storage: Column
 * Major Layout with blocks interleaved.
 *
 * All the off-diagonal block below block are multiplied by the selected block
 * and added to the facing cblk.
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *          The pointer to the data structure that describes the panel from
 *          which we compute the contributions. Next column blok must be
 *          accessible through cblk[1].
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrix. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[in] blok
 *          The block from which we compute the contributions.
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[inout] C
 *          The pointer to the fcblk.lcoeftab if the lower part is computed,
 *          fcblk.ucoeftab otherwise.
 *
 * @param[in] work
 *          Temporary memory buffer that is at least equal to the height of the
 *          block B by the sum of the height of all the blocks below the block
 *          B.
 *
 *******************************************************************************
 *
 * @sa core_zgemmsp_1d2d
 * @sa core_zgemmsp_2d2d
 *
 *******************************************************************************/
static inline void
core_zgemmsp_1d1d( pastix_coefside_t sideA, pastix_trans_t trans,
                   const SolverCblk         *cblk,
                   const SolverBlok         *blok,
                         SolverCblk         *fcblk,
                   const pastix_complex64_t *A,
                   const pastix_complex64_t *B,
                         pastix_complex64_t *C,
                         pastix_complex64_t *work )
{
    const SolverBlok   *curblk;  /* Block of cblk whose rows are being scattered */
    const SolverBlok   *facing;  /* Candidate facing block in fcblk              */
    const SolverBlok   *endblk;  /* One past the last block of cblk              */
    pastix_complex64_t *src;     /* Read position in the work buffer             */
    pastix_complex64_t *dst;     /* Write position in the C panel                */
    pastix_int_t        lda, ldc, rowoff;
    pastix_int_t        M, N, K, rows;
    int                 skip;

    /* This variant only handles panels that both use the 1D layout */
    assert( !(cblk->cblktype  & CBLK_LAYOUT_2D) );
    assert( !(fcblk->cblktype & CBLK_LAYOUT_2D) );

    /* For the upper part, the selected block itself does not contribute */
    skip = (sideA == PastixUCoef) ? 1 : 0;

    lda = cblk->stride;
    ldc = fcblk->stride;
    K   = cblk_colnbr( cblk );
    N   = blok_rownbr( blok );

    /* Row offset of the first contributing row of A, and number of rows left */
    rowoff = blok->coefind + (skip * N);
    M      = lda - rowoff;

    /*
     * work = A[rowoff:, :] * op( B[blok, :] ), computed in a single GEMM for
     * all the rows below (and including, when skip == 0) the selected block.
     */
    kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
    cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                 M, N, K,
                 CBLAS_SADDR(zone),  A + rowoff,        lda,
                                     B + blok->coefind, lda,
                 CBLAS_SADDR(zzero), work,              M );
    kernel_trace_stop_lvl2( FLOPS_ZGEMM( M, N, K ) );

    /*
     * Scatter -work into the facing panel, one source block at a time.
     */

    /* Column of fcblk that faces the first row of the selected block */
    C += (blok->frownum - fcblk->fcolnum) * ldc;

    src    = work;
    facing = fcblk->fblokptr;
    endblk = cblk[1].fblokptr;

    for (curblk = blok + skip; curblk < endblk; curblk++) {

        /* Advance in fcblk until the block that contains curblk is found */
        while ( !is_block_inside_fblock( curblk, facing ) ) {
            facing++;
            assert( facing < fcblk[1].fblokptr );
        }

        rows = blok_rownbr( curblk );
        dst  = C + facing->coefind + curblk->frownum - facing->frownum;

        /* The target panel is locked for the duration of the accumulation */
        pastix_cblk_lock( fcblk );
        core_zgeadd( PastixNoTrans, rows, N,
                     -1.0, src, M,
                      1.0, dst, ldc );
        pastix_cblk_unlock( fcblk );

        /* The rows of the next block follow directly in the work buffer */
        src += rows;
    }
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates that are generated by the transposition of one
 * single off-diagonal block.
 *
 * The cblk involved in the matrices A and B is stored with the 1D storage:
 * Column Major Layout with blocks interleaved. The facing cblk of the matrix C
 * is stored with the 2D storage where each block is stored contiguously one
 * after another. (Similar to dense tile storage with variable tile size)
 *
 * All the off-diagonal block below block are multiplied by the selected block
 * and added to the facing cblk.
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *          The pointer to the data structure that describes the panel from
 *          which we compute the contributions. Next column blok must be
 *          accessible through cblk[1].
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrix. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[in] blok
 *          The block from which we compute the contributions.
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[inout] C
 *          The pointer to the fcblk.lcoeftab if the lower part is computed,
 *          fcblk.ucoeftab otherwise.
 *
 *******************************************************************************
 *
 * @sa core_zgemmsp_1d1d
 * @sa core_zgemmsp_2d2d
 *
 *******************************************************************************/
static inline void
core_zgemmsp_1d2d( pastix_coefside_t sideA, pastix_trans_t trans,
                   const SolverCblk         *cblk,
                   const SolverBlok         *blok,
                         SolverCblk         *fcblk,
                   const pastix_complex64_t *A,
                   const pastix_complex64_t *B,
                         pastix_complex64_t *C )
{
    const SolverBlok *iterblok;
    const SolverBlok *fblok;
    const SolverBlok *lblok;
    const pastix_complex64_t *blokA;
    const pastix_complex64_t *blokB;
    pastix_complex64_t *blokC;

    pastix_int_t stride, stridef, coloff;
    pastix_int_t M, N, K;
    int shift;

    /* cblk is stored in 1D and fcblk in 2D */
    assert(!(cblk->cblktype & CBLK_LAYOUT_2D));
    assert( fcblk->cblktype & CBLK_LAYOUT_2D );

    /* For the upper part, the selected block itself does not contribute */
    shift = (sideA == PastixUCoef) ? 1 : 0;

    /*
     * Get the B block and its dimensions. A and B both live in the 1D panel
     * cblk, so they share the same leading dimension (cblk->stride).
     */
    blokB  = B + blok->coefind;
    stride = cblk->stride;
    K = cblk_colnbr( cblk );
    N = blok_rownbr( blok );

    /* Column offset of the update inside fcblk; it is the same for all blocks */
    coloff = blok->frownum - fcblk->fcolnum;

    /*
     * Add contribution to C in fcblk:
     *    Get the first facing block of the distant panel, and the last block of
     *    the current cblk
     */
    fblok = fcblk->fblokptr;
    lblok = cblk[1].fblokptr;

    for (iterblok=blok+shift; iterblok<lblok; iterblok++) {

        /* Find facing blok */
        while (!is_block_inside_fblock( iterblok, fblok ))
        {
            fblok++;
            assert( fblok < fcblk[1].fblokptr );
        }

        /* In the 2D layout, the leading dimension is the facing block height */
        stridef = blok_rownbr(fblok);

        /* Get the A block and its dimensions */
        blokA = A + iterblok->coefind;
        M = blok_rownbr( iterblok );

        /* Top-left corner of the updated area inside the facing block */
        blokC = C + fblok->coefind
            + iterblok->frownum - fblok->frownum
            + coloff * stridef;

        /* C -= A * op(B), applied in place under the lock of the target panel */
        pastix_cblk_lock( fcblk );
        kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
        cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                     M, N, K,
                     CBLAS_SADDR(mzone), blokA, stride,
                                         blokB, stride,
                     CBLAS_SADDR(zone),  blokC, stridef );
        kernel_trace_stop_lvl2( FLOPS_ZGEMM( M, N, K ) );
        pastix_cblk_unlock( fcblk );
    }
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates that are generated by the transposition of one
 * single off-diagonal block.
 *
 * Both cblk involved in the matrices A, B and C are stored with the 2D storage
 * where each block is stored continuously one after another. (Similar to dense
 * tile storage with variant tile size)
 *
 * All the off-diagonal block below block are multiplied by the selected block
 * and added to the facing cblk.
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *          The pointer to the data structure that describes the panel from
 *          which we compute the contributions. Next column blok must be
 *          accessible through cblk[1].
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrix. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[in] blok
 *          The block from which we compute the contributions.
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[inout] C
 *          The pointer to the fcblk.lcoeftab if the lower part is computed,
 *          fcblk.ucoeftab otherwise.
 *
 *******************************************************************************
 *
 * @sa core_zgemmsp_1d1d
 * @sa core_zgemmsp_1d2d
 *
 *******************************************************************************/
static inline void
core_zgemmsp_2d2d( pastix_coefside_t sideA, pastix_trans_t trans,
                   const SolverCblk         *cblk,
                   const SolverBlok         *blok,
                         SolverCblk         *fcblk,
                   const pastix_complex64_t *A,
                   const pastix_complex64_t *B,
                         pastix_complex64_t *C )
{
    const SolverBlok         *curblk;  /* Block of cblk currently multiplied */
    const SolverBlok         *facing;  /* Candidate facing block in fcblk    */
    const SolverBlok         *endblk;  /* One past the last block of cblk    */
    const pastix_complex64_t *Bblk;
    pastix_int_t              rows, cols, depth, coloff;
    int                       skip;

    /* This variant requires the 2D layout on both panels */
    assert( cblk->cblktype  & CBLK_LAYOUT_2D );
    assert( fcblk->cblktype & CBLK_LAYOUT_2D );

    /* For the upper part, the selected block itself does not contribute */
    skip = (sideA == PastixUCoef) ? 1 : 0;

    /*
     * The B operand is the selected block: its height gives both the N
     * dimension of the product and its leading dimension.
     */
    Bblk  = B + blok->coefind;
    cols  = blok_rownbr( blok );
    depth = cblk_colnbr( cblk );

    /* Column offset of the update inside fcblk */
    coloff = blok->frownum - fcblk->fcolnum;

    facing = fcblk->fblokptr;
    endblk = cblk[1].fblokptr;

    curblk = blok + skip;
    while ( curblk < endblk ) {
        const pastix_complex64_t *Ablk;
        pastix_complex64_t       *Cblk;
        pastix_int_t              ldf;

        /* Advance in fcblk until the block that contains curblk is found */
        while ( !is_block_inside_fblock( curblk, facing ) ) {
            facing++;
            assert( facing < fcblk[1].fblokptr );
        }

        /* Leading dimensions are the heights of the respective blocks */
        ldf  = blok_rownbr( facing );
        rows = blok_rownbr( curblk );

        Ablk = A + curblk->coefind;
        Cblk = C + facing->coefind
            + curblk->frownum - facing->frownum
            + coloff * ldf;

        /* C -= A * op(B), applied in place under the lock of the target panel */
        pastix_cblk_lock( fcblk );
        kernel_trace_start_lvl2( PastixKernelLvl2_FR_GEMM );
        cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                     rows, cols, depth,
                     CBLAS_SADDR(mzone), Ablk, rows,
                                         Bblk, cols,
                     CBLAS_SADDR(zone),  Cblk, ldf );
        kernel_trace_stop_lvl2( FLOPS_ZGEMM( rows, cols, depth ) );
        pastix_cblk_unlock( fcblk );

        curblk++;
    }
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates that are generated by the transposition of all the
 * blocks facing a common diagonal block, by another similar set of blocks.
 *
 * This is used to perform an update:
 *
 *                      C_mn = C_mn - A_mk * op(B_kn)
 *
 * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
 * cblk m; B_kn is the set of blocks in cblk k facing the diagonal block of the
 * cblk n; and C_mn is the set of blocks impacted by this update, it necessarily
 * belongs to the set of blocks of the cblk n facing the diagonal block of the
 * cblk m.
 *
 *******************************************************************************
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrices. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] blok_mk
 *          Index of the first off-diagonal block in cblk, that is used for A.
 *
 * @param[in] blok_kn
 *          Index of the first off-diagonal block in cblk, that is used for B.
 *
 * @param[in] blok_mn
 *          Index of the first off-diagonal block in fcblk, that is used for C.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[inout] C
 *          The pointer to the fcblk.lcoeftab if the lower part is computed,
 *          fcblk.ucoeftab otherwise.
 *
 *******************************************************************************
 *
 * @sa core_zgemmsp_1d1d
 * @sa core_zgemmsp_1d2d
 *
 *******************************************************************************/
static inline void
core_zgemmsp_block_frfr( pastix_trans_t trans,
                         pastix_int_t blok_mk,
                         pastix_int_t blok_kn,
                         pastix_int_t blok_mn,
                         const SolverCblk         *cblk,
                               SolverCblk         *fcblk,
                         const pastix_complex64_t *A,
                         const pastix_complex64_t *B,
                               pastix_complex64_t *C )
{
    const SolverBlok *blokA, *blokB, *blokC;
    const SolverBlok *bA, *bB, *bC;
    const SolverBlok *fblokK, *lblokK;
    const SolverBlok *fblokN, *lblokN;

    const pastix_complex64_t *Aptr, *Bptr;
    pastix_complex64_t *Cptr;
    pastix_int_t M, N, K, lda, ldb, ldc, cblk_n, cblk_m;
    pastix_int_t full_m, full_n;
    size_t offsetA, offsetB, offsetC;

    pastix_fixdbl_t flops = 0.0;
    pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlok2d2d );

    /* Both cblk and fcblk must be stored in 2D */
    assert( cblk->cblktype  & CBLK_LAYOUT_2D );
    assert( fcblk->cblktype & CBLK_LAYOUT_2D );

    /*
     * Blocs on column K
     *
     * The coefind of the first A/B/C block is kept as an offset and subtracted
     * from every block address below: the A, B and C pointers are thus used as
     * pointers to the data of blok_mk, blok_kn and blok_mn respectively.
     */
    fblokK = cblk[0].fblokptr;
    lblokK = cblk[1].fblokptr;

    blokB = fblokK + blok_kn;
    offsetB = blokB->coefind;
    cblk_n = blokB->fcblknm;

    blokA = fblokK + blok_mk;
    offsetA = blokA->coefind;
    cblk_m = blokA->fcblknm;

    /*
     * Blocs on column N
     */
    fblokN = fcblk[0].fblokptr;
    lblokN = fcblk[1].fblokptr;

    blokC = fblokN + blok_mn;
    offsetC = blokC->coefind;
    assert( blokC->fcblknm == cblk_m );

    K = cblk_colnbr( cblk );
    full_m = 0;
    full_n = 0;

    /* Loop on all the blocks of cblk that face the same cblk as blokA */
    bC = blokC;
    for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
        M = blok_rownbr(bA);
        Aptr = A + bA->coefind - offsetA;
        lda = M;
        full_m += M;

        /* Find facing C blok */
        while (!is_block_inside_fblock( bA, bC )) {
            bC++;
            assert( bC < lblokN );
        }

        Cptr = C + bC->coefind - offsetC;
        ldc = blok_rownbr(bC);

        /*
         * Loop on all the blocks of cblk that face the same cblk as blokB.
         * The total height of the B set is identical for every bA, so it is
         * simply recomputed on each pass for the final trace.
         */
        full_n = 0;
        for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
            N = blok_rownbr( bB );
            Bptr = B + bB->coefind - offsetB;
            ldb = N;
            full_n += N;

            /* C[bA rows, bB rows] -= A_bA * op(B_bB) */
            cblas_zgemm( CblasColMajor, CblasNoTrans, (CBLAS_TRANSPOSE)trans,
                         M, N, K,
                         CBLAS_SADDR(mzone), Aptr, lda,
                         Bptr, ldb,
                         CBLAS_SADDR(zone),  Cptr + (bA->frownum - bC->frownum)
                         + (bB->frownum - fcblk->fcolnum) * ldc , ldc );

            flops += FLOPS_ZGEMM( M, N, K );
        }
    }

    /* Report the aggregated M-by-N-by-K dimensions of the whole update */
    kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlok2d2d,
                       full_m, full_n, K,
                       flops, time );

    /* lblokN is only read by the assert above */
    (void)lblokN;
    return;
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates that are generated by the transposition of all the
 * blocks facing a common diagonal block, by another similar set of blocks.
 *
 * This is used to perform an update:
 *
 *                      C_mn = C_mn - A_mk * op(B_kn)
 *
 * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
 * cblk m; B_kn is the set of blocks in cblk k facing the diagonal block of the
 * cblk n; and C_mn is the set of blocks impacted by this update, it necessarily
 * belongs to the set of blocks of the cblk n facing the diagonal block of the
 * cblk m.
 *
 * In this routine, the blocks of A and B are stored full-rank (cblk is not
 * compressed), while the blocks of C are low-rank (fcblk is compressed).
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *
 * @param[in] transB
 *          Specify the transposition used for the B matrices. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] blok_mk
 *          Index of the first off-diagonal block in cblk, that is used for A.
 *
 * @param[in] blok_kn
 *          Index of the first off-diagonal block in cblk, that is used for B.
 *
 * @param[in] blok_mn
 *          Index of the first off-diagonal block in fcblk, that is used for C.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[in] lowrank
 *          The structure with low-rank parameters.
 *
 *******************************************************************************/
static inline void
core_zgemmsp_block_frlr( pastix_coefside_t         sideA,
                         pastix_trans_t            transB,
                         pastix_int_t              blok_mk,
                         pastix_int_t              blok_kn,
                         pastix_int_t              blok_mn,
                         const SolverCblk         *cblk,
                               SolverCblk         *fcblk,
                         const pastix_complex64_t *A,
                         const pastix_complex64_t *B,
                         const pastix_lr_t        *lowrank )
{
    const SolverBlok *blokA, *blokB, *blokC;
    const SolverBlok *bA, *bB, *bC;
    const SolverBlok *fblokK, *lblokK;
    const SolverBlok *fblokN, *lblokN;

    pastix_lrblock_t lrA, lrB;
    pastix_lrblock_t *lrC;
    core_zlrmm_t params;

    pastix_int_t M, N, K, cblk_n, cblk_m, full_m, full_n;
    size_t offsetA, offsetB;

    /* The trace is opened and closed with the same kernel identifier */
    pastix_fixdbl_t flops = 0.0;
    pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlokLRLR );

    /* Provide an internal lock as this function is already protected by the runtime dependency system */
    pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;

    /* Both cblk and fcblk must be stored in 2D */
    assert( cblk->cblktype  & CBLK_LAYOUT_2D );
    assert( fcblk->cblktype & CBLK_LAYOUT_2D );

    /* cblk must NOT be compressed, while fcblk must be compressed */
    assert(!(cblk->cblktype  & CBLK_COMPRESSED) );
    assert(  fcblk->cblktype & CBLK_COMPRESSED  );

    /*
     * Blocs on column K
     *
     * The coefind of the first A/B block is kept as an offset and subtracted
     * from every block address below: the A and B pointers are thus used as
     * pointers to the data of blok_mk and blok_kn respectively.
     * NOTE(review): rk = -1 is presumably the full-rank marker of
     * pastix_lrblock_t -- confirm against its definition.
     */
    fblokK = cblk[0].fblokptr;
    lblokK = cblk[1].fblokptr;

    blokB   = fblokK + blok_kn;
    offsetB = blokB->coefind;
    cblk_n  = blokB->fcblknm;
    lrB.rk  = -1;

    blokA   = fblokK + blok_mk;
    offsetA = blokA->coefind;
    cblk_m  = blokA->fcblknm;
    lrA.rk  = -1;

    /*
     * Blocs on column N
     */
    fblokN = fcblk[0].fblokptr;
    lblokN = fcblk[1].fblokptr;

    blokC = fblokN + blok_mn;
    assert( blokC->fcblknm == cblk_m );

    full_m = 0;
    full_n = 0;
    K = cblk_colnbr( cblk );

    /* Parameters of core_zlrmm that are common to all the block products */
    params.lowrank = lowrank;
    params.transA  = PastixNoTrans;
    params.transB  = transB;
    params.K       = K;
    params.Cn      = cblk_colnbr( fcblk );
    params.alpha   = -1.0;
    params.beta    = 1.0;
    params.work    = NULL;
    params.lwork   = -1;
    params.lock    = &(lock);

    /* Loop on all the blocks of cblk that face the same cblk as blokA */
    bC = blokC;
    for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
        M         = blok_rownbr(bA);
        params.M  = M;
        full_m   += M;

        /* Wrap the A block: its data is M rows high in the 2D layout */
        lrA.rkmax = M;
        lrA.u = (pastix_complex64_t*)A +  bA->coefind - offsetA;
        lrA.v = NULL;

        /* Find facing C blok */
        while (!is_block_inside_fblock( bA, bC )) {
            bC++;
            assert( bC < lblokN );
        }

        /* Select the low-rank block of the side (L or U) being updated */
        lrC = bC->LRblock + sideA;

        params.A = &lrA;
        params.C = lrC;
        params.Cm = blok_rownbr( bC );

        /*
         * Loop on all the blocks of cblk that face the same cblk as blokB.
         * The total height of the B set is identical for every bA, so it is
         * simply recomputed on each pass for the final trace.
         */
        full_n = 0;
        for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {

            N        = blok_rownbr( bB );
            full_n  += N;

            params.N    = N;
            params.offx = bA->frownum - bC->frownum;
            params.offy = bB->frownum - fcblk->fcolnum;

            /* Wrap the B block: its data is N rows high, not M */
            lrB.rkmax = N;
            lrB.u = (pastix_complex64_t*)B +  bB->coefind - offsetB;
            lrB.v = NULL;

            params.B = &lrB;

            flops += core_zlrmm( &params );
        }
    }

    /* Report the aggregated M-by-N-by-K dimensions of the whole update */
    kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlokLRLR,
                       full_m, full_n, K,
                       flops, time );

    /* lblokN is only read by the assert above */
    (void)lblokN;
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates that are generated by the transposition of all the
 * blocks facing a common diagonal block, by another similar set of blocks.
 *
 * This is used to perform an update:
 *
 *                      C_mn = C_mn - A_mk * op(B_kn)
 *
 * where A_mk is the set of blocks in cblk k, facing the diagonal block of the
 * cblk m; B_kn is the set of blocks in cblk k facing the diagonal block of the
 * cblk n; and C_mn is the set of blocks impacted by this update, it necessarily
 * belongs to the set of blocks of the cblk n facing the diagonal block of the
 * cblk m.
 *
 * In this routine, all the matrices are low-rank
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *
 * @param[in] sideB
 *          Specify if B belongs to the L part, or to the U part. This is used
 *          internally in the kernel to select the correct data pointer.
 *          If PastixLCoef, B belongs to the L part, otherwise B belongs to the
 *          U part.
 *
 * @param[in] transB
 *          Specify the transposition used for the B matrices. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] blok_mk
 *          Index of the first off-diagonal block in cblk, that is used for A.
 *
 * @param[in] blok_kn
 *          Index of the first off-diagonal block in cblk, that is used for B.
 *
 * @param[in] blok_mn
 *          Index of the first off-diagonal block in fcblk, that is used for C.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] lowrank
 *          The structure with low-rank parameters.
 *
 *******************************************************************************/
static inline void
core_zgemmsp_block_lrlr( pastix_coefside_t  sideA,
                         pastix_coefside_t  sideB,
                         pastix_trans_t     transB,
                         pastix_int_t       blok_mk,
                         pastix_int_t       blok_kn,
                         pastix_int_t       blok_mn,
                         const SolverCblk  *cblk,
                               SolverCblk  *fcblk,
                         const pastix_lr_t *lowrank )
{
    const SolverBlok *blokA, *blokB, *blokC;
    const SolverBlok *bA, *bB, *bC;
    const SolverBlok *fblokK, *lblokK;
    const SolverBlok *fblokN, *lblokN;

    const pastix_lrblock_t *lrA, *lrB;
    pastix_lrblock_t *lrC;
    core_zlrmm_t params;

    pastix_int_t M, K, cblk_n, cblk_m, full_m;

    pastix_fixdbl_t flops = 0.0;
    /* The kernel type must be the same as the one given to kernel_trace_stop() below */
    pastix_fixdbl_t time = kernel_trace_start( PastixKernelGEMMBlokLRLR );

    /* Provide an internal lock as this function is already protected by the runtime dependency system */
    pastix_atomic_lock_t lock = PASTIX_ATOMIC_UNLOCKED;

    /* Both cblk and fcblk must be stored in 2D */
    assert( cblk->cblktype  & CBLK_LAYOUT_2D );
    assert( fcblk->cblktype & CBLK_LAYOUT_2D );

    /* Both cblk and fcblk must be compressed */
    assert( cblk->cblktype  & CBLK_COMPRESSED );
    assert( fcblk->cblktype & CBLK_COMPRESSED );

    /*
     * Blocks on column K: [fblokK, lblokK) is the range of blocks of cblk
     */
    fblokK = cblk[0].fblokptr;
    lblokK = cblk[1].fblokptr;

    /* First B block, and the index of the cblk it faces */
    blokB = fblokK + blok_kn;
    cblk_n = blokB->fcblknm;

    /* First A block, and the index of the cblk it faces */
    blokA = fblokK + blok_mk;
    cblk_m = blokA->fcblknm;

    /*
     * Blocks on column N: [fblokN, lblokN) is the range of blocks of fcblk
     */
    fblokN = fcblk[0].fblokptr;
    lblokN = fcblk[1].fblokptr;

    /* First C block: it must face the same cblk as the A blocks */
    blokC = fblokN + blok_mn;
    assert( blokC->fcblknm == cblk_m );

    full_m = 0;
    K = cblk_colnbr( cblk );

    params.lowrank = lowrank;
    params.transA  = PastixNoTrans;
    params.transB  = transB;
    params.K       = K;
    params.Cn      = cblk_colnbr( fcblk );
    params.alpha   = -1.0;
    params.beta    = 1.0;
    /* No workspace is provided to the low-rank kernel */
    params.work    = NULL;
    params.lwork   = -1;
    /* Initialize lwused as well, so that core_zlrmm() never reads an indeterminate value */
    params.lwused  = 0;
    params.lock    = &(lock);

    bC = blokC;
    /* Loop over all the consecutive A blocks facing the cblk of index cblk_m */
    for (bA = blokA; (bA < lblokK) && (bA->fcblknm == cblk_m); bA++) {
        M   = blok_rownbr(bA);
        params.M  = M;
        full_m   += M;
        lrA = bA->LRblock + sideA;

        /* Find facing C blok (bC is never rewound between two A blocks) */
        while (!is_block_inside_fblock( bA, bC )) {
            bC++;
            assert( bC < lblokN );
        }

        lrC  = bC->LRblock + sideA;

        params.A = lrA;
        params.C = lrC;
        params.Cm = blok_rownbr( bC );

        /* Loop over all the consecutive B blocks facing the cblk of index cblk_n */
        for (bB = blokB; (bB < lblokK) && (bB->fcblknm == cblk_n); bB++) {
            lrB = bB->LRblock + sideB;

            params.N    = blok_rownbr( bB );
            params.offx = bA->frownum - bC->frownum;
            params.offy = bB->frownum - fcblk->fcolnum;
            params.B    = lrB;

            flops += core_zlrmm( &params );
        }
    }

    /*
     * NOTE(review): full_m is given for both the m and n dimensions of the
     * trace; the cumulated height of the B blocks is not computed here.
     * Confirm this is intended.
     */
    kernel_trace_stop( blokB->inlast, PastixKernelGEMMBlokLRLR,
                       full_m, full_m, K,
                       flops, time );

    /* lblokN is only used in an assert: avoid the unused warning without assertions */
    (void)lblokN;
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates associated to one off-diagonal block.
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrix. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[in] blok
 *          The block from which we compute the contributions.
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
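 * @param[in] work
 *          Temporary memory buffer.
 *
 * @param[in] lwork
 *          Size of the work buffer; it is forwarded to core_zlrmm() together
 *          with work.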
 *
 * @param[in] lowrank
 *          The structure with low-rank parameters.
 *
 *******************************************************************************
 *
 * @return  The number of flops performed
 *
 *******************************************************************************/
static inline pastix_fixdbl_t
core_zgemmsp_fulllr( pastix_coefside_t         sideA,
                     pastix_trans_t            trans,
                     const SolverCblk         *cblk,
                     const SolverBlok         *blok,
                           SolverCblk         *fcblk,
                     const pastix_complex64_t *A,
                     const pastix_complex64_t *B,
                           pastix_complex64_t *work,
                           pastix_int_t        lwork,
                     const pastix_lr_t        *lowrank )
{
    const SolverBlok *iterblok;
    const SolverBlok *fblok;
    const SolverBlok *lblok;
    pastix_lrblock_t lrA, lrB, *lrC;
    core_zlrmm_t params;

    pastix_int_t stride, shift;
    pastix_int_t M, N;

    pastix_fixdbl_t flops = 0.0;

    /* Update from a dense block to a low rank block */
    assert(!(cblk->cblktype  & CBLK_COMPRESSED));
    assert(  fcblk->cblktype & CBLK_COMPRESSED );
    assert(  fcblk->cblktype & CBLK_LAYOUT_2D  );

    /*
     * With sideA == PastixUCoef, blok itself is skipped and the second
     * LRblock entry of the facing blocks is updated.
     */
    shift  = (sideA == PastixUCoef) ? 1 : 0;
    stride = cblk->stride;

    N = blok_rownbr( blok );

    /*
     * Wrap the dense B block into a low-rank descriptor.
     * NOTE(review): rk = -1 with v = NULL presumably flags a dense block for
     * core_zlrmm(), and rkmax presumably holds its leading dimension (N with
     * the 2D layout, the panel stride otherwise) - confirm against
     * pastix_lrblock_t.
     */
    lrB.rk = -1;
    lrB.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? N : stride;
    lrB.u = (pastix_complex64_t*)B + blok->coefind; /* B is only given as the input operand params.B, so the const qualifier can be dropped */
    lrB.v = NULL;

    /**
     * Add contribution to C in fcblk:
     *    Get the first facing block of the distant panel, and the last block of
     *    the current cblk
     */
    fblok = fcblk->fblokptr;
    lblok = cblk[1].fblokptr;

    params.lowrank = lowrank;
    params.transA  = PastixNoTrans;
    params.transB  = trans;
    params.N       = N;
    params.K       = cblk_colnbr( cblk );
    params.Cn      = cblk_colnbr( fcblk );
    params.alpha   = -1.0;
    params.beta    = 1.0;
    params.work    = work;
    params.lwork   = lwork;
    /* Nothing is used yet in the workspace: same initialization as in core_zgemmsp_lr() */
    params.lwused  = 0;
    params.lock    = &(fcblk->lock);
    params.B       = &lrB;

    /* For all the blocks of the panel, starting at blok (or right after it for the U side) */
    for (iterblok=blok+shift; iterblok<lblok; iterblok++) {

        /* Find facing blok (fblok is never rewound between two iterations) */
        while (!is_block_inside_fblock( iterblok, fblok ))
        {
            fblok++;
            assert( fblok < fcblk[1].fblokptr );
        }

        /* Get the A block and its dimensions */
        M = blok_rownbr( iterblok );
        lrA.rk = -1;
        lrA.rkmax = (cblk->cblktype & CBLK_LAYOUT_2D) ? M : stride;
        lrA.u = (pastix_complex64_t*)A + iterblok->coefind; /* Same as for B */
        lrA.v = NULL;

        lrC = fblok->LRblock + shift;

        params.M  = M;
        params.A  = &lrA;
        params.C  = lrC;
        params.Cm = blok_rownbr( fblok );

        /* Position of the contribution inside the facing block */
        params.offx = iterblok->frownum - fblok->frownum;
        params.offy = blok->frownum - fcblk->fcolnum;

        flops += core_zlrmm( &params );
    }
    return flops;
}

/**
 *******************************************************************************
 *
 * @ingroup kernel_fact_null
 *
 * @brief Compute the updates associated to one off-diagonal block.
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *
 * @param[in] sideB
 *          Specify if B belongs to the L part, or to the U part. This is used
 *          internally in the kernel to select the correct data pointer.
 *          If PastixLCoef, B belongs to the L part, otherwise B belongs to the
 *          U part.
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrix. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[in] blok
 *          The block from which we compute the contributions.
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
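 * @param[in] work
 *          Temporary memory buffer.
 *
 * @param[in] lwork
 *          Size of the work buffer; it is forwarded to core_zlrmm() together
 *          with work.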
 *
 * @param[in] lowrank
 *          The structure with low-rank parameters.
 *
 *******************************************************************************
 *
 * @return  The number of flops performed
 *
 *******************************************************************************/
static inline pastix_fixdbl_t
core_zgemmsp_lr( pastix_coefside_t         sideA,
                 pastix_coefside_t         sideB,
                 pastix_trans_t            trans,
                 const SolverCblk         *cblk,
                 const SolverBlok         *blok,
                       SolverCblk         *fcblk,
                       pastix_complex64_t *work,
                       pastix_int_t        lwork,
                 const pastix_lr_t        *lowrank )
{
    /*
     * On the U side, blok itself is skipped and the second LRblock entry of
     * the A and C blocks is used.
     */
    const pastix_int_t side_shift = (sideA == PastixUCoef) ? 1 : 0;

    const SolverBlok *src_blok;     /* Current A block in cblk          */
    const SolverBlok *src_end;      /* One past the last block of cblk  */
    const SolverBlok *dst_blok;     /* Block of fcblk facing src_blok   */

    core_zlrmm_t    params;
    pastix_fixdbl_t total_flops = 0.0;

    /* Both the source and the destination panels are compressed and stored in 2D */
    assert( cblk->cblktype  & CBLK_COMPRESSED );
    assert( fcblk->cblktype & CBLK_COMPRESSED );
    assert( cblk->cblktype  & CBLK_LAYOUT_2D );
    assert( fcblk->cblktype & CBLK_LAYOUT_2D );

    /* Part of the description that is common to all the updates */
    params.lowrank = lowrank;
    params.transA  = PastixNoTrans;
    params.transB  = trans;
    params.K       = cblk_colnbr( cblk );
    params.N       = blok_rownbr( blok );
    params.Cn      = cblk_colnbr( fcblk );
    params.alpha   = -1.0;
    params.beta    = 1.0;
    params.work    = work;
    params.lwork   = lwork;
    params.lwused  = 0;
    params.lock    = &(fcblk->lock);

    /* B is the first LRblock entry of blok for the L side, the second one otherwise */
    params.B = blok->LRblock + ((sideB == PastixLCoef) ? 0 : 1);

    /*
     * Walk through the remaining blocks of the panel, and move forward in
     * fcblk to find the block that receives each contribution.
     */
    dst_blok = fcblk->fblokptr;
    src_end  = cblk[1].fblokptr;
    src_blok = blok + side_shift;

    while ( src_blok < src_end ) {

        /* Move forward to the block of fcblk that contains src_blok */
        for ( ; !is_block_inside_fblock( src_blok, dst_blok ); ) {
            dst_blok++;
            assert( dst_blok < fcblk[1].fblokptr );
        }

        params.M  = blok_rownbr( src_blok );
        params.A  = src_blok->LRblock + side_shift;
        params.C  = dst_blok->LRblock + side_shift;
        params.Cm = blok_rownbr( dst_blok );

        /* Position of the contribution inside the facing block */
        params.offx = src_blok->frownum - dst_blok->frownum;
        params.offy = blok->frownum - fcblk->fcolnum;

        total_flops += core_zlrmm( &params );

        src_blok++;
    }

    return total_flops;
}

/**
 *******************************************************************************
 *
 * @brief Compute the updates associated to one off-diagonal block.
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the lower part, or to the upper part.
 *          If sideA == PastixLCoef, the contribution of:
 *          (block .. (cblk[1].fblokptr-1)) -by- block is computed and added to
 *          C, otherwise the contribution:
 *          (block+1 .. (cblk[1].fblokptr-1)) -by- block is computed and added
 *          to C.
 *          The pointer to the data structure that describes the panel from
 *          which we compute the contributions. Next column blok must be
 *          accessible through cblk[1].
 *
 * @param[in] sideB
 *          Specify if B belongs to the L part, or to the U part. this is used
 *          internally in the kernel to select the correct data pointer.
 *          If PastixLCoef, B belongs to the L part, otherwise B belongs to the
 *          U part.
 *
 * @param[in] trans
 *          Specify the transposition used for the B matrix. It has to be either
 *          PastixTrans or PastixConjTrans.
 *
 * @param[in] cblk
 *          The cblk structure to which block belongs to. The A and B pointers
 *          must be the coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[in] blok
 *          The block from which we compute the contributions.
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          coeftab from this fcblk. Next column blok must be accessible through
 *          fcblk[1].
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[inout] C
 *          The pointer to the fcblk.lcoeftab if the lower part is computed,
 *          fcblk.ucoeftab otherwise.
 *
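 * @param[in] work
 *          Temporary memory buffer.
 *
 * @param[in] lwork
 *          Size of the work buffer; it is forwarded to the low-rank kernels
 *          together with work.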
 *
 * @param[in] lowrank
 *          The structure with low-rank parameters.
 *
 *******************************************************************************
 *
 * @return  Nothing. The flops and the timing of the selected kernel are
 *          reported through kernel_trace_stop().
 *
 *******************************************************************************/
void
cpucblk_zgemmsp(       pastix_coefside_t   sideA,
                       pastix_coefside_t   sideB,
                       pastix_trans_t      trans,
                 const SolverCblk         *cblk,
                 const SolverBlok         *blok,
                       SolverCblk         *fcblk,
                 const pastix_complex64_t *A,
                 const pastix_complex64_t *B,
                       pastix_complex64_t *C,
                       pastix_complex64_t *work,
                       pastix_int_t        lwork,
                 const pastix_lr_t        *lowrank )
{
    /* Storage of the source (cblk) and destination (fcblk) panels */
    const int src_is_lr = ( cblk->cblktype  & CBLK_COMPRESSED ) != 0;
    const int src_is_2d = ( cblk->cblktype  & CBLK_LAYOUT_2D  ) != 0;
    const int dst_is_lr = ( fcblk->cblktype & CBLK_COMPRESSED ) != 0;
    const int dst_is_2d = ( fcblk->cblktype & CBLK_LAYOUT_2D  ) != 0;

    /* Dimensions reported to the trace, and used for the dense flops count */
    const pastix_int_t n = blok_rownbr( blok );
    const pastix_int_t k = cblk_colnbr( cblk );
    pastix_int_t       m = cblk->stride;

    pastix_ktype_t  ktype;
    pastix_fixdbl_t time;
    pastix_fixdbl_t flops = 0.0;

    /*
     * m is the stride of the panel minus the row offset of blok; coefind is
     * divided by k with the 2D layout.
     * NOTE(review): presumably the number of rows from blok to the end of the
     * panel - confirm.
     */
    if ( src_is_2d ) {
        m -= blok->coefind / k;
    }
    else {
        m -= blok->coefind;
    }

    /* The rows of blok itself are not counted for the U side */
    if ( sideA == PastixUCoef ) {
        m -= n;
    }

    /* Select the kernel that matches the storage of both panels */
    if ( dst_is_lr && src_is_lr ) {
        ktype = PastixKernelGEMMCblkLRLR;
        time  = kernel_trace_start( ktype );

        flops = core_zgemmsp_lr( sideA, sideB, trans,
                                 cblk, blok, fcblk,
                                 work, lwork, lowrank );
    }
    else if ( dst_is_lr ) {
        ktype = PastixKernelGEMMCblkFRLR;
        time  = kernel_trace_start( ktype );

        flops = core_zgemmsp_fulllr( sideA, trans,
                                     cblk, blok, fcblk,
                                     A, B, work, lwork,
                                     lowrank );
    }
    else if ( dst_is_2d && src_is_2d ) {
        ktype = PastixKernelGEMMCblk2d2d;
        time  = kernel_trace_start( ktype );

        core_zgemmsp_2d2d( sideA, trans,
                           cblk, blok, fcblk,
                           A, B, C );

        flops = FLOPS_ZGEMM( m, n, k );
    }
    else if ( dst_is_2d ) {
        ktype = PastixKernelGEMMCblk1d2d;
        time  = kernel_trace_start( ktype );

        core_zgemmsp_1d2d( sideA, trans,
                           cblk, blok, fcblk,
                           A, B, C );

        flops = FLOPS_ZGEMM( m, n, k );
    }
    else {
        ktype = PastixKernelGEMMCblk1d1d;
        time  = kernel_trace_start( ktype );

        core_zgemmsp_1d1d( sideA, trans,
                           cblk, blok, fcblk,
                           A, B, C, work );

        flops = FLOPS_ZGEMM( m, n, k );
    }

    kernel_trace_stop( blok->inlast, ktype, m, n, k, flops, time );
}

/**
 *******************************************************************************
 *
 * @brief Compute the CPU gemm associated to a couple of off-diagonal blocks.
 *
 *    C_l = C_l - A_l * op(B_s), with B_s = B_l, or B_u
 *  or
 *    C_u = C_u - A_u * op(B_s), with B_s = B_l, or B_u
 *
 *******************************************************************************
 *
 * @param[in] sideA
 *          Specify if A and C belong to the L part, or to the U part of the
 *          matrix. This is used internally in the kernels to select the correct
 *          data pointers.  If PastixLCoef, A and C belong to the L part,
 *          otherwise A and C belong to the U part.
 *
 * @param[in] sideB
 *          Specify if B belongs to the lower or upper part of the matrix. This
 *          is used internally in the kernels to select the correct data
 *          pointers.  If PastixLCoef, B belongs to the L part, otherwise B
 *          belongs to the U part.
 *
 * @param[in] transB
 *          Specify whether B should be used as PastixNoTrans, PastixTrans, or
 *          PastixConjTrans in the computations.
 *
 * @param[in] cblk
 *          The cblk structure to which block A and B belong to. The A and B
 *          pointers must be one of the [lu]coeftab of this column block.
 *          Next column blok must be accessible through cblk[1].
 *
 * @param[inout] fcblk
 *          The pointer to the data structure that describes the panel on which
 *          we compute the contributions. The C pointer must be one of the
 *          [lu]coeftab from this fcblk.
 *          Next column blok must be accessible through fcblk[1].
 *
 * @param[in] blok_mk
 *          Specify the index of the A block in the cblk column. This index is
 *          0-based for the diagonal block.
 *
 * @param[in] blok_nk
 *          Specify the index of the B block in the cblk column. This index is
 *          0-based for the diagonal block.
 *
 * @param[in] blok_mn
 *          Specify the index of the C block in the fcblk column. This index is
 *          0-based for the diagonal block.
 *
 * @param[in] A
 *          The pointer to the coeftab of the cblk.lcoeftab matrix storing the
 *          coefficients of the panel when the Lower part is computed,
 *          cblk.ucoeftab otherwise. Must be of size cblk.stride -by- cblk.width
 *
 * @param[in] B The pointer to the coeftab of the cblk.lcoeftab matrix storing
 *          the coefficients of the panel, if Symmetric/Hermitian cases or if
 *          upper part is computed; cblk.ucoeftab otherwise. Must be of size
 *          cblk.stride -by- cblk.width
 *
 * @param[inout] C
 *          The pointer to the fcblk.lcoeftab if the lower part is computed,
 *          fcblk.ucoeftab otherwise.
 *
 * @param[in] lowrank
 *          The structure with the low-rank parameters.
 *
 *******************************************************************************/
void
cpublok_zgemmsp(       pastix_coefside_t   sideA,
                       pastix_coefside_t   sideB,
                       pastix_trans_t      transB,
                 const SolverCblk         *cblk,
                       SolverCblk         *fcblk,
                       pastix_int_t        blok_mk,
                       pastix_int_t        blok_nk,
                       pastix_int_t        blok_mn,
                 const pastix_complex64_t *A,
                 const pastix_complex64_t *B,
                       pastix_complex64_t *C,
                 const pastix_lr_t        *lowrank )
{
    /* Is the destination (fcblk), respectively the source (cblk), panel compressed? */
    const int dst_is_lr = ( fcblk->cblktype & CBLK_COMPRESSED ) != 0;
    const int src_is_lr = ( cblk->cblktype  & CBLK_COMPRESSED ) != 0;

    /* Low-rank contribution applied to a low-rank panel */
    if ( dst_is_lr && src_is_lr ) {
        core_zgemmsp_block_lrlr( sideA, sideB, transB,
                                 blok_mk, blok_nk, blok_mn,
                                 cblk, fcblk,
                                 lowrank );
        return;
    }

    /* Full-rank contribution applied to a low-rank panel */
    if ( dst_is_lr ) {
        core_zgemmsp_block_frlr( sideA, transB,
                                 blok_mk, blok_nk, blok_mn,
                                 cblk, fcblk,
                                 A, B, lowrank );
        return;
    }

    /*
     * Low-rank contribution applied to a full-rank panel: no kernel handles
     * this case, so it only triggers the assertion.
     */
    if ( src_is_lr ) {
        assert(0);
        return;
    }

    /* Full-rank contribution applied to a full-rank panel */
    core_zgemmsp_block_frfr( transB,
                             blok_mk, blok_nk, blok_mn,
                             cblk, fcblk,
                             A, B, C );
}
