Routine: PCGBTRS()  File: SRC\pcgbtrs.f

 
 
# lines: 1182
  # code: 1182
  # comment: 0
  # blank:0
# Variables:79
# Callers:1
# Callings:0
# Words:467
# Keywords:279
 

 

..
     .. Array Arguments ..
     ..
  Purpose
  =======
  PCGBTRS solves a system of linear equations
            A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
                                    or
            A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
  where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
  stored in A(1:N,JA:JA+N-1) and AF by PCGBTRF.
  A(1:N, JA:JA+N-1) is an N-by-N complex
  banded distributed
  matrix with bandwidth BWL, BWU.
  Routine PCGBTRF MUST be called first.
  =====================================================================
  Arguments
  =========
  TRANS   (global input) CHARACTER
          = 'N':  Solve with A(1:N, JA:JA+N-1);
          = 'C':  Solve with conjugate_transpose( A(1:N, JA:JA+N-1) );
  N       (global input) INTEGER
          The number of rows and columns to be operated on, i.e. the
          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
  BWL     (global input) INTEGER
          Number of subdiagonals. 0 <= BWL <= N-1
  BWU     (global input) INTEGER
          Number of superdiagonals. 0 <= BWU <= N-1
  NRHS    (global input) INTEGER
          The number of right hand sides, i.e., the number of columns
          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
          NRHS >= 0.
  A       (local input/local output) COMPLEX pointer into
          local memory to an array with first dimension
          LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
          On entry, this array contains the local pieces of the
          N-by-N unsymmetric banded distributed Cholesky factor L or
          L^T A(1:N, JA:JA+N-1).
          This local portion is stored in the packed banded format
            used in LAPACK. Please see the Notes below and the
            ScaLAPACK manual for more detail on the format of
            distributed matrices.
  JA      (global input) INTEGER
          The index in the global array A that points to the start of
          the matrix to be operated on (which may be either all of A
          or a submatrix of A).
  DESCA   (global and local input) INTEGER array of dimension DLEN.
          if 1D type (DTYPE_A=501), DLEN >= 7;
          if 2D type (DTYPE_A=1), DLEN >= 9 .
          The array descriptor for the distributed matrix A.
          Contains information of mapping of A to memory. Please
          see NOTES below for full description and options.
  IPIV    (local output) INTEGER array, dimension >= DESCA( NB ).
          Pivot indices for local factorizations.
          Users *should not* alter the contents between
          factorization and solve.
  B       (local input/local output) COMPLEX pointer into
          local memory to an array of local lead dimension lld_b>=NB.
          On entry, this array contains the
          the local pieces of the right hand sides
          B(IB:IB+N-1, 1:NRHS).
          On exit, this contains the local piece of the solutions
          distributed matrix X.
  IB      (global input) INTEGER
          The row index in the global array B that points to the first
          row of the matrix to be operated on (which may be either
          all of B or a submatrix of B).
  DESCB   (global and local input) INTEGER array of dimension DLEN.
          if 1D type (DTYPE_B=502), DLEN >=7;
          if 2D type (DTYPE_B=1), DLEN >= 9.
          The array descriptor for the distributed matrix B.
          Contains information of mapping of B to memory. Please
          see NOTES below for full description and options.
  AF      (local output) COMPLEX array, dimension LAF.
          Auxiliary Fillin Space.
          Fillin is created during the factorization routine
          PCGBTRF and this is stored in AF. If a linear system
          is to be solved using PCGBTRS after the factorization
          routine, AF *must not be altered* after the factorization.
  LAF     (local input) INTEGER
          Size of user-input Auxiliary Fillin space AF. Must be >=
          (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
          If LAF is not large enough, an error code will be returned
          and the minimum acceptable size will be returned in AF( 1 )
  WORK    (local workspace/local output)
          COMPLEX temporary workspace. This space may
          be overwritten in between calls to routines. WORK must be
          the size given in LWORK.
          On exit, WORK( 1 ) contains the minimal LWORK.
  LWORK   (local input or global input) INTEGER
          Size of user-input workspace WORK.
          If LWORK is too small, the minimal acceptable size will be
          returned in WORK(1) and an error code is returned. LWORK>=
          NRHS*(NB+2*bwl+4*bwu)
  INFO    (global output) INTEGER
          = 0:  successful exit
          < 0:  If the i-th argument is an array and the j-entry had
                an illegal value, then INFO = -(i*100+j), if the i-th
                argument is a scalar and had an illegal value, then
                INFO = -i.
  =====================================================================
  Restrictions
  ============
  The following are restrictions on the input parameters. Some of these
    are temporary and will be removed in future releases, while others
    may reflect fundamental technical limitations.
    Non-cyclic restriction: VERY IMPORTANT!
      P*NB>= mod(JA-1,NB)+N.
      The mapping for matrices must be blocked, reflecting the nature
      of the divide and conquer algorithm as a task-parallel algorithm.
      This formula in words is: no processor may have more than one
      chunk of the matrix.
    Blocksize cannot be too small:
      If the matrix spans more than one processor, the following
      restriction on NB, the size of each block on each processor,
      must hold:
      NB >= (BWL+BWU)+1
      The bulk of parallel computation is done on the matrix of size
      O(NB) on each processor. If this is too small, divide and conquer
      is a poor choice of algorithm.
    Submatrix reference:
      JA = IB
      Alignment restriction that prevents unnecessary communication.
  =====================================================================
  Notes
  =====
  If the factorization routine and the solve routine are to be called
    separately (to solve various sets of righthand sides using the same
    coefficient matrix), the auxiliary space AF *must not be altered*
    between calls to the factorization routine and the solve routine.
  The best algorithm for solving banded and tridiagonal linear systems
    depends on a variety of parameters, especially the bandwidth.
    Currently, only algorithms designed for the case N/P >> bw are
    implemented. These go by many names, including Divide and Conquer,
    Partitioning, domain decomposition-type, etc.
  Algorithm description: Divide and Conquer
    The Divide and Conqer algorithm assumes the matrix is narrowly
      banded compared with the number of equations. In this situation,
      it is best to distribute the input matrix A one-dimensionally,
      with columns atomic and rows divided amongst the processes.
      The basic algorithm divides the banded matrix up into
      P pieces with one stored on each processor,
      and then proceeds in 2 phases for the factorization or 3 for the
      solution of a linear system.
      1) Local Phase:
         The individual pieces are factored independently and in
         parallel. These factors are applied to the matrix creating
         fillin, which is stored in a non-inspectable way in auxiliary
         space AF. Mathematically, this is equivalent to reordering
         the matrix A as P A P^T and then factoring the principal
         leading submatrix of size equal to the sum of the sizes of
         the matrices factored on each processor. The factors of
         these submatrices overwrite the corresponding parts of A
         in memory.
      2) Reduced System Phase:
         A small (max(bwl,bwu)* (P-1)) system is formed representing
         interaction of the larger blocks, and is stored (as are its
         factors) in the space AF. A parallel Block Cyclic Reduction
         algorithm is used. For a linear system, a parallel front solve
         followed by an analagous backsolve, both using the structure
         of the factored matrix, are performed.
      3) Backsubsitution Phase:
         For a linear system, a local backsubstitution is performed on
         each processor in parallel.
  Descriptors
  ===========
  Descriptors now have *types* and differ from ScaLAPACK 1.0.
  Note: banded codes can use either the old two dimensional
    or new one-dimensional descriptors, though the processor grid in
    both cases *must be one-dimensional*. We describe both types below.
  Each global data object is described by an associated description
  vector.  This vector stores the information required to establish
  the mapping between an object element and its corresponding process
  and memory location.
  Let A be a generic term for any 2D block cyclicly distributed array.
  Such a global array has an associated description vector DESCA.
  In the following comments, the character _ should be read as
  "of the global array".
  NOTATION        STORED IN      EXPLANATION
  --------------- -------------- --------------------------------------
  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
                                 DTYPE_A = 1.
  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
                                 the BLACS process grid A is distribu-
                                 ted over. The context itself is glo-
                                 bal, but the handle (the integer
                                 value) may vary.
  M_A    (global) DESCA( M_ )    The number of rows in the global
                                 array A.
  N_A    (global) DESCA( N_ )    The number of columns in the global
                                 array A.
  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
                                 the rows of the array.
  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
                                 the columns of the array.
  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
                                 row of the array A is distributed.
  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
                                 first column of the array A is
                                 distributed.
  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
  Let K be the number of rows or columns of a distributed matrix,
  and assume that its process grid has dimension p x q.
  LOCr( K ) denotes the number of elements of K that a process
  would receive if K were distributed over the p processes of its
  process column.
  Similarly, LOCc( K ) denotes the number of elements of K that a
  process would receive if K were distributed over the q processes of
  its process row.
  The values of LOCr() and LOCc() may be determined via a call to the
  ScaLAPACK tool function, NUMROC:
          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
  An upper bound for these quantities may be computed by:
          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
  One-dimensional descriptors:
  One-dimensional descriptors are a new addition to ScaLAPACK since
    version 1.0. They simplify and shorten the descriptor for 1D
    arrays.
  Since ScaLAPACK supports two-dimensional arrays as the fundamental
    object, we allow 1D arrays to be distributed either over the
    first dimension of the array (as if the grid were P-by-1) or the
    2nd dimension (as if the grid were 1-by-P). This choice is
    indicated by the descriptor type (501 or 502)
    as described below.
    IMPORTANT NOTE: the actual BLACS grid represented by the
    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
    irrespective of which one-dimensional descriptor type
    (501 or 502) is input.
    This routine will interpret the grid properly either way.
    ScaLAPACK routines *do not support intercontext operations* so that
    the grid passed to a single ScaLAPACK routine *must be the same*
    for all array descriptors passed to that routine.
    NOTE: In all cases where 1D descriptors are used, 2D descriptors
    may also be used, since a one-dimensional array is a special case
    of a two-dimensional array with one dimension of size unity.
    The two-dimensional array used in this case *must* be of the
    proper orientation:
      If the appropriate one-dimensional descriptor is DTYPEA=501
      (1 by P type), then the two dimensional descriptor must
      have a CTXT value that refers to a 1 by P BLACS grid;
      If the appropriate one-dimensional descriptor is DTYPEA=502
      (P by 1 type), then the two dimensional descriptor must
      have a CTXT value that refers to a P by 1 BLACS grid.
  Summary of allowed descriptors, types, and BLACS grids:
  DTYPE           501         502         1         1
  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
  -----------------------------------------------------
  A               OK          NO          OK        NO
  B               NO          OK          NO        OK
  Note that a consequence of this chart is that it is not possible
    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
    to opposite requirements for the orientation of the BLACS grid,
    and as noted before, the *same* BLACS context must be used in
    all descriptors in a single ScaLAPACK subroutine call.
  Let A be a generic term for any 1D block cyclicly distributed array.
  Such a global array has an associated description vector DESCA.
  In the following comments, the character _ should be read as
  "of the global array".
  NOTATION        STORED IN  EXPLANATION
  --------------- ---------- ------------------------------------------
  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
                                TYPE_A = 501: 1-by-P grid.
                                TYPE_A = 502: P-by-1 grid.
  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
                                the BLACS process grid A is distribu-
                                ted over. The context itself is glo-
                                bal, but the handle (the integer
                                value) may vary.
  N_A    (global) DESCA( 3 ) The size of the array dimension being
                                distributed.
  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
                                the distributed dimension of the array.
  SRC_A  (global) DESCA( 5 ) The process row or column over which the
                                first row or column of the array
                                is distributed.
  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
                                storing the local blocks of the distri-
                                buted array A. Minimum value of LLD_A
                                depends on TYPE_A.
                                TYPE_A = 501: LLD_A >=
                                   size of undistributed dimension, 1.
                                TYPE_A = 502: LLD_A >=NB_A, 1.
  Reserved        DESCA( 7 ) Reserved for future use.
  =====================================================================
  Implemented for ScaLAPACK by:
     Andrew J. Cleary, Livermore National Lab and University of Tenn.,
     and Marbwus Hegland, Australian Natonal University. Feb., 1997.
  Based on code written by    : Peter Arbenz, ETH Zurich, 1996.
  =====================================================================
     .. Parameters ..

 
Display dynamic version Find AutoScroll Reload FontSize: - + Hide Comments Hide Blanks Frame FullScreen MailPrint

 
001        SUBROUTINE PCGBTRS( TRANS , N , BWL , BWU , NRHS , A , JA , DESCA , IPIV ,
002       $B , IB , DESCB , AF , LAF , WORK , LWORK , INFO )
003  
004  *     -- ScaLAPACK routine(version 1.7) --
005  *     University of Tennessee , Knoxville , Oak Ridge National Laboratory ,
006  *     and University of California , Berkeley.
007  *     August 7 , 2001
008  
009  *     .. Scalar Arguments ..
010        CHARACTER TRANS
011        INTEGER BWU , BWL , IB , INFO , JA , LAF , LWORK , N , NRHS
012        REAL ONE , ZERO
013        PARAMETER( ONE = 1.0E + 0 )
014        PARAMETER( ZERO = 0.0E + 0 )
015        COMPLEX CONE , CZERO
016        PARAMETER( CONE =( 1.0E + 0 , 0.0E + 0 ) )
017        PARAMETER( CZERO =( 0.0E + 0 , 0.0E + 0 ) )
018        INTEGER INT_ONE
019        PARAMETER( INT_ONE = 1 )
020        INTEGER DESCMULT , BIGNUM
021        PARAMETER( DESCMULT = 100 , BIGNUM = DESCMULT*DESCMULT )
022        INTEGER BLOCK_CYCLIC_2D , CSRC_ , CTXT_ , DLEN_ , DTYPE_ ,
023       $LLD_ , MB_ , M_ , NB_ , N_ , RSRC_
024        PARAMETER( BLOCK_CYCLIC_2D = 1 , DLEN_ = 9 , DTYPE_ = 1 ,
025       $CTXT_ = 2 , M_ = 3 , N_ = 4 , MB_ = 5 , NB_ = 6 ,
026       $RSRC_ = 7 , CSRC_ = 8 , LLD_ = 9 )
027  *     ..
028  *     .. Local Scalars ..
029        INTEGER APTR , BBPTR , BM , BMN , BN , BNN , BW , CSRC ,
030       $FIRST_PROC , ICTXT , ICTXT_NEW , ICTXT_SAVE ,
031       $IDUM2 , IDUM3 , J , JA_NEW , L , LBWL , LBWU , LDBB ,
032       $LDW , LLDA , LLDB , LM , LMJ , LN , LPTR , MYCOL ,
033       $MYROW , NB , NEICOL , NP , NPACT , NPCOL , NPROW ,
034       $NPSTR , NP_SAVE , ODD_SIZE , PART_OFFSET ,
035       $RECOVERY_VAL , RETURN_CODE , STORE_M_B ,
036       $STORE_N_A , WORK_SIZE_MIN , WPTR
037  *     ..
038  *     .. Local Arrays ..
039        INTEGER DESCA_1XP( 7 ) , DESCB_PX1( 7 ) ,
040       $PARAM_CHECK( 17 , 3 )
041  *     ..
042  *     .. External Subroutines ..
043        EXTERNAL BLACS_GRIDINFO , DESC_CONVERT , GLOBCHK , PXERBLA ,
044       $RESHAPE
045  *     ..
046  *     .. External Functions ..
047        LOGICAL LSAME
048        INTEGER NUMROC
049        EXTERNAL LSAME
050        EXTERNAL NUMROC
051  *     ..
052  *     .. Intrinsic Functions ..
053        INTRINSIC ICHAR , MOD
054  *     ..
055  *     .. Executable Statements ..
056  
057  *     Test the input parameters
058  
059        INFO = 0
060  
061  *     Convert descriptor into standard form for easy access to
062  *     parameters , check that grid is of right shape.
063  
064        DESCA_1XP( 1 ) = 501
065        DESCB_PX1( 1 ) = 502
066  
067        CALL DESC_CONVERT( DESCA , DESCA_1XP , RETURN_CODE )
068  
069        IF( RETURN_CODE .NE. 0) THEN
070            INFO = - ( 8*100 + 2 )
071        ENDIF
072  
073        CALL DESC_CONVERT( DESCB , DESCB_PX1 , RETURN_CODE )
074  
075        IF( RETURN_CODE .NE. 0) THEN
076            INFO = - ( 11*100 + 2 )
077        ENDIF
078  
079  *     Consistency checks for DESCA and DESCB.
080  
081  *     Context must be the same
082        IF( DESCA_1XP( 2 ) .NE. DESCB_PX1( 2 ) ) THEN
083            INFO = - ( 11*100 + 2 )
084        ENDIF
085  
086  *     These are alignment restrictions that may or may not be removed
087  *     in future releases. - Andy Cleary , April 14 , 1996.
088  
089  *     Block sizes must be the same
090        IF( DESCA_1XP( 4 ) .NE. DESCB_PX1( 4 ) ) THEN
091            INFO = - ( 11*100 + 4 )
092        ENDIF
093  
094  *     Source processor must be the same
095  
096        IF( DESCA_1XP( 5 ) .NE. DESCB_PX1( 5 ) ) THEN
097            INFO = - ( 11*100 + 5 )
098        ENDIF
099  
100  *     Get values out of descriptor for use in code.
101  
102        ICTXT = DESCA_1XP( 2 )
103        CSRC = DESCA_1XP( 5 )
104        NB = DESCA_1XP( 4 )
105        LLDA = DESCA_1XP( 6 )
106        STORE_N_A = DESCA_1XP( 3 )
107        LLDB = DESCB_PX1( 6 )
108        STORE_M_B = DESCB_PX1( 3 )
109  
110  *     Get grid parameters
111  
112        CALL BLACS_GRIDINFO( ICTXT , NPROW , NPCOL , MYROW , MYCOL )
113        NP = NPROW * NPCOL
114  
115        IF( LSAME( TRANS , 'N' ) ) THEN
116            IDUM2 = ICHAR( 'N' )
117        ELSE IF( LSAME( TRANS , 'C' ) ) THEN
118            IDUM2 = ICHAR( 'C' )
119        ELSE
120            INFO = - 1
121        END IF
122  
123        IF( LWORK .LT. - 1) THEN
124            INFO = - 16
125        ELSE IF( LWORK .EQ. - 1 ) THEN
126            IDUM3 = - 1
127        ELSE
128            IDUM3 = 1
129        ENDIF
130  
131        IF( N .LT. 0 ) THEN
132            INFO = - 2
133        ENDIF
134  
135        IF( N + JA - 1 .GT. STORE_N_A ) THEN
136            INFO = - ( 8*100 + 6 )
137        ENDIF
138  
139        IF(( BWL .GT. N - 1 ) .OR.
140       $( BWL .LT. 0 ) ) THEN
141        INFO = - 3
142        ENDIF
143  
144        IF(( BWU .GT. N - 1 ) .OR.
145       $( BWU .LT. 0 ) ) THEN
146        INFO = - 4
147        ENDIF
148  
149        IF( LLDA .LT.(2*BWL + 2*BWU + 1) ) THEN
150            INFO = - ( 8*100 + 6 )
151        ENDIF
152  
153        IF( NB .LE. 0 ) THEN
154            INFO = - ( 8*100 + 4 )
155        ENDIF
156  
157        BW = BWU + BWL
158  
159        IF( N + IB - 1 .GT. STORE_M_B ) THEN
160            INFO = - ( 11*100 + 3 )
161        ENDIF
162  
163        IF( LLDB .LT. NB ) THEN
164            INFO = - ( 11*100 + 6 )
165        ENDIF
166  
167        IF( NRHS .LT. 0 ) THEN
168            INFO = - 5
169        ENDIF
170  
171  *     Current alignment restriction
172  
173        IF( JA .NE. IB) THEN
174            INFO = - 7
175        ENDIF
176  
177  *     Argument checking that is specific to Divide & Conquer routine
178  
179        IF( NPROW .NE. 1 ) THEN
180            INFO = - ( 8*100 + 2 )
181        ENDIF
182  
183        IF( N .GT. NP*NB - MOD( JA - 1 , NB )) THEN
184            INFO = - ( 2 )
185            CALL PXERBLA( ICTXT ,
186       $    'PCGBTRS , D&C alg. : only 1 block per proc' ,
187       $    - INFO )
188            RETURN
189        ENDIF
190  
191        IF((JA + N - 1.GT.NB) .AND.( NB.LT.(BWL + BWU + 1) )) THEN
192            INFO = - ( 8*100 + 4 )
193            CALL PXERBLA( ICTXT ,
194       $    'PCGBTRS , D&C alg. : NB too small' ,
195       $    - INFO )
196            RETURN
197        ENDIF
198  
199  *     Check worksize
200  
201        WORK_SIZE_MIN = NRHS*(NB + 2*BWL + 4*BWU)
202  
203        WORK( 1 ) = WORK_SIZE_MIN
204  
205        IF( LWORK .LT. WORK_SIZE_MIN ) THEN
206            IF( LWORK .NE. - 1 ) THEN
207                INFO = - 16
208                CALL PXERBLA( ICTXT ,
209       $        'PCGBTRS : worksize error ' ,
210       $        - INFO )
211            ENDIF
212            RETURN
213        ENDIF
214  
215  *     Pack params and positions into arrays for global consistency check
216  
217        PARAM_CHECK( 17 , 1 ) = DESCB(5)
218        PARAM_CHECK( 16 , 1 ) = DESCB(4)
219        PARAM_CHECK( 15 , 1 ) = DESCB(3)
220        PARAM_CHECK( 14 , 1 ) = DESCB(2)
221        PARAM_CHECK( 13 , 1 ) = DESCB(1)
222        PARAM_CHECK( 12 , 1 ) = IB
223        PARAM_CHECK( 11 , 1 ) = DESCA(5)
224        PARAM_CHECK( 10 , 1 ) = DESCA(4)
225        PARAM_CHECK( 9 , 1 ) = DESCA(3)
226        PARAM_CHECK( 8 , 1 ) = DESCA(1)
227        PARAM_CHECK( 7 , 1 ) = JA
228        PARAM_CHECK( 6 , 1 ) = NRHS
229        PARAM_CHECK( 5 , 1 ) = BWU
230        PARAM_CHECK( 4 , 1 ) = BWL
231        PARAM_CHECK( 3 , 1 ) = N
232        PARAM_CHECK( 2 , 1 ) = IDUM3
233        PARAM_CHECK( 1 , 1 ) = IDUM2
234  
235        PARAM_CHECK( 17 , 2 ) = 1105
236        PARAM_CHECK( 16 , 2 ) = 1104
237        PARAM_CHECK( 15 , 2 ) = 1103
238        PARAM_CHECK( 14 , 2 ) = 1102
239        PARAM_CHECK( 13 , 2 ) = 1101
240        PARAM_CHECK( 12 , 2 ) = 10
241        PARAM_CHECK( 11 , 2 ) = 805
242        PARAM_CHECK( 10 , 2 ) = 804
243        PARAM_CHECK( 9 , 2 ) = 803
244        PARAM_CHECK( 8 , 2 ) = 801
245        PARAM_CHECK( 7 , 2 ) = 7
246        PARAM_CHECK( 6 , 2 ) = 5
247        PARAM_CHECK( 5 , 2 ) = 4
248        PARAM_CHECK( 4 , 2 ) = 3
249        PARAM_CHECK( 3 , 2 ) = 2
250        PARAM_CHECK( 2 , 2 ) = 16
251        PARAM_CHECK( 1 , 2 ) = 1
252  
253  *     Want to find errors with MIN( ) , so if no error , set it to a big
254  *     number. If there already is an error , multiply by the the
255  *     descriptor multiplier.
256  
257        IF( INFO.GE.0 ) THEN
258            INFO = BIGNUM
259        ELSE IF( INFO.LT. - DESCMULT ) THEN
260            INFO = - INFO
261        ELSE
262            INFO = - INFO * DESCMULT
263        END IF
264  
265  *     Check consistency across processors
266  
267        CALL GLOBCHK( ICTXT , 17 , PARAM_CHECK , 17 ,
268       $PARAM_CHECK( 1 , 3 ) , INFO )
269  
270  *     Prepare output : set info = 0 if no error , and divide by DESCMULT
271  *     if error is not in a descriptor entry.
272  
273        IF( INFO.EQ.BIGNUM ) THEN
274            INFO = 0
275        ELSE IF( MOD( INFO , DESCMULT ) .EQ. 0 ) THEN
276            INFO = - INFO / DESCMULT
277        ELSE
278            INFO = - INFO
279        END IF
280  
281        IF( INFO.LT.0 ) THEN
282            CALL PXERBLA( ICTXT , 'PCGBTRS' , - INFO )
283            RETURN
284        END IF
285  
286  *     Quick return if possible
287  
288        IF( N.EQ.0 )
289       $    RETURN
290  
291            IF( NRHS.EQ.0 )
292       $        RETURN
293  
294  *             Adjust addressing into matrix space to properly get into
295  *             the beginning part of the relevant data
296  
297                PART_OFFSET = NB*((JA - 1) / (NPCOL*NB) )
298  
299                IF((MYCOL - CSRC) .LT.(JA - PART_OFFSET - 1) / NB ) THEN
300                PART_OFFSET = PART_OFFSET + NB
301            ENDIF
302  
303            IF( MYCOL .LT. CSRC ) THEN
304                PART_OFFSET = PART_OFFSET - NB
305            ENDIF
306  
307  *         Form a new BLACS grid(the "standard form" grid) with only procs
308  *         holding part of the matrix , of size 1xNP where NP is adjusted ,
309  *         starting at csrc = 0 , with JA modified to reflect dropped procs.
310  
311  *         First processor to hold part of the matrix :
312  
313            FIRST_PROC = MOD(( JA - 1 ) / NB + CSRC , NPCOL )
314  
315  *         Calculate new JA one while dropping off unused processors.
316  
317            JA_NEW = MOD( JA - 1 , NB ) + 1
318  
319  *         Save and compute new value of NP
320  
321            NP_SAVE = NP
322            NP =( JA_NEW + N - 2 ) / NB + 1
323  
324  *         Call utility routine that forms "standard-form" grid
325  
326            CALL RESHAPE( ICTXT , INT_ONE , ICTXT_NEW , INT_ONE ,
327       $    FIRST_PROC , INT_ONE , NP )
328  
329  *         Use new context from standard grid as context.
330  
331            ICTXT_SAVE = ICTXT
332            ICTXT = ICTXT_NEW
333            DESCA_1XP( 2 ) = ICTXT_NEW
334            DESCB_PX1( 2 ) = ICTXT_NEW
335  
336  *         Get information about new grid.
337  
338            CALL BLACS_GRIDINFO( ICTXT , NPROW , NPCOL , MYROW , MYCOL )
339  
340  *         Drop out processors that do not have part of the matrix.
341  
342            IF( MYROW .LT. 0 ) THEN
343                GOTO 1234
344            ENDIF
345  
346  *         Begin main code
347  
348  *         Move data into workspace - communicate / copy(overlap)
349  
350            IF(MYCOL .LT. NPCOL - 1) THEN
351                CALL CGESD2D( ICTXT , BWU , NRHS , B(NB - BWU + 1) , LLDB ,
352       $        0 , MYCOL + 1)
353            ENDIF
354  
355            IF(MYCOL .LT. NPCOL - 1) THEN
356                LM = NB - BWU
357            ELSE
358                LM = NB
359            ENDIF
360  
361            IF(MYCOL .GT. 0) THEN
362                WPTR = BWU + 1
363            ELSE
364                WPTR = 1
365            ENDIF
366  
367            LDW = NB + BWU + 2*BW + BWU
368  
369            CALL CLACPY( 'G' , LM , NRHS , B(1) , LLDB , WORK( WPTR ) , LDW )
370  
371  *         Zero out rest of work
372  
373            DO 1501 J = 1 , NRHS
374                DO 1502 L = WPTR + LM , LDW
375                    WORK((J - 1)*LDW + L ) = CZERO
376   1502         CONTINUE
377   1501     CONTINUE
378  
379            IF(MYCOL .GT. 0) THEN
380                CALL CGERV2D( ICTXT , BWU , NRHS , WORK(1) , LDW ,
381       $        0 , MYCOL - 1)
382            ENDIF
383  
384  *         *******************************************************************
385  *         PHASE 1 : Local computation phase -- Solve L*X = B
386  *         *******************************************************************
387  
388  *         Size of main(or odd) partition in each processor
389  
390            ODD_SIZE = NUMROC( N , NB , MYCOL , 0 , NPCOL )
391  
392            IF(MYCOL .NE. 0) THEN
393                LBWL = BW
394                LBWU = 0
395                APTR = 1
396            ELSE
397                LBWL = BWL
398                LBWU = BWU
399                APTR = 1 + BWU
400            ENDIF
401  
402            IF(MYCOL .NE. NPCOL - 1) THEN
403                LM = NB - LBWU
404                LN = NB - BW
405            ELSE IF(MYCOL .NE. 0) THEN
406                LM = ODD_SIZE + BWU
407                LN = MAX(ODD_SIZE - BW , 0)
408            ELSE
409                LM = N
410                LN = MAX( N - BW , 0 )
411            ENDIF
412  
413            DO 21 J = 1 , LN
414  
415                LMJ = MIN(LBWL , LM - J)
416                L = IPIV( J )
417  
418                IF( L.NE.J ) THEN
419                    CALL CSWAP(NRHS , WORK(L) , LDW , WORK(J) , LDW)
420                ENDIF
421  
422                LPTR = BW + 1 + (J - 1)*LLDA + APTR
423  
424                CALL CGERU(LMJ , NRHS ,- CONE , A(LPTR) , 1 , WORK(J) , LDW ,
425       $        WORK(J + 1) , LDW)
426  
427     21     CONTINUE
428  
429  *         *******************************************************************
430  *         PHASE 2 : Global computation phase -- Solve L*X = B
431  *         *******************************************************************
432  
433  *         Define the initial dimensions of the diagonal blocks
434  *         The offdiagonal blocks(for MYCOL > 0) are of size BM by BW
435  
436            IF(MYCOL .NE. NPCOL - 1) THEN
437                BM = BW - LBWU
438                BN = BW
439            ELSE
440                BM = MIN(BW , ODD_SIZE) + BWU
441                BN = MIN(BW , ODD_SIZE)
442            ENDIF
443  
444  *         Pointer to first element of block bidiagonal matrix in AF
445  *         Leading dimension of block bidiagonal system
446  
447            BBPTR =(NB + BWU)*BW + 1
448            LDBB = 2*BW + BWU
449  
450            IF(NPCOL.EQ.1) THEN
451  
452  *             In this case the loop over the levels will not be
453  *             performed.
454                CALL CGETRS( 'N' , N - LN , NRHS , AF(BBPTR + BW*LDBB) , LDBB ,
455       $        IPIV(LN + 1) , WORK( LN + 1 ) , LDW , INFO)
456  
457            ENDIF
458  
459  *         Loop over levels ...
460  
461  *         The two integers NPACT(nu. of active processors) and NPSTR
462  *         (stride between active processors) is used to control the
463  *         loop.
464  
465            NPACT = NPCOL
466            NPSTR = 1
467  
468  *         Begin loop over levels
469    200     IF(NPACT .LE. 1) GOTO 300
470  
471  *         Test if processor is active
472            IF(MOD(MYCOL , NPSTR) .EQ. 0) THEN
473  
474  *             Send / Receive blocks
475  
476                IF(MOD(MYCOL , 2*NPSTR) .EQ. 0) THEN
477  
478                    NEICOL = MYCOL + NPSTR
479  
480                    IF(NEICOL / NPSTR .LE. NPACT - 1) THEN
481  
482                        IF(NEICOL / NPSTR .LT. NPACT - 1) THEN
483                            BMN = BW
484                        ELSE
485                            BMN = MIN(BW , NUMROC(N , NB , NEICOL , 0 , NPCOL)) + BWU
486                        ENDIF
487  
488                        CALL CGESD2D( ICTXT , BM , NRHS ,
489       $                WORK(LN + 1) , LDW , 0 , NEICOL )
490  
491                        IF( NPACT .NE. 2 )THEN
492  
493  *                         Receive answers back from partner processor
494  
495                            CALL CGERV2D(ICTXT , BM + BMN - BW , NRHS ,
496       $                    WORK( LN + 1 ) , LDW , 0 , NEICOL )
497  
498                            BM = BM + BMN - BW
499  
500                        ENDIF
501  
502                    ENDIF
503  
504                ELSE
505  
506                    NEICOL = MYCOL - NPSTR
507  
508                    IF(NEICOL .EQ. 0) THEN
509                        BMN = BW - BWU
510                    ELSE
511                        BMN = BW
512                    ENDIF
513  
514                    CALL CLACPY( 'G' , BM , NRHS , WORK(LN + 1) , LDW ,
515       $            WORK(NB + BWU + BMN + 1) , LDW )
516  
517                    CALL CGERV2D( ICTXT , BMN , NRHS , WORK( NB + BWU + 1 ) ,
518       $            LDW , 0 , NEICOL )
519  
520  *                 and do the permutations and eliminations
521  
522                    IF(NPACT .NE. 2) THEN
523  
524  *                     Solve locally for BW variables
525  
526                        CALL CLASWP( NRHS , WORK(NB + BWU + 1) , LDW , 1 , BW ,
527       $                IPIV(LN + 1) , 1)
528  
529                        CALL CTRSM('L' , 'L' , 'N' , 'U' , BW , NRHS , CONE ,
530       $                AF(BBPTR + BW*LDBB) , LDBB , WORK(NB + BWU + 1) , LDW)
531  
532  *                     Use soln just calculated to update RHS
533  
534                        CALL CGEMM( 'N' , 'N' , BM + BMN - BW , NRHS , BW ,
535       $                - CONE , AF(BBPTR + BW*LDBB + BW) , LDBB ,
536       $                WORK(NB + BWU + 1) , LDW ,
537       $                CONE , WORK(NB + BWU + 1 + BW) , LDW )
538  
539  *                     Give answers back to partner processor
540  
541                        CALL CGESD2D( ICTXT , BM + BMN - BW , NRHS ,
542       $                WORK(NB + BWU + 1 + BW) , LDW , 0 , NEICOL )
543  
544                    ELSE
545  
546  *                     Finish up calculations for final level
547  
548                        CALL CLASWP( NRHS , WORK(NB + BWU + 1) , LDW , 1 , BM + BMN ,
549       $                IPIV(LN + 1) , 1)
550  
551                        CALL CTRSM('L' , 'L' , 'N' , 'U' , BM + BMN , NRHS , CONE ,
552       $                AF(BBPTR + BW*LDBB) , LDBB , WORK(NB + BWU + 1) , LDW)
553                    ENDIF
554  
555                ENDIF
556  
557                NPACT =(NPACT + 1) / 2
558                NPSTR = NPSTR * 2
559                GOTO 200
560  
561            ENDIF
562  
563    300 CONTINUE
564  
565  *     *************************************
566  *     BACKSOLVE
567  *     *******************************************************************
568  *     PHASE 2 : Global computation phase -- Solve U*Y = X
569  *     *******************************************************************
570  
571        IF(NPCOL.EQ.1) THEN
572  
573  *         In this case the loop over the levels will not be
574  *         performed.
575  *         In fact , the backsolve portion was done in the call to
576  *         CGETRS in the frontsolve.
577  
578        ENDIF
579  
580  *     Compute variable needed to reverse loop structure in
581  *     reduced system.
582  
583        RECOVERY_VAL = NPACT*NPSTR - NPCOL
584  
585  *     Loop over levels
586  *     Terminal values of NPACT and NPSTR from frontsolve are used
587  
588   2200 IF( NPACT .GE. NPCOL ) GOTO 2300
589  
590        NPSTR = NPSTR / 2
591  
592        NPACT = NPACT*2
593  
594  *     Have to adjust npact for non - power - of - 2
595  
596        NPACT = NPACT - MOD((RECOVERY_VAL / NPSTR) , 2 )
597  
598  *     Find size of submatrix in this proc at this level
599  
600        IF( MYCOL / NPSTR .LT. NPACT - 1 ) THEN
601            BN = BW
602        ELSE
603            BN = MIN(BW , NUMROC(N , NB , NPCOL - 1 , 0 , NPCOL) )
604        ENDIF
605  
606  *     If this processor is even in this level...
607  
608        IF( MOD( MYCOL , 2*NPSTR ) .EQ. 0 ) THEN
609  
610            NEICOL = MYCOL + NPSTR
611  
612            IF( NEICOL / NPSTR .LE. NPACT - 1 ) THEN
613  
614                IF( NEICOL / NPSTR .LT. NPACT - 1 ) THEN
615                    BMN = BW
616                    BNN = BW
617                ELSE
618                    BMN = MIN(BW , NUMROC(N , NB , NEICOL , 0 , NPCOL)) + BWU
619                    BNN = MIN(BW , NUMROC(N , NB , NEICOL , 0 , NPCOL) )
620                ENDIF
621  
622                IF( NPACT .GT. 2 ) THEN
623  
624                    CALL CGESD2D( ICTXT , 2*BW , NRHS , WORK( LN + 1 ) ,
625       $            LDW , 0 , NEICOL )
626  
627                    CALL CGERV2D( ICTXT , BW , NRHS , WORK( LN + 1 ) ,
628       $            LDW , 0 , NEICOL )
629  
630                ELSE
631  
632                    CALL CGERV2D( ICTXT , BW , NRHS , WORK( LN + 1 ) ,
633       $            LDW , 0 , NEICOL )
634  
635                ENDIF
636  
637            ENDIF
638  
639        ELSE
640  *         This processor is odd on this level
641  
642            NEICOL = MYCOL - NPSTR
643  
644            IF(NEICOL .EQ. 0) THEN
645                BMN = BW - BWU
646            ELSE
647                BMN = BW
648            ENDIF
649  
650            IF( NEICOL .LT. NPCOL - 1 ) THEN
651                BNN = BW
652            ELSE
653                BNN = MIN(BW , NUMROC(N , NB , NEICOL , 0 , NPCOL) )
654            ENDIF
655  
656            IF( NPACT .GT. 2 ) THEN
657  
658  *             Move RHS to make room for received solutions
659  
660                CALL CLACPY( 'G' , BW , NRHS , WORK(NB + BWU + 1) ,
661       $        LDW , WORK(NB + BWU + BW + 1) , LDW )
662  
663                CALL CGERV2D( ICTXT , 2*BW , NRHS , WORK( LN + 1 ) ,
664       $        LDW , 0 , NEICOL )
665  
666                CALL CGEMM( 'N' , 'N' , BW , NRHS , BN ,
667       $        - CONE , AF(BBPTR) , LDBB ,
668       $        WORK(LN + 1) , LDW ,
669       $        CONE , WORK(NB + BWU + BW + 1) , LDW )
670  
671                IF( MYCOL .GT. NPSTR ) THEN
672  
673                    CALL CGEMM( 'N' , 'N' , BW , NRHS , BW ,
674       $            - CONE , AF(BBPTR + 2*BW*LDBB) , LDBB ,
675       $            WORK(LN + BW + 1) , LDW ,
676       $            CONE , WORK(NB + BWU + BW + 1) , LDW )
677  
678                ENDIF
679  
680                CALL CTRSM('L' , 'U' , 'N' , 'N' , BW , NRHS , CONE ,
681       $        AF(BBPTR + BW*LDBB) , LDBB , WORK(NB + BWU + BW + 1) , LDW)
682  
683  *             Send new solution to neighbor
684  
685                CALL CGESD2D( ICTXT , BW , NRHS ,
686       $        WORK( NB + BWU + BW + 1 ) , LDW , 0 , NEICOL )
687  
688  *             Copy new solution into expected place
689  
690                CALL CLACPY( 'G' , BW , NRHS , WORK(NB + BWU + 1 + BW) ,
691       $        LDW , WORK(LN + BW + 1) , LDW )
692  
693            ELSE
694  
695  *             Solve with local diagonal block
696  
697                CALL CTRSM( 'L' , 'U' , 'N' , 'N' , BN + BNN , NRHS , CONE ,
698       $        AF(BBPTR + BW*LDBB) , LDBB , WORK(NB + BWU + 1) , LDW)
699  
700  *             Send new solution to neighbor
701  
702                CALL CGESD2D( ICTXT , BW , NRHS ,
703       $        WORK(NB + BWU + 1) , LDW , 0 , NEICOL )
704  
705  *             Shift solutions into expected positions
706  
707                CALL CLACPY( 'G' , BNN + BN - BW , NRHS , WORK(NB + BWU + 1 + BW) ,
708       $        LDW , WORK(LN + 1) , LDW )
709  
710                IF((NB + BWU + 1) .NE.(LN + 1 + BW) ) THEN
711  
712  *                 Copy one row at a time since spaces may overlap
713  
714                    DO 1064 J = 1 , BW
715                        CALL CCOPY( NRHS , WORK(NB + BWU + J) , LDW ,
716       $                WORK(LN + BW + J) , LDW )
717   1064             CONTINUE
718  
719                ENDIF
720  
721            ENDIF
722  
723        ENDIF
724  
725        GOTO 2200
726  
727   2300 CONTINUE
728  *     End of loop over levels
729  
730  *     *******************************************************************
731  *     PHASE 1 :(Almost) Local computation phase -- Solve U*Y = X
732  *     *******************************************************************
733  
734  *     Reset BM to value it had before reduced system frontsolve...
735  
736        IF(MYCOL .NE. NPCOL - 1) THEN
737            BM = BW - LBWU
738        ELSE
739            BM = MIN(BW , ODD_SIZE) + BWU
740        ENDIF
741  
742  *     First metastep is to account for the fillin blocks AF
743  
744        IF( MYCOL .LT. NPCOL - 1 ) THEN
745  
746            CALL CGESD2D( ICTXT , BW , NRHS , WORK( NB - BW + 1 ) ,
747       $    LDW , 0 , MYCOL + 1 )
748  
749        ENDIF
750  
751        IF( MYCOL .GT. 0 ) THEN
752  
753            CALL CGERV2D( ICTXT , BW , NRHS , WORK( NB + BWU + 1 ) ,
754       $    LDW , 0 , MYCOL - 1 )
755  
756  *         Modify local right hand sides with received rhs's
757  
758            CALL CGEMM( 'N' , 'N' , LM - BM , NRHS , BW , - CONE ,
759       $    AF( 1 ) , LM , WORK( NB + BWU + 1 ) , LDW , CONE ,
760       $    WORK( 1 ) , LDW )
761  
762        ENDIF
763  
764        DO 2021 J = LN , 1 , - 1
765  
766            LMJ = MIN( BW , ODD_SIZE - 1 )
767  
768            LPTR = BW - 1 + J*LLDA + APTR
769  
770  *         In the following , the TRANS = T option is used to reverse
771  *         the order of multiplication , not as a true transpose
772  
773            CALL CGEMV( 'T' , LMJ , NRHS , - CONE , WORK( J + 1) , LDW ,
774       $    A( LPTR ) , LLDA - 1 , CONE , WORK( J ) , LDW )
775  
776  *         Divide by diagonal element
777  
778            CALL CSCAL( NRHS , CONE / A( LPTR - LLDA + 1 ) ,
779       $    WORK( J ) , LDW )
780   2021 CONTINUE
781  
782        CALL CLACPY( 'G' , ODD_SIZE , NRHS , WORK( 1 ) , LDW ,
783       $B( 1 ) , LLDB )
784  
785  *     Free BLACS space used to hold standard - form grid.
786  
787        ICTXT = ICTXT_SAVE
788        IF( ICTXT .NE. ICTXT_NEW ) THEN
789            CALL BLACS_GRIDEXIT( ICTXT_NEW )
790        ENDIF
791  
792   1234 CONTINUE
793  
794  *     Restore saved input parameters
795  
796        NP = NP_SAVE
797  
798  *     Output worksize
799  
800        WORK( 1 ) = WORK_SIZE_MIN
801  
802        RETURN
803  
804  *     End of PCGBTRS
805  
806        END