Routine: PSGBTRS()  File: SRC\psgbtrs.f

 
 
# lines: 1173
  # code: 1173
  # comment: 0
  # blank:0
# Variables:77
# Callers:1
# Callings:0
# Words:545
# Keywords:342
 

 

..
     .. Array Arguments ..
     ..
  Purpose
  =======
  PSGBTRS solves a system of linear equations
            A(1:N, JA:JA+N-1) * X = B(IB:IB+N-1, 1:NRHS)
                                    or
            A(1:N, JA:JA+N-1)' * X = B(IB:IB+N-1, 1:NRHS)
  where A(1:N, JA:JA+N-1) is the matrix used to produce the factors
  stored in A(1:N,JA:JA+N-1) and AF by PSGBTRF.
  A(1:N, JA:JA+N-1) is an N-by-N real
  banded distributed
  matrix with bandwidth BWL, BWU.
  Routine PSGBTRF MUST be called first.
  =====================================================================
  Arguments
  =========
  TRANS   (global input) CHARACTER
          = 'N':  Solve with A(1:N, JA:JA+N-1);
          = 'T' or 'C':  Solve with A(1:N, JA:JA+N-1)^T;
  N       (global input) INTEGER
          The number of rows and columns to be operated on, i.e. the
          order of the distributed submatrix A(1:N, JA:JA+N-1). N >= 0.
  BWL     (global input) INTEGER
          Number of subdiagonals. 0 <= BWL <= N-1
  BWU     (global input) INTEGER
          Number of superdiagonals. 0 <= BWU <= N-1
  NRHS    (global input) INTEGER
          The number of right hand sides, i.e., the number of columns
          of the distributed submatrix B(IB:IB+N-1, 1:NRHS).
          NRHS >= 0.
  A       (local input/local output) REAL pointer into
          local memory to an array with first dimension
          LLD_A >=(2*bwl+2*bwu+1) (stored in DESCA).
          On entry, this array contains the local pieces of the
          N-by-N unsymmetric banded distributed Cholesky factor L or
          L^T A(1:N, JA:JA+N-1).
          This local portion is stored in the packed banded format
            used in LAPACK. Please see the Notes below and the
            ScaLAPACK manual for more detail on the format of
            distributed matrices.
  JA      (global input) INTEGER
          The index in the global array A that points to the start of
          the matrix to be operated on (which may be either all of A
          or a submatrix of A).
  DESCA   (global and local input) INTEGER array of dimension DLEN.
          if 1D type (DTYPE_A=501), DLEN >= 7;
          if 2D type (DTYPE_A=1), DLEN >= 9 .
          The array descriptor for the distributed matrix A.
          Contains information of mapping of A to memory. Please
          see NOTES below for full description and options.
  IPIV    (local output) INTEGER array, dimension >= DESCA( NB ).
          Pivot indices for local factorizations.
          Users *should not* alter the contents between
          factorization and solve.
  B       (local input/local output) REAL pointer into
          local memory to an array of local lead dimension lld_b>=NB.
          On entry, this array contains the
          the local pieces of the right hand sides
          B(IB:IB+N-1, 1:NRHS).
          On exit, this contains the local piece of the solutions
          distributed matrix X.
  IB      (global input) INTEGER
          The row index in the global array B that points to the first
          row of the matrix to be operated on (which may be either
          all of B or a submatrix of B).
  DESCB   (global and local input) INTEGER array of dimension DLEN.
          if 1D type (DTYPE_B=502), DLEN >=7;
          if 2D type (DTYPE_B=1), DLEN >= 9.
          The array descriptor for the distributed matrix B.
          Contains information of mapping of B to memory. Please
          see NOTES below for full description and options.
  AF      (local output) REAL array, dimension LAF.
          Auxiliary Fillin Space.
          Fillin is created during the factorization routine
          PSGBTRF and this is stored in AF. If a linear system
          is to be solved using PSGBTRS after the factorization
          routine, AF *must not be altered* after the factorization.
  LAF     (local input) INTEGER
          Size of user-input Auxiliary Fillin space AF. Must be >=
          (NB+bwu)*(bwl+bwu)+6*(bwl+bwu)*(bwl+2*bwu)
          If LAF is not large enough, an error code will be returned
          and the minimum acceptable size will be returned in AF( 1 )
  WORK    (local workspace/local output)
          REAL temporary workspace. This space may
          be overwritten in between calls to routines. WORK must be
          the size given in LWORK.
          On exit, WORK( 1 ) contains the minimal LWORK.
  LWORK   (local input or global input) INTEGER
          Size of user-input workspace WORK.
          If LWORK is too small, the minimal acceptable size will be
          returned in WORK(1) and an error code is returned. LWORK>=
          NRHS*(NB+2*bwl+4*bwu)
  INFO    (global output) INTEGER
          = 0:  successful exit
          < 0:  If the i-th argument is an array and the j-entry had
                an illegal value, then INFO = -(i*100+j), if the i-th
                argument is a scalar and had an illegal value, then
                INFO = -i.
  =====================================================================
  Restrictions
  ============
  The following are restrictions on the input parameters. Some of these
    are temporary and will be removed in future releases, while others
    may reflect fundamental technical limitations.
    Non-cyclic restriction: VERY IMPORTANT!
      P*NB>= mod(JA-1,NB)+N.
      The mapping for matrices must be blocked, reflecting the nature
      of the divide and conquer algorithm as a task-parallel algorithm.
      This formula in words is: no processor may have more than one
      chunk of the matrix.
    Blocksize cannot be too small:
      If the matrix spans more than one processor, the following
      restriction on NB, the size of each block on each processor,
      must hold:
      NB >= (BWL+BWU)+1
      The bulk of parallel computation is done on the matrix of size
      O(NB) on each processor. If this is too small, divide and conquer
      is a poor choice of algorithm.
    Submatrix reference:
      JA = IB
      Alignment restriction that prevents unnecessary communication.
  =====================================================================
  Notes
  =====
  If the factorization routine and the solve routine are to be called
    separately (to solve various sets of righthand sides using the same
    coefficient matrix), the auxiliary space AF *must not be altered*
    between calls to the factorization routine and the solve routine.
  The best algorithm for solving banded and tridiagonal linear systems
    depends on a variety of parameters, especially the bandwidth.
    Currently, only algorithms designed for the case N/P >> bw are
    implemented. These go by many names, including Divide and Conquer,
    Partitioning, domain decomposition-type, etc.
  Algorithm description: Divide and Conquer
    The Divide and Conqer algorithm assumes the matrix is narrowly
      banded compared with the number of equations. In this situation,
      it is best to distribute the input matrix A one-dimensionally,
      with columns atomic and rows divided amongst the processes.
      The basic algorithm divides the banded matrix up into
      P pieces with one stored on each processor,
      and then proceeds in 2 phases for the factorization or 3 for the
      solution of a linear system.
      1) Local Phase:
         The individual pieces are factored independently and in
         parallel. These factors are applied to the matrix creating
         fillin, which is stored in a non-inspectable way in auxiliary
         space AF. Mathematically, this is equivalent to reordering
         the matrix A as P A P^T and then factoring the principal
         leading submatrix of size equal to the sum of the sizes of
         the matrices factored on each processor. The factors of
         these submatrices overwrite the corresponding parts of A
         in memory.
      2) Reduced System Phase:
         A small (max(bwl,bwu)* (P-1)) system is formed representing
         interaction of the larger blocks, and is stored (as are its
         factors) in the space AF. A parallel Block Cyclic Reduction
         algorithm is used. For a linear system, a parallel front solve
         followed by an analagous backsolve, both using the structure
         of the factored matrix, are performed.
      3) Backsubsitution Phase:
         For a linear system, a local backsubstitution is performed on
         each processor in parallel.
  Descriptors
  ===========
  Descriptors now have *types* and differ from ScaLAPACK 1.0.
  Note: banded codes can use either the old two dimensional
    or new one-dimensional descriptors, though the processor grid in
    both cases *must be one-dimensional*. We describe both types below.
  Each global data object is described by an associated description
  vector.  This vector stores the information required to establish
  the mapping between an object element and its corresponding process
  and memory location.
  Let A be a generic term for any 2D block cyclicly distributed array.
  Such a global array has an associated description vector DESCA.
  In the following comments, the character _ should be read as
  "of the global array".
  NOTATION        STORED IN      EXPLANATION
  --------------- -------------- --------------------------------------
  DTYPE_A(global) DESCA( DTYPE_ )The descriptor type.  In this case,
                                 DTYPE_A = 1.
  CTXT_A (global) DESCA( CTXT_ ) The BLACS context handle, indicating
                                 the BLACS process grid A is distribu-
                                 ted over. The context itself is glo-
                                 bal, but the handle (the integer
                                 value) may vary.
  M_A    (global) DESCA( M_ )    The number of rows in the global
                                 array A.
  N_A    (global) DESCA( N_ )    The number of columns in the global
                                 array A.
  MB_A   (global) DESCA( MB_ )   The blocking factor used to distribute
                                 the rows of the array.
  NB_A   (global) DESCA( NB_ )   The blocking factor used to distribute
                                 the columns of the array.
  RSRC_A (global) DESCA( RSRC_ ) The process row over which the first
                                 row of the array A is distributed.
  CSRC_A (global) DESCA( CSRC_ ) The process column over which the
                                 first column of the array A is
                                 distributed.
  LLD_A  (local)  DESCA( LLD_ )  The leading dimension of the local
                                 array.  LLD_A >= MAX(1,LOCr(M_A)).
  Let K be the number of rows or columns of a distributed matrix,
  and assume that its process grid has dimension p x q.
  LOCr( K ) denotes the number of elements of K that a process
  would receive if K were distributed over the p processes of its
  process column.
  Similarly, LOCc( K ) denotes the number of elements of K that a
  process would receive if K were distributed over the q processes of
  its process row.
  The values of LOCr() and LOCc() may be determined via a call to the
  ScaLAPACK tool function, NUMROC:
          LOCr( M ) = NUMROC( M, MB_A, MYROW, RSRC_A, NPROW ),
          LOCc( N ) = NUMROC( N, NB_A, MYCOL, CSRC_A, NPCOL ).
  An upper bound for these quantities may be computed by:
          LOCr( M ) <= ceil( ceil(M/MB_A)/NPROW )*MB_A
          LOCc( N ) <= ceil( ceil(N/NB_A)/NPCOL )*NB_A
  One-dimensional descriptors:
  One-dimensional descriptors are a new addition to ScaLAPACK since
    version 1.0. They simplify and shorten the descriptor for 1D
    arrays.
  Since ScaLAPACK supports two-dimensional arrays as the fundamental
    object, we allow 1D arrays to be distributed either over the
    first dimension of the array (as if the grid were P-by-1) or the
    2nd dimension (as if the grid were 1-by-P). This choice is
    indicated by the descriptor type (501 or 502)
    as described below.
    IMPORTANT NOTE: the actual BLACS grid represented by the
    CTXT entry in the descriptor may be *either*  P-by-1 or 1-by-P
    irrespective of which one-dimensional descriptor type
    (501 or 502) is input.
    This routine will interpret the grid properly either way.
    ScaLAPACK routines *do not support intercontext operations* so that
    the grid passed to a single ScaLAPACK routine *must be the same*
    for all array descriptors passed to that routine.
    NOTE: In all cases where 1D descriptors are used, 2D descriptors
    may also be used, since a one-dimensional array is a special case
    of a two-dimensional array with one dimension of size unity.
    The two-dimensional array used in this case *must* be of the
    proper orientation:
      If the appropriate one-dimensional descriptor is DTYPEA=501
      (1 by P type), then the two dimensional descriptor must
      have a CTXT value that refers to a 1 by P BLACS grid;
      If the appropriate one-dimensional descriptor is DTYPEA=502
      (P by 1 type), then the two dimensional descriptor must
      have a CTXT value that refers to a P by 1 BLACS grid.
  Summary of allowed descriptors, types, and BLACS grids:
  DTYPE           501         502         1         1
  BLACS grid      1xP or Px1  1xP or Px1  1xP       Px1
  -----------------------------------------------------
  A               OK          NO          OK        NO
  B               NO          OK          NO        OK
  Note that a consequence of this chart is that it is not possible
    for *both* DTYPE_A and DTYPE_B to be 2D_type(1), as these lead
    to opposite requirements for the orientation of the BLACS grid,
    and as noted before, the *same* BLACS context must be used in
    all descriptors in a single ScaLAPACK subroutine call.
  Let A be a generic term for any 1D block cyclicly distributed array.
  Such a global array has an associated description vector DESCA.
  In the following comments, the character _ should be read as
  "of the global array".
  NOTATION        STORED IN  EXPLANATION
  --------------- ---------- ------------------------------------------
  DTYPE_A(global) DESCA( 1 ) The descriptor type. For 1D grids,
                                TYPE_A = 501: 1-by-P grid.
                                TYPE_A = 502: P-by-1 grid.
  CTXT_A (global) DESCA( 2 ) The BLACS context handle, indicating
                                the BLACS process grid A is distribu-
                                ted over. The context itself is glo-
                                bal, but the handle (the integer
                                value) may vary.
  N_A    (global) DESCA( 3 ) The size of the array dimension being
                                distributed.
  NB_A   (global) DESCA( 4 ) The blocking factor used to distribute
                                the distributed dimension of the array.
  SRC_A  (global) DESCA( 5 ) The process row or column over which the
                                first row or column of the array
                                is distributed.
  LLD_A  (local)  DESCA( 6 ) The leading dimension of the local array
                                storing the local blocks of the distri-
                                buted array A. Minimum value of LLD_A
                                depends on TYPE_A.
                                TYPE_A = 501: LLD_A >=
                                   size of undistributed dimension, 1.
                                TYPE_A = 502: LLD_A >=NB_A, 1.
  Reserved        DESCA( 7 ) Reserved for future use.
  =====================================================================
  Implemented for ScaLAPACK by:
     Andrew J. Cleary, Livermore National Lab and University of Tenn.,
     and Markus Hegland, Australian National University. Feb., 1997.
  Based on code written by    : Peter Arbenz, ETH Zurich, 1996.
  Last modified by:  Peter Arbenz, Institute of Scientific Computing,
    ETH, Zurich.
  =====================================================================
     .. Parameters ..

 
Display dynamic version Find AutoScroll Reload FontSize: - + Hide Comments Hide Blanks Frame FullScreen MailPrint

 
001        SUBROUTINE PSGBTRS( TRANS , N , BWL , BWU , NRHS , A , JA , DESCA , IPIV ,
002       $B , IB , DESCB , AF , LAF , WORK , LWORK , INFO )
003  
004  *     -- ScaLAPACK routine(version 1.7) --
005  *     University of Tennessee , Knoxville , Oak Ridge National Laboratory ,
006  *     and University of California , Berkeley.
007  *     April 3 , 2000
008  
009  *     .. Scalar Arguments ..
010        CHARACTER TRANS
011        INTEGER BWL , BWU , IB , INFO , JA , LAF , LWORK , N , NRHS
012        REAL ONE
013        PARAMETER( ONE = 1.0E + 0 )
014        REAL ZERO
015        PARAMETER( ZERO = 0.0E + 0 )
016        INTEGER INT_ONE
017        PARAMETER( INT_ONE = 1 )
018        INTEGER DESCMULT , BIGNUM
019        PARAMETER( DESCMULT = 100 , BIGNUM = DESCMULT*DESCMULT )
020        INTEGER BLOCK_CYCLIC_2D , CSRC_ , CTXT_ , DLEN_ , DTYPE_ ,
021       $LLD_ , MB_ , M_ , NB_ , N_ , RSRC_
022        PARAMETER( BLOCK_CYCLIC_2D = 1 , DLEN_ = 9 , DTYPE_ = 1 ,
023       $CTXT_ = 2 , M_ = 3 , N_ = 4 , MB_ = 5 , NB_ = 6 ,
024       $RSRC_ = 7 , CSRC_ = 8 , LLD_ = 9 )
025  *     ..
026  *     .. Local Scalars ..
027        INTEGER APTR , BBPTR , BM , BMN , BN , BNN , BW , CSRC ,
028       $FIRST_PROC , ICTXT , ICTXT_NEW , ICTXT_SAVE ,
029       $IDUM2 , IDUM3 , J , JA_NEW , L , LBWL , LBWU , LDBB ,
030       $LDW , LLDA , LLDB , LM , LMJ , LN , LPTR , MYCOL ,
031       $MYROW , NB , NEICOL , NP , NPACT , NPCOL , NPROW ,
032       $NPSTR , NP_SAVE , ODD_SIZE , PART_OFFSET ,
033       $RECOVERY_VAL , RETURN_CODE , STORE_M_B ,
034       $STORE_N_A , WORK_SIZE_MIN , WPTR
035  *     ..
036  *     .. Local Arrays ..
037        INTEGER DESCA_1XP( 7 ) , DESCB_PX1( 7 ) ,
038       $PARAM_CHECK( 17 , 3 )
039  *     ..
040  *     .. External Subroutines ..
041        EXTERNAL BLACS_GRIDEXIT , BLACS_GRIDINFO , SCOPY ,
042       $DESC_CONVERT , SGEMM , SGEMV , SGER , SGERV2D ,
043       $SGESD2D , SGETRS , SLACPY , SLASWP , SSCAL , SSWAP ,
044       $STRSM , GLOBCHK , PXERBLA , RESHAPE
045  *     ..
046  *     .. External Functions ..
047        LOGICAL LSAME
048        INTEGER NUMROC
049        EXTERNAL LSAME , NUMROC
050  *     ..
051  *     .. Intrinsic Functions ..
052        INTRINSIC ICHAR , MAX , MIN , MOD
053  *     ..
054  *     .. Executable Statements ..
055  
056  *     Test the input parameters
057  
058        INFO = 0
059  
060  *     Convert descriptor into standard form for easy access to
061  *     parameters , check that grid is of right shape.
062  
063        DESCA_1XP( 1 ) = 501
064        DESCB_PX1( 1 ) = 502
065  
066        CALL DESC_CONVERT( DESCA , DESCA_1XP , RETURN_CODE )
067  
068        IF( RETURN_CODE.NE.0 ) THEN
069            INFO = - ( 8*100 + 2 )
070        END IF
071  
072        CALL DESC_CONVERT( DESCB , DESCB_PX1 , RETURN_CODE )
073  
074        IF( RETURN_CODE.NE.0 ) THEN
075            INFO = - ( 11*100 + 2 )
076        END IF
077  
078  *     Consistency checks for DESCA and DESCB.
079  
080  *     Context must be the same
081        IF( DESCA_1XP( 2 ).NE.DESCB_PX1( 2 ) ) THEN
082            INFO = - ( 11*100 + 2 )
083        END IF
084  
085  *     These are alignment restrictions that may or may not be removed
086  *     in future releases. - Andy Cleary , April 14 , 1996.
087  
088  *     Block sizes must be the same
089        IF( DESCA_1XP( 4 ).NE.DESCB_PX1( 4 ) ) THEN
090            INFO = - ( 11*100 + 4 )
091        END IF
092  
093  *     Source processor must be the same
094  
095        IF( DESCA_1XP( 5 ).NE.DESCB_PX1( 5 ) ) THEN
096            INFO = - ( 11*100 + 5 )
097        END IF
098  
099  *     Get values out of descriptor for use in code.
100  
101        ICTXT = DESCA_1XP( 2 )
102        CSRC = DESCA_1XP( 5 )
103        NB = DESCA_1XP( 4 )
104        LLDA = DESCA_1XP( 6 )
105        STORE_N_A = DESCA_1XP( 3 )
106        LLDB = DESCB_PX1( 6 )
107        STORE_M_B = DESCB_PX1( 3 )
108  
109  *     Get grid parameters
110  
111        CALL BLACS_GRIDINFO( ICTXT , NPROW , NPCOL , MYROW , MYCOL )
112        NP = NPROW*NPCOL
113  
114        IF( LSAME( TRANS , 'N' ) ) THEN
115            IDUM2 = ICHAR( 'N' )
116        ELSE IF( LSAME( TRANS , 'T' ) ) THEN
117            IDUM2 = ICHAR( 'T' )
118        ELSE IF( LSAME( TRANS , 'C' ) ) THEN
119            IDUM2 = ICHAR( 'T' )
120        ELSE
121            INFO = - 1
122        END IF
123  
124        IF( LWORK.LT. - 1 ) THEN
125            INFO = - 16
126        ELSE IF( LWORK.EQ. - 1 ) THEN
127            IDUM3 = - 1
128        ELSE
129            IDUM3 = 1
130        END IF
131  
132        IF( N.LT.0 ) THEN
133            INFO = - 2
134        END IF
135  
136        IF( N + JA - 1.GT.STORE_N_A ) THEN
137            INFO = - ( 8*100 + 6 )
138        END IF
139  
140        IF(( BWL.GT.N - 1 ) .OR.( BWL.LT.0 ) ) THEN
141            INFO = - 3
142        END IF
143  
144        IF(( BWU.GT.N - 1 ) .OR.( BWU.LT.0 ) ) THEN
145            INFO = - 4
146        END IF
147  
148        IF( LLDA.LT.( 2*BWL + 2*BWU + 1 ) ) THEN
149            INFO = - ( 8*100 + 6 )
150        END IF
151  
152        IF( NB.LE.0 ) THEN
153            INFO = - ( 8*100 + 4 )
154        END IF
155  
156        BW = BWU + BWL
157  
158        IF( N + IB - 1.GT.STORE_M_B ) THEN
159            INFO = - ( 11*100 + 3 )
160        END IF
161  
162        IF( LLDB.LT.NB ) THEN
163            INFO = - ( 11*100 + 6 )
164        END IF
165  
166        IF( NRHS.LT.0 ) THEN
167            INFO = - 5
168        END IF
169  
170  *     Current alignment restriction
171  
172        IF( JA.NE.IB ) THEN
173            INFO = - 7
174        END IF
175  
176  *     Argument checking that is specific to Divide & Conquer routine
177  
178        IF( NPROW.NE.1 ) THEN
179            INFO = - ( 8*100 + 2 )
180        END IF
181  
182        IF( N.GT.NP*NB - MOD( JA - 1 , NB ) ) THEN
183            INFO = - ( 2 )
184            CALL PXERBLA( ICTXT , 'PSGBTRS , D&C alg. : only 1 block per proc'
185       $    , - INFO )
186            RETURN
187        END IF
188  
189        IF(( JA + N - 1.GT.NB ) .AND.( NB.LT.( BWL + BWU + 1 ) ) ) THEN
190            INFO = - ( 8*100 + 4 )
191            CALL PXERBLA( ICTXT , 'PSGBTRS , D&C alg. : NB too small' , - INFO )
192            RETURN
193        END IF
194  
195  *     Check worksize
196  
197        WORK_SIZE_MIN = NRHS*( NB + 2*BWL + 4*BWU )
198  
199        WORK( 1 ) = WORK_SIZE_MIN
200  
201        IF( LWORK.LT.WORK_SIZE_MIN ) THEN
202            IF( LWORK.NE. - 1 ) THEN
203                INFO = - 16
204                CALL PXERBLA( ICTXT , 'PSGBTRS : worksize error ' , - INFO )
205            END IF
206            RETURN
207        END IF
208  
209  *     Pack params and positions into arrays for global consistency check
210  
211        PARAM_CHECK( 17 , 1 ) = DESCB( 5 )
212        PARAM_CHECK( 16 , 1 ) = DESCB( 4 )
213        PARAM_CHECK( 15 , 1 ) = DESCB( 3 )
214        PARAM_CHECK( 14 , 1 ) = DESCB( 2 )
215        PARAM_CHECK( 13 , 1 ) = DESCB( 1 )
216        PARAM_CHECK( 12 , 1 ) = IB
217        PARAM_CHECK( 11 , 1 ) = DESCA( 5 )
218        PARAM_CHECK( 10 , 1 ) = DESCA( 4 )
219        PARAM_CHECK( 9 , 1 ) = DESCA( 3 )
220        PARAM_CHECK( 8 , 1 ) = DESCA( 1 )
221        PARAM_CHECK( 7 , 1 ) = JA
222        PARAM_CHECK( 6 , 1 ) = NRHS
223        PARAM_CHECK( 5 , 1 ) = BWU
224        PARAM_CHECK( 4 , 1 ) = BWL
225        PARAM_CHECK( 3 , 1 ) = N
226        PARAM_CHECK( 2 , 1 ) = IDUM3
227        PARAM_CHECK( 1 , 1 ) = IDUM2
228  
229        PARAM_CHECK( 17 , 2 ) = 1105
230        PARAM_CHECK( 16 , 2 ) = 1104
231        PARAM_CHECK( 15 , 2 ) = 1103
232        PARAM_CHECK( 14 , 2 ) = 1102
233        PARAM_CHECK( 13 , 2 ) = 1101
234        PARAM_CHECK( 12 , 2 ) = 10
235        PARAM_CHECK( 11 , 2 ) = 805
236        PARAM_CHECK( 10 , 2 ) = 804
237        PARAM_CHECK( 9 , 2 ) = 803
238        PARAM_CHECK( 8 , 2 ) = 801
239        PARAM_CHECK( 7 , 2 ) = 7
240        PARAM_CHECK( 6 , 2 ) = 5
241        PARAM_CHECK( 5 , 2 ) = 4
242        PARAM_CHECK( 4 , 2 ) = 3
243        PARAM_CHECK( 3 , 2 ) = 2
244        PARAM_CHECK( 2 , 2 ) = 16
245        PARAM_CHECK( 1 , 2 ) = 1
246  
247  *     Want to find errors with MIN( ) , so if no error , set it to a big
248  *     number. If there already is an error , multiply by the the
249  *     descriptor multiplier.
250  
251        IF( INFO.GE.0 ) THEN
252            INFO = BIGNUM
253        ELSE IF( INFO.LT. - DESCMULT ) THEN
254            INFO = - INFO
255        ELSE
256            INFO = - INFO*DESCMULT
257        END IF
258  
259  *     Check consistency across processors
260  
261        CALL GLOBCHK( ICTXT , 17 , PARAM_CHECK , 17 , PARAM_CHECK( 1 , 3 ) ,
262       $INFO )
263  
264  *     Prepare output : set info = 0 if no error , and divide by DESCMULT
265  *     if error is not in a descriptor entry.
266  
267        IF( INFO.EQ.BIGNUM ) THEN
268            INFO = 0
269        ELSE IF( MOD( INFO , DESCMULT ).EQ.0 ) THEN
270            INFO = - INFO / DESCMULT
271        ELSE
272            INFO = - INFO
273        END IF
274  
275        IF( INFO.LT.0 ) THEN
276            CALL PXERBLA( ICTXT , 'PSGBTRS' , - INFO )
277            RETURN
278        END IF
279  
280  *     Quick return if possible
281  
282        IF( N.EQ.0 )
283       $    RETURN
284  
285            IF( NRHS.EQ.0 )
286       $        RETURN
287  
288  *             Adjust addressing into matrix space to properly get into
289  *             the beginning part of the relevant data
290  
291                PART_OFFSET = NB*(( JA - 1 ) / ( NPCOL*NB ) )
292  
293                IF(( MYCOL - CSRC ).LT.( JA - PART_OFFSET - 1 ) / NB ) THEN
294                PART_OFFSET = PART_OFFSET + NB
295            END IF
296  
297            IF( MYCOL.LT.CSRC ) THEN
298                PART_OFFSET = PART_OFFSET - NB
299            END IF
300  
301  *         Form a new BLACS grid(the "standard form" grid) with only procs
302  *         holding part of the matrix , of size 1xNP where NP is adjusted ,
303  *         starting at csrc = 0 , with JA modified to reflect dropped procs.
304  
305  *         First processor to hold part of the matrix :
306  
307            FIRST_PROC = MOD(( JA - 1 ) / NB + CSRC , NPCOL )
308  
309  *         Calculate new JA one while dropping off unused processors.
310  
311            JA_NEW = MOD( JA - 1 , NB ) + 1
312  
313  *         Save and compute new value of NP
314  
315            NP_SAVE = NP
316            NP =( JA_NEW + N - 2 ) / NB + 1
317  
318  *         Call utility routine that forms "standard-form" grid
319  
320            CALL RESHAPE( ICTXT , INT_ONE , ICTXT_NEW , INT_ONE , FIRST_PROC ,
321       $    INT_ONE , NP )
322  
323  *         Use new context from standard grid as context.
324  
325            ICTXT_SAVE = ICTXT
326            ICTXT = ICTXT_NEW
327            DESCA_1XP( 2 ) = ICTXT_NEW
328            DESCB_PX1( 2 ) = ICTXT_NEW
329  
330  *         Get information about new grid.
331  
332            CALL BLACS_GRIDINFO( ICTXT , NPROW , NPCOL , MYROW , MYCOL )
333  
334  *         Drop out processors that do not have part of the matrix.
335  
336            IF( MYROW.LT.0 ) THEN
337                GO TO 100
338            END IF
339  
340  *         Begin main code
341  
342  *         Move data into workspace - communicate / copy(overlap)
343  
344            IF( MYCOL.LT.NPCOL - 1 ) THEN
345                CALL SGESD2D( ICTXT , BWU , NRHS , B( NB - BWU + 1 ) , LLDB , 0 ,
346       $        MYCOL + 1 )
347            END IF
348  
349            IF( MYCOL.LT.NPCOL - 1 ) THEN
350                LM = NB - BWU
351            ELSE
352                LM = NB
353            END IF
354  
355            IF( MYCOL.GT.0 ) THEN
356                WPTR = BWU + 1
357            ELSE
358                WPTR = 1
359            END IF
360  
361            LDW = NB + BWU + 2*BW + BWU
362  
363            CALL SLACPY( 'G' , LM , NRHS , B( 1 ) , LLDB , WORK( WPTR ) , LDW )
364  
365  *         Zero out rest of work
366  
367            DO 20 J = 1 , NRHS
368                DO 10 L = WPTR + LM , LDW
369                    WORK(( J - 1 )*LDW + L ) = ZERO
370     10         CONTINUE
371     20     CONTINUE
372  
373            IF( MYCOL.GT.0 ) THEN
374                CALL SGERV2D( ICTXT , BWU , NRHS , WORK( 1 ) , LDW , 0 , MYCOL - 1 )
375            END IF
376  
377  *         *******************************************************************
378  *         PHASE 1 : Local computation phase -- Solve L*X = B
379  *         *******************************************************************
380  
381  *         Size of main(or odd) partition in each processor
382  
383            ODD_SIZE = NUMROC( N , NB , MYCOL , 0 , NPCOL )
384  
385            IF( MYCOL.NE.0 ) THEN
386                LBWL = BW
387                LBWU = 0
388                APTR = 1
389            ELSE
390                LBWL = BWL
391                LBWU = BWU
392                APTR = 1 + BWU
393            END IF
394  
395            IF( MYCOL.NE.NPCOL - 1 ) THEN
396                LM = NB - LBWU
397                LN = NB - BW
398            ELSE IF( MYCOL.NE.0 ) THEN
399                LM = ODD_SIZE + BWU
400                LN = MAX( ODD_SIZE - BW , 0 )
401            ELSE
402                LM = N
403                LN = MAX( N - BW , 0 )
404            END IF
405  
406            DO 30 J = 1 , LN
407  
408                LMJ = MIN( LBWL , LM - J )
409                L = IPIV( J )
410  
411                IF( L.NE.J ) THEN
412                    CALL SSWAP( NRHS , WORK( L ) , LDW , WORK( J ) , LDW )
413                END IF
414  
415                LPTR = BW + 1 + ( J - 1 )*LLDA + APTR
416  
417                CALL SGER( LMJ , NRHS , - ONE , A( LPTR ) , 1 , WORK( J ) , LDW ,
418       $        WORK( J + 1 ) , LDW )
419  
420     30     CONTINUE
421  
422  *         *******************************************************************
423  *         PHASE 2 : Global computation phase -- Solve L*X = B
424  *         *******************************************************************
425  
426  *         Define the initial dimensions of the diagonal blocks
427  *         The offdiagonal blocks(for MYCOL > 0) are of size BM by BW
428  
429            IF( MYCOL.NE.NPCOL - 1 ) THEN
430                BM = BW - LBWU
431                BN = BW
432            ELSE
433                BM = MIN( BW , ODD_SIZE ) + BWU
434                BN = MIN( BW , ODD_SIZE )
435            END IF
436  
437  *         Pointer to first element of block bidiagonal matrix in AF
438  *         Leading dimension of block bidiagonal system
439  
440            BBPTR =( NB + BWU )*BW + 1
441            LDBB = 2*BW + BWU
442  
443            IF( NPCOL.EQ.1 ) THEN
444  
445  *             In this case the loop over the levels will not be
446  *             performed.
447                CALL SGETRS( 'N' , N - LN , NRHS , AF( BBPTR + BW*LDBB ) , LDBB ,
448       $        IPIV( LN + 1 ) , WORK( LN + 1 ) , LDW , INFO )
449  
450            END IF
451  
452  *         Loop over levels ...
453  
454  *         The two integers NPACT(nu. of active processors) and NPSTR
455  *         (stride between active processors) is used to control the
456  *         loop.
457  
458            NPACT = NPCOL
459            NPSTR = 1
460  
461  *         Begin loop over levels
462     40 CONTINUE
463        IF( NPACT.LE.1 )
464       $    GO TO 50
465  
466  *         Test if processor is active
467            IF( MOD( MYCOL , NPSTR ).EQ.0 ) THEN
468  
469  *             Send / Receive blocks
470  
471                IF( MOD( MYCOL , 2*NPSTR ).EQ.0 ) THEN
472  
473                    NEICOL = MYCOL + NPSTR
474  
475                    IF( NEICOL / NPSTR.LE.NPACT - 1 ) THEN
476  
477                        IF( NEICOL / NPSTR.LT.NPACT - 1 ) THEN
478                            BMN = BW
479                        ELSE
480                            BMN = MIN( BW , NUMROC( N , NB , NEICOL , 0 , NPCOL ) ) +
481       $                    BWU
482                        END IF
483  
484                        CALL SGESD2D( ICTXT , BM , NRHS , WORK( LN + 1 ) , LDW , 0 ,
485       $                NEICOL )
486  
487                        IF( NPACT.NE.2 ) THEN
488  
489  *                         Receive answers back from partner processor
490  
491                            CALL SGERV2D( ICTXT , BM + BMN - BW , NRHS , WORK( LN + 1 ) ,
492       $                    LDW , 0 , NEICOL )
493  
494                            BM = BM + BMN - BW
495  
496                        END IF
497  
498                    END IF
499  
500                ELSE
501  
502                    NEICOL = MYCOL - NPSTR
503  
504                    IF( NEICOL.EQ.0 ) THEN
505                        BMN = BW - BWU
506                    ELSE
507                        BMN = BW
508                    END IF
509  
510                    CALL SLACPY( 'G' , BM , NRHS , WORK( LN + 1 ) , LDW ,
511       $            WORK( NB + BWU + BMN + 1 ) , LDW )
512  
513                    CALL SGERV2D( ICTXT , BMN , NRHS , WORK( NB + BWU + 1 ) , LDW , 0 ,
514       $            NEICOL )
515  
516  *                 and do the permutations and eliminations
517  
518                    IF( NPACT.NE.2 ) THEN
519  
520  *                     Solve locally for BW variables
521  
522                        CALL SLASWP( NRHS , WORK( NB + BWU + 1 ) , LDW , 1 , BW ,
523       $                IPIV( LN + 1 ) , 1 )
524  
525                        CALL STRSM( 'L' , 'L' , 'N' , 'U' , BW , NRHS , ONE ,
526       $                AF( BBPTR + BW*LDBB ) , LDBB , WORK( NB + BWU + 1 ) ,
527       $                LDW )
528  
529  *                     Use soln just calculated to update RHS
530  
531                        CALL SGEMM( 'N' , 'N' , BM + BMN - BW , NRHS , BW , - ONE ,
532       $                AF( BBPTR + BW*LDBB + BW ) , LDBB ,
533       $                WORK( NB + BWU + 1 ) , LDW , ONE ,
534       $                WORK( NB + BWU + 1 + BW ) , LDW )
535  
536  *                     Give answers back to partner processor
537  
538                        CALL SGESD2D( ICTXT , BM + BMN - BW , NRHS ,
539       $                WORK( NB + BWU + 1 + BW ) , LDW , 0 , NEICOL )
540  
541                    ELSE
542  
543  *                     Finish up calculations for final level
544  
545                        CALL SLASWP( NRHS , WORK( NB + BWU + 1 ) , LDW , 1 , BM + BMN ,
546       $                IPIV( LN + 1 ) , 1 )
547  
548                        CALL STRSM( 'L' , 'L' , 'N' , 'U' , BM + BMN , NRHS , ONE ,
549       $                AF( BBPTR + BW*LDBB ) , LDBB , WORK( NB + BWU + 1 ) ,
550       $                LDW )
551                    END IF
552  
553                END IF
554  
555                NPACT =( NPACT + 1 ) / 2
556                NPSTR = NPSTR*2
557                GO TO 40
558  
559            END IF
560  
561     50 CONTINUE
562  
563  *     *************************************
564  *     BACKSOLVE
565  *     *******************************************************************
566  *     PHASE 2 : Global computation phase -- Solve U*Y = X
567  *     *******************************************************************
568  
569        IF( NPCOL.EQ.1 ) THEN
570  
571  *         In this case the loop over the levels will not be
572  *         performed.
573  *         In fact , the backsolve portion was done in the call to
574  *         SGETRS in the frontsolve.
575  
576        END IF
577  
578  *     Compute variable needed to reverse loop structure in
579  *     reduced system.
580  
581        RECOVERY_VAL = NPACT*NPSTR - NPCOL
582  
583  *     Loop over levels
584  *     Terminal values of NPACT and NPSTR from frontsolve are used
585  
586     60 CONTINUE
587        IF( NPACT.GE.NPCOL )
588       $    GO TO 80
589  
590            NPSTR = NPSTR / 2
591  
592            NPACT = NPACT*2
593  
594  *         Have to adjust npact for non - power - of - 2
595  
596            NPACT = NPACT - MOD(( RECOVERY_VAL / NPSTR ) , 2 )
597  
598  *         Find size of submatrix in this proc at this level
599  
600            IF( MYCOL / NPSTR.LT.NPACT - 1 ) THEN
601                BN = BW
602            ELSE
603                BN = MIN( BW , NUMROC( N , NB , NPCOL - 1 , 0 , NPCOL ) )
604            END IF
605  
606  *         If this processor is even in this level...
607  
608            IF( MOD( MYCOL , 2*NPSTR ).EQ.0 ) THEN
609  
610                NEICOL = MYCOL + NPSTR
611  
612                IF( NEICOL / NPSTR.LE.NPACT - 1 ) THEN
613  
614                    IF( NEICOL / NPSTR.LT.NPACT - 1 ) THEN
615                        BMN = BW
616                        BNN = BW
617                    ELSE
618                        BMN = MIN( BW , NUMROC( N , NB , NEICOL , 0 , NPCOL ) ) + BWU
619                        BNN = MIN( BW , NUMROC( N , NB , NEICOL , 0 , NPCOL ) )
620                    END IF
621  
622                    IF( NPACT.GT.2 ) THEN
623  
624                        CALL SGESD2D( ICTXT , 2*BW , NRHS , WORK( LN + 1 ) , LDW , 0 ,
625       $                NEICOL )
626  
627                        CALL SGERV2D( ICTXT , BW , NRHS , WORK( LN + 1 ) , LDW , 0 ,
628       $                NEICOL )
629  
630                    ELSE
631  
632                        CALL SGERV2D( ICTXT , BW , NRHS , WORK( LN + 1 ) , LDW , 0 ,
633       $                NEICOL )
634  
635                    END IF
636  
637                END IF
638  
639            ELSE
640  *             This processor is odd on this level
641  
642                NEICOL = MYCOL - NPSTR
643  
644                IF( NEICOL.EQ.0 ) THEN
645                    BMN = BW - BWU
646                ELSE
647                    BMN = BW
648                END IF
649  
650                IF( NEICOL.LT.NPCOL - 1 ) THEN
651                    BNN = BW
652                ELSE
653                    BNN = MIN( BW , NUMROC( N , NB , NEICOL , 0 , NPCOL ) )
654                END IF
655  
656                IF( NPACT.GT.2 ) THEN
657  
658  *                 Move RHS to make room for received solutions
659  
660                    CALL SLACPY( 'G' , BW , NRHS , WORK( NB + BWU + 1 ) , LDW ,
661       $            WORK( NB + BWU + BW + 1 ) , LDW )
662  
663                    CALL SGERV2D( ICTXT , 2*BW , NRHS , WORK( LN + 1 ) , LDW , 0 ,
664       $            NEICOL )
665  
666                    CALL SGEMM( 'N' , 'N' , BW , NRHS , BN , - ONE , AF( BBPTR ) , LDBB ,
667       $            WORK( LN + 1 ) , LDW , ONE , WORK( NB + BWU + BW + 1 ) ,
668       $            LDW )
669  
670                    IF( MYCOL.GT.NPSTR ) THEN
671  
672                        CALL SGEMM( 'N' , 'N' , BW , NRHS , BW , - ONE ,
673       $                AF( BBPTR + 2*BW*LDBB ) , LDBB , WORK( LN + BW + 1 ) ,
674       $                LDW , ONE , WORK( NB + BWU + BW + 1 ) , LDW )
675  
676                    END IF
677  
678                    CALL STRSM( 'L' , 'U' , 'N' , 'N' , BW , NRHS , ONE ,
679       $            AF( BBPTR + BW*LDBB ) , LDBB , WORK( NB + BWU + BW + 1 ) ,
680       $            LDW )
681  
682  *                 Send new solution to neighbor
683  
684                    CALL SGESD2D( ICTXT , BW , NRHS , WORK( NB + BWU + BW + 1 ) , LDW , 0 ,
685       $            NEICOL )
686  
687  *                 Copy new solution into expected place
688  
689                    CALL SLACPY( 'G' , BW , NRHS , WORK( NB + BWU + 1 + BW ) , LDW ,
690       $            WORK( LN + BW + 1 ) , LDW )
691  
692                ELSE
693  
694  *                 Solve with local diagonal block
695  
696                    CALL STRSM( 'L' , 'U' , 'N' , 'N' , BN + BNN , NRHS , ONE ,
697       $            AF( BBPTR + BW*LDBB ) , LDBB , WORK( NB + BWU + 1 ) ,
698       $            LDW )
699  
700  *                 Send new solution to neighbor
701  
702                    CALL SGESD2D( ICTXT , BW , NRHS , WORK( NB + BWU + 1 ) , LDW , 0 ,
703       $            NEICOL )
704  
705  *                 Shift solutions into expected positions
706  
707                    CALL SLACPY( 'G' , BNN + BN - BW , NRHS , WORK( NB + BWU + 1 + BW ) , LDW ,
708       $            WORK( LN + 1 ) , LDW )
709  
710                    IF(( NB + BWU + 1 ).NE.( LN + 1 + BW ) ) THEN
711  
712  *                     Copy one row at a time since spaces may overlap
713  
714                        DO 70 J = 1 , BW
715                            CALL SCOPY( NRHS , WORK( NB + BWU + J ) , LDW ,
716       $                    WORK( LN + BW + J ) , LDW )
717     70                 CONTINUE
718  
719                    END IF
720  
721                END IF
722  
723            END IF
724  
725            GO TO 60
726  
727     80 CONTINUE
728  *     End of loop over levels
729  
730  *     *******************************************************************
731  *     PHASE 1 :(Almost) Local computation phase -- Solve U*Y = X
732  *     *******************************************************************
733  
734  *     Reset BM to value it had before reduced system frontsolve...
735  
736        IF( MYCOL.NE.NPCOL - 1 ) THEN
737            BM = BW - LBWU
738        ELSE
739            BM = MIN( BW , ODD_SIZE ) + BWU
740        END IF
741  
742  *     First metastep is to account for the fillin blocks AF
743  
744        IF( MYCOL.LT.NPCOL - 1 ) THEN
745  
746            CALL SGESD2D( ICTXT , BW , NRHS , WORK( NB - BW + 1 ) , LDW , 0 ,
747       $    MYCOL + 1 )
748  
749        END IF
750  
751        IF( MYCOL.GT.0 ) THEN
752  
753            CALL SGERV2D( ICTXT , BW , NRHS , WORK( NB + BWU + 1 ) , LDW , 0 ,
754       $    MYCOL - 1 )
755  
756  *         Modify local right hand sides with received rhs's
757  
758            CALL SGEMM( 'T' , 'N' , LM - BM , NRHS , BW , - ONE , AF( 1 ) , BW ,
759       $    WORK( NB + BWU + 1 ) , LDW , ONE , WORK( 1 ) , LDW )
760  
761        END IF
762  
763        DO 90 J = LN , 1 , - 1
764  
765            LMJ = MIN( BW , ODD_SIZE - 1 )
766  
767            LPTR = BW - 1 + J*LLDA + APTR
768  
769  *         In the following , the TRANS = T option is used to reverse
770  *         the order of multiplication , not as a true transpose
771  
772            CALL SGEMV( 'T' , LMJ , NRHS , - ONE , WORK( J + 1 ) , LDW , A( LPTR ) ,
773       $    LLDA - 1 , ONE , WORK( J ) , LDW )
774  
775  *         Divide by diagonal element
776  
777            CALL SSCAL( NRHS , ONE / A( LPTR - LLDA + 1 ) , WORK( J ) , LDW )
778     90 CONTINUE
779  
780        CALL SLACPY( 'G' , ODD_SIZE , NRHS , WORK( 1 ) , LDW , B( 1 ) , LLDB )
781  
782  *     Free BLACS space used to hold standard - form grid.
783  
784        ICTXT = ICTXT_SAVE
785        IF( ICTXT.NE.ICTXT_NEW ) THEN
786            CALL BLACS_GRIDEXIT( ICTXT_NEW )
787        END IF
788  
789    100 CONTINUE
790  
791  *     Restore saved input parameters
792  
793        NP = NP_SAVE
794  
795  *     Output worksize
796  
797        WORK( 1 ) = WORK_SIZE_MIN
798  
799        RETURN
800  
801  *     End of PSGBTRS
802  
803        END