%!PS-Adobe-2.0 %%Creator: dvips 5.485 Copyright 1986-92 Radical Eye Software %%Title: siam_cover.dvi %%Pages: 48 1 %%BoundingBox: 0 0 612 792 %%EndComments %DVIPSCommandLine: dvips siam_cover -o siam_cover.ps %%BeginProcSet: tex.pro /TeXDict 250 dict def TeXDict begin /N{def}def /B{bind def}N /S{exch}N /X{S N} B /TR{translate}N /isls false N /vsize 11 72 mul N /@rigin{isls{[0 -1 1 0 0 0] concat}if 72 Resolution div 72 VResolution div neg scale isls{Resolution hsize -72 div mul 0 TR}if Resolution VResolution vsize -72 div 1 add mul TR matrix currentmatrix dup dup 4 get round 4 exch put dup dup 5 get round 5 exch put setmatrix}N /@landscape{/isls true N}B /@manualfeed{statusdict /manualfeed true put}B /@copies{/#copies X}B /FMat[1 0 0 -1 0 0]N /FBB[0 0 0 0]N /nn 0 N /IE 0 N /ctr 0 N /df-tail{/nn 8 dict N nn begin /FontType 3 N /FontMatrix fntrx N /FontBBox FBB N string /base X array /BitMaps X /BuildChar{ CharBuilder}N /Encoding IE N end dup{/foo setfont}2 array copy cvx N load 0 nn put /ctr 0 N[}B /df{/sf 1 N /fntrx FMat N df-tail}B /dfs{div /sf X /fntrx[sf 0 0 sf neg 0 0]N df-tail}B /E{pop nn dup definefont setfont}B /ch-width{ch-data dup length 5 sub get}B /ch-height{ch-data dup length 4 sub get}B /ch-xoff{128 ch-data dup length 3 sub get sub}B /ch-yoff{ch-data dup length 2 sub get 127 sub}B /ch-dx{ch-data dup length 1 sub get}B /ch-image{ch-data dup type /stringtype ne{ctr get /ctr ctr 1 add N}if}B /id 0 N /rw 0 N /rc 0 N /gp 0 N /cp 0 N /G 0 N /sf 0 N /CharBuilder{save 3 1 roll S dup /base get 2 index get S /BitMaps get S get /ch-data X pop /ctr 0 N ch-dx 0 ch-xoff ch-yoff ch-height sub ch-xoff ch-width add ch-yoff setcachedevice ch-width ch-height true[1 0 0 -1 -.1 ch-xoff sub ch-yoff .1 add]{ch-image}imagemask restore}B /D{/cc X dup type /stringtype ne{]}if nn /base get cc ctr put nn /BitMaps get S ctr S sf 1 ne{dup dup length 1 sub dup 2 index S get sf div put}if put /ctr ctr 1 add N} B /I{cc 1 add D}B /bop{userdict /bop-hook known{bop-hook}if /SI save N 
@rigin 0 0 moveto pop}N /eop{SI restore showpage userdict /eop-hook known{eop-hook} if}N /@start{userdict /start-hook known{start-hook}if /VResolution X /Resolution X 1000 div /DVImag X /IE 256 array N 0 1 255{IE S 1 string dup 0 3 index put cvn put}for 65781.76 div /vsize X 65781.76 div /hsize X}N /p{show}N /RMat[1 0 0 -1 0 0]N /BDot 260 string N /rulex 0 N /ruley 0 N /v{/ruley X /rulex X V}B /V statusdict begin /product where{pop product dup length 7 ge{0 7 getinterval(Display)eq}{pop false}ifelse}{false}ifelse end{{gsave TR -.1 -.1 TR 1 1 scale rulex ruley false RMat{BDot}imagemask grestore}}{{gsave TR -.1 -.1 TR rulex ruley scale 1 1 false RMat{BDot}imagemask grestore}}ifelse B /a{ moveto}B /delta 0 N /tail{dup /delta X 0 rmoveto}B /M{S p delta add tail}B /b{ S p tail}B /c{-4 M}B /d{-3 M}B /e{-2 M}B /f{-1 M}B /g{0 M}B /h{1 M}B /i{2 M}B /j{3 M}B /k{4 M}B /w{0 rmoveto}B /l{p -4 w}B /m{p -3 w}B /n{p -2 w}B /o{p -1 w }B /q{p 1 w}B /r{p 2 w}B /s{p 3 w}B /t{p 4 w}B /x{0 S rmoveto}B /y{3 2 roll p a}B /bos{/SS save N}B /eos{SS restore}B end %%EndProcSet %%BeginProcSet: special.pro TeXDict begin /SDict 200 dict N SDict begin /@SpecialDefaults{/hs 612 N /vs 792 N /ho 0 N /vo 0 N /hsc 1 N /vsc 1 N /ang 0 N /CLIP 0 N /rwiSeen false N /rhiSeen false N /letter{}N /note{}N /a4{}N /legal{}N}B /@scaleunit 100 N /@hscale{@scaleunit div /hsc X}B /@vscale{@scaleunit div /vsc X}B /@hsize{/hs X /CLIP 1 N}B /@vsize{/vs X /CLIP 1 N}B /@clip{/CLIP 2 N}B /@hoffset{/ho X}B /@voffset{/vo X}B /@angle{/ang X}B /@rwi{10 div /rwi X /rwiSeen true N}B /@rhi {10 div /rhi X /rhiSeen true N}B /@llx{/llx X}B /@lly{/lly X}B /@urx{/urx X}B /@ury{/ury X}B /magscale true def end /@MacSetUp{userdict /md known{userdict /md get type /dicttype eq{userdict begin md length 10 add md maxlength ge{/md md dup length 20 add dict copy def}if end md begin /letter{}N /note{}N /legal{ }N /od{txpose 1 0 mtx defaultmatrix dtransform S atan/pa X newpath clippath mark{transform{itransform moveto}}{transform{itransform 
lineto}}{6 -2 roll transform 6 -2 roll transform 6 -2 roll transform{itransform 6 2 roll itransform 6 2 roll itransform 6 2 roll curveto}}{{closepath}}pathforall newpath counttomark array astore /gc xdf pop ct 39 0 put 10 fz 0 fs 2 F/|______Courier fnt invertflag{PaintBlack}if}N /txpose{pxs pys scale ppr aload pop por{noflips{pop S neg S TR pop 1 -1 scale}if xflip yflip and{pop S neg S TR 180 rotate 1 -1 scale ppr 3 get ppr 1 get neg sub neg ppr 2 get ppr 0 get neg sub neg TR}if xflip yflip not and{pop S neg S TR pop 180 rotate ppr 3 get ppr 1 get neg sub neg 0 TR}if yflip xflip not and{ppr 1 get neg ppr 0 get neg TR}if}{noflips{TR pop pop 270 rotate 1 -1 scale}if xflip yflip and{TR pop pop 90 rotate 1 -1 scale ppr 3 get ppr 1 get neg sub neg ppr 2 get ppr 0 get neg sub neg TR}if xflip yflip not and{TR pop pop 90 rotate ppr 3 get ppr 1 get neg sub neg 0 TR}if yflip xflip not and{TR pop pop 270 rotate ppr 2 get ppr 0 get neg sub neg 0 S TR}if}ifelse scaleby96{ppr aload pop 4 -1 roll add 2 div 3 1 roll add 2 div 2 copy TR .96 dup scale neg S neg S TR}if}N /cp{pop pop showpage pm restore}N end}if}if}N /normalscale{Resolution 72 div VResolution 72 div neg scale magscale{DVImag dup scale}if 0 setgray}N /psfts{S 65781.76 div N}N /startTexFig{/psf$SavedState save N userdict maxlength dict begin /magscale false def normalscale currentpoint TR /psf$ury psfts /psf$urx psfts /psf$lly psfts /psf$llx psfts /psf$y psfts /psf$x psfts currentpoint /psf$cy X /psf$cx X /psf$sx psf$x psf$urx psf$llx sub div N /psf$sy psf$y psf$ury psf$lly sub div N psf$sx psf$sy scale psf$cx psf$sx div psf$llx sub psf$cy psf$sy div psf$ury sub TR /showpage{}N /erasepage{}N /copypage{}N /p 3 def @MacSetUp}N /doclip{psf$llx psf$lly psf$urx psf$ury currentpoint 6 2 roll newpath 4 copy 4 2 roll moveto 6 -1 roll S lineto S lineto S lineto closepath clip newpath moveto}N /endTexFig{end psf$SavedState restore}N /@beginspecial{ SDict begin /SpecialSave save N gsave normalscale currentpoint TR 
@SpecialDefaults count /ocount X /dcount countdictstack N}N /@setspecial{CLIP 1 eq{newpath 0 0 moveto hs 0 rlineto 0 vs rlineto hs neg 0 rlineto closepath clip}if ho vo TR hsc vsc scale ang rotate rwiSeen{rwi urx llx sub div rhiSeen{ rhi ury lly sub div}{dup}ifelse scale llx neg lly neg TR}{rhiSeen{rhi ury lly sub div dup scale llx neg lly neg TR}if}ifelse CLIP 2 eq{newpath llx lly moveto urx lly lineto urx ury lineto llx ury lineto closepath clip}if /showpage{}N /erasepage{}N /copypage{}N newpath}N /@endspecial{count ocount sub{pop}repeat countdictstack dcount sub{end}repeat grestore SpecialSave restore end}N /@defspecial{SDict begin}N /@fedspecial{end}B /li{lineto}B /rl{ rlineto}B /rc{rcurveto}B /np{/SaveX currentpoint /SaveY X N 1 setlinecap newpath}N /st{stroke SaveX SaveY moveto}N /fil{fill SaveX SaveY moveto}N /ellipse{/endangle X /startangle X /yrad X /xrad X /savematrix matrix currentmatrix N TR xrad yrad scale 0 0 1 startangle endangle arc savematrix setmatrix}N end %%EndProcSet TeXDict begin 40258431 52099146 1000 300 300 @start /Fa 1 49 df<1F00318060C04040C060C060C060C060C060C060C060C060404060C031801F000B107F8F0F> 48 D E /Fb 12 68 dfc 2 49 df0 D<060F0F0E1E1E1C3C3838307070 60E0C04008117F910A>48 D E /Fd 15 117 df<0402000C06000C06000C0600180C00180C0018 0C00180C003018003018803018803038807859006F8E00600000600000C00000C00000C0000080 000011147E8D15>22 D<780218061806180C300C301830183030606060C061806600D800E0000F 0E7E8D11>I<60F0F070101020204040040A7D830A>59 D<07FFF800E00E00E00700E00700E007 01C00701C00701C00701C00E03801C03807003FFC00380000700000700000700000700000E0000 0E00000E00000E00001C0000FF800018177F9616>80 D<001FC000707001C03803001C06000C0E 000E1C000E18000E38000E30000E70000E70000E70000E60001CE0001C60003860003870007070 F0E03109C0190B800F0E0003F810000810000C30000C60000FC0000FC0000700171D7F961C>I< 1FFFFE381C0E201C04601C04401C04403804803804003800003800007000007000007000007000 00E00000E00000E00000E00001C00001C00001C00001C00003C0003FFC0017177F9615>84 
D<7C0018001800180018003000300030003000678068C070406060C060C060C060C06080C080C0 8180C10046003C000B177E960F>98 D<1F0006000600060006000C000C000C000C0018F01B181C 08180838183018301830186030603160616062C026C03810177E9614>104 D<0300038003000000000000000000000000001C002400460046008C000C001800180018003100 3100320032001C0009177F960C>I<00180038001000000000000000000000000001C002200430 0430086000600060006000C000C000C000C001800180018001806300E300C60078000D1D80960E >I<1C3C22462382230346030603060306030C060C060C0C0C081A3019E0180018003000300030 00FC001014808D12>112 D<071018D0307060706060C060C060C06080C080C080C0C1C0478039 80018001800300030003001FC00C147E8D10>I<30F049184E384C309C00180018001800300030 0030003000600060000D0E7F8D10>I<07C00C201870187038001E000FC003E000606060E060C0 C0C1803F000C0E7E8D10>I<030003000600060006000600FFC00C000C000C0018001800180018 00300030803080310032001C000A147F930D>I E /Fe 13 121 df<07C018303018701C600C60 0CE00EE00EE00EE00EE00EE00EE00EE00EE00E600C600C701C30181C7007C00F157F9412>48 D<06000E00FE000E000E000E000E000E000E000E000E000E000E000E000E000E000E000E000E00 0E00FFE00B157D9412>I<0F8030E040708030C038E0384038003800700070006000C001800300 06000C08080810183FF07FF0FFF00D157E9412>I<0FE030306018701C701C001C001800380060 07E000300018000C000E000EE00EE00EC00C401830300FE00F157F9412>I<01F00608080C181C 301C70006000E000E3E0EC30F018F00CE00EE00EE00E600E600E300C3018183007C00F157F9412 >54 D<3FC0706070302038003803F81E3830387038E039E039E07970FF1F1E100E7F8D12>97 D<07F01838303870106000E000E000E000E000600070083008183007C00D0E7F8D10>99 D<0FC0186030307038E018FFF8E000E000E000600070083010183007C00D0E7F8D10>101 D108 DI<07C2001C2600381E00700E00600E 00E00E00E00E00E00E00E00E00600E00700E00301E001C2E0007CE00000E00000E00000E00000E 00000E00003F8011147F8D13>113 D<1F4060C0C040C040E000FF007F801FC001E080608060C0 60E0C09F000B0E7F8D0E>115 D120 D E /Ff 43 122 dfg 60 123 dfh 47 123 dfi 57 123 dfj 2 51 df<0C001C00EC000C000C000C000C000C000C000C000C000C000C000C000C000C00 0C000C00FFC00A137D9211>49 
D<1F0060C06060F070F030603000700070006000C001C0018002 0004000810101020207FE0FFE00C137E9211>I E /Fk 41 123 dfl 13 123 dfm 89 128 dfn 22 122 dfo 55 123 dfp 22 120 df<0000030000000000030000 0000000300000000000780000000000780000000000FC0000000000FC0000000000FC000000000 17E00000000013E00000000013E00000000023F00000000021F00000000021F00000000040F800 00000040F80000000040F800000000807C00000000807C00000001007E00000001003E00000001 003E00000002003F00000002001F00000002001F00000004000F80000004000F80000004000F80 0000080007C00000080007C00000180007E000001FFFFFE000001FFFFFE00000200003F0000020 0001F00000200001F00000400000F80000400000F80000400000F800008000007C00008000007C 00018000007E00010000003E00010000003E00030000003F00030000001F00070000001F001F80 00003F80FFE00001FFFCFFE00001FFFC2E327EB132>65 D<00003FE0010001FFF8030007F01E03 001F800307003E000087007800004F00F000002F01E000001F03C000000F078000000F0F800000 070F000000071F000000031E000000033E000000033C000000017C000000017C000000017C0000 00017800000000F800000000F800000000F800000000F800000000F800000000F800000000F800 000000F800000000F800000000F800000000F80000000078000000007C000000007C000000017C 000000013C000000013E000000011E000000011F000000020F000000020F800000060780000004 03C000000801E000000800F00000100078000020003E0000C0001F8003800007F00F000001FFFC 0000003FE00028337CB130>67 D69 D72 DI76 DII80 D82 D<007F802001FFE02007C078600F001C601E0006E03C0003E0380001E0 780000E0700000E070000060F0000060F0000060F0000020F0000020F0000020F8000020F80000 007C0000007E0000003F0000003FC000001FF800000FFF800007FFF00003FFFC0000FFFF00000F FF800000FFC000001FE0000007E0000003F0000001F0000000F0000000F8000000F88000007880 000078800000788000007880000078C0000078C0000070E00000F0E00000E0F00000E0F80001C0 EC000380C7000700C1F01E00807FFC00800FF0001D337CB125>I<7FFFFFFFFFE07FFFFFFFFFE0 7E000F8007E078000F8001E070000F8000E060000F80006040000F80002040000F800020C0000F 800030C0000F80003080000F80001080000F80001080000F80001080000F80001080000F800010 
80000F80001000000F80000000000F80000000000F80000000000F80000000000F80000000000F 80000000000F80000000000F80000000000F80000000000F80000000000F80000000000F800000 00000F80000000000F80000000000F80000000000F80000000000F80000000000F80000000000F 80000000000F80000000000F80000000000F80000000000F80000000000F80000000000F800000 00000F80000000000F80000000000F80000000000F80000000000F80000000001FC00000000FFF FF8000000FFFFF80002C317EB030>I<0780000000FF80000000FF800000000F80000000078000 000007800000000780000000078000000007800000000780000000078000000007800000000780 0000000780000000078000000007800000000780000000078000000007800000000781FC000007 86078000078801C000079000E00007A000700007C00038000780003C000780001E000780001E00 0780001F000780000F000780000F000780000F800780000F800780000F800780000F800780000F 800780000F800780000F800780000F000780000F000780001F000780001E000780001E00078000 3C0007C00038000720007000072000E000061801C00006060700000401F8000021327EB125>98 D<000000F00000001FF00000001FF000000001F000000000F000000000F000000000F000000000 F000000000F000000000F000000000F000000000F000000000F000000000F000000000F0000000 00F000000000F000000000F000000000F000000FC0F000007030F00001C00CF000038002F00007 0001F0000E0001F0001E0000F0003C0000F0003C0000F0007C0000F000780000F000780000F000 F80000F000F80000F000F80000F000F80000F000F80000F000F80000F000F80000F000780000F0 00780000F0007C0000F0003C0000F0003C0000F0001E0000F0000E0001F000070002F000038004 F00001C008F80000F030FF80001FC0FF8021327EB125>100 D<003F800000E0E0000380380007 003C000E001E001E001E001C000F003C000F007C000F0078000F8078000780F8000780F8000780 FFFFFF80F8000000F8000000F8000000F8000000F8000000F8000000780000007C0000003C0000 003C0000801E0000800E0001000F0002000700020001C00C0000F03000001FC000191F7E9E1D> I<0F001F801F801F801F800F000000000000000000000000000000000000000000000007807F80 7F800F800780078007800780078007800780078007800780078007800780078007800780078007 800780078007800780078007800FC0FFF8FFF80D307EAF12>105 D<0780FE001FC000FF830780 
60F000FF8C03C18078000F9001E2003C0007A001E4003C0007A000F4001E0007C000F8001E0007 C000F8001E00078000F0001E00078000F0001E00078000F0001E00078000F0001E00078000F000 1E00078000F0001E00078000F0001E00078000F0001E00078000F0001E00078000F0001E000780 00F0001E00078000F0001E00078000F0001E00078000F0001E00078000F0001E00078000F0001E 00078000F0001E00078000F0001E00078000F0001E00078000F0001E000FC001F8003F00FFFC1F FF83FFF0FFFC1FFF83FFF0341F7E9E38>109 D<001FC00000F0780001C01C00070007000F0007 801E0003C01C0001C03C0001E03C0001E0780000F0780000F0780000F0F80000F8F80000F8F800 00F8F80000F8F80000F8F80000F8F80000F8F80000F8780000F07C0001F03C0001E03C0001E01E 0003C01E0003C00F00078007800F0001C01C0000F07800001FC0001D1F7E9E21>111 D<00400000400000400000400000400000C00000C00000C00001C00001C00003C00007C0000FC0 001FFFE0FFFFE003C00003C00003C00003C00003C00003C00003C00003C00003C00003C00003C0 0003C00003C00003C00003C00003C00003C01003C01003C01003C01003C01003C01003C01003C0 1001C02001E02000E0400078C0001F00142C7FAB19>116 D<078000F000FF801FF000FF801FF0 000F8001F000078000F000078000F000078000F000078000F000078000F000078000F000078000 F000078000F000078000F000078000F000078000F000078000F000078000F000078000F0000780 00F000078000F000078000F000078000F000078000F000078001F000078001F000078001F00003 8002F00003C004F00001C008F800007030FF80001FC0FF80211F7E9E25>III E end %%EndProlog %%BeginSetup %%Feature: *Resolution 300 TeXDict begin %%EndSetup %%Page: 1 1 0 bop 637 125 a Fp(TECHNICAL)21 b(P)-6 b(APER)562 225 y(Submitted)21 b(to)h(SIAM)f(Review)151 552 y Fo(SOFTW)-8 b(ARE)22 b(LIBRARIES)h(F)n(OR)g (LINEAR)f(ALGEBRA)-29 634 y(COMPUT)-6 b(A)g(TIONS)23 b(ON)f(HIGH)h(PERF)n (ORMANCE)f(COMPUTERS)1933 613 y Fn(1)748 915 y Fm(Jac)o(k)16 b(J.)g(Dongarra)1131 897 y Fl(zx)757 981 y Fm(Da)o(vid)g(W.)g(W)l(alk)o(er) 1145 963 y Fl(x)493 1172 y(z)533 1190 y Fm(Departmen)o(t)f(of)h(Computer)g (Science)533 1256 y(Univ)o(ersit)o(y)e(of)i(T)l(ennessee)533 1322 y(107)i(Ayres)d(Hall)533 1388 y(Kno)o(xville,)f(TN)i(37996-1301)493 1447 y Fl(x)533 1465 y 
Fm(Mathematical)e(Sciences)h(Section)533 1531 y(Oak)i(Ridge)e(National)i(Lab)q(oratory)533 1597 y(P)l(.)f(O.)g(Bo)o(x) f(2008,)j(Bldg.)d(6012)533 1664 y(Oak)i(Ridge,)e(TN)h(37831-6367)722 1958 y(Corresp)q(onding)i(author:)777 2024 y(Da)o(vid)e(W.)g(W)l(alk)o(er)622 2090 y(Oak)h(Ridge)e(National)i(Lab)q(oratory)801 2156 y(P)l(.)f(O.)g(Bo)o(x) f(2008)673 2222 y(Oak)h(Ridge,)f(TN)i(37831-6367)725 2289 y(\(615\))g (574-7401)i(\(o\016ce\))746 2355 y(\(615\))f(574-0680)h(\(fax\))584 2421 y Fk(walker@msr.)o(ep)o(m.o)o(rnl)o(.go)o(v)13 b Fm(\(email\))p -57 2519 816 2 v -1 2549 a Fj(1)18 2564 y Fm(This)18 b(w)o(ork)f(w)o(as)h (supp)q(orted)h(in)f(part)g(b)o(y)f(ARP)l(A)g(under)g(con)o(tract)h(n)o(um)o (b)q(er)e(D)o(AAL03-91-C-0047)21 b(ad-)-57 2631 y(ministered)13 b(b)o(y)j(AR)o(O,)e(and)j(in)e(part)i(b)o(y)e(DOE)h(under)g(con)o(tract)g(n)o (um)o(b)q(er)e(DE-A)o(C05-84OR21400)q(,)k(and)f(b)o(y)-57 2697 y(National)f(Science)f(F)l(oundation)i(Gran)o(t)g(n)o(um)o(b)q(er)d (ASC-ASC-9005933.)956 2825 y(i)p eop %%Page: 3 2 1 bop 302 125 a Fi(SOFTW)-6 b(ARE)19 b(LIBRARIES)f(F)n(OR)h(LINEAR)f(ALGEBRA) 165 191 y(COMPUT)-5 b(A)g(TIONS)21 b(ON)d(HIGH)g(PERF)n(ORMANCE)i(COMPUTERS) 780 417 y Fm(Jac)o(k)c(J.)f(Dongarra)777 483 y(Da)o(vid)h(W.)g(W)l(alk)o(er) 856 630 y Fi(Abstract)-57 802 y Fm(This)c(pap)q(er)g(discusses)g(the)f (design)h(of)f(linear)g(algebra)h(libraries)f(for)h(high)f(p)q(erformance)g (computers.)18 b(P)o(artic-)-57 868 y(ular)13 b(emphasis)f(is)h(placed)g(on)g (the)g(dev)o(elopmen)o(t)d(of)k(scalable)e(algorithms)h(for)g(MIMD)f (distributed)h(memory)-57 935 y(concurren)o(t)f(computers.)18 b(A)12 b(brief)g(description)f(of)i(the)f(EISP)l(A)o(CK,)f(LINP)l(A)o(CK,)g (and)i(LAP)l(A)o(CK)f(libraries)f(is)-57 1001 y(giv)o(en,)h(follo)o(w)o(ed)h (b)o(y)f(an)i(outline)e(of)i(ScaLAP)l(A)o(CK,)e(whic)o(h)h(is)f(a)i (distributed)e(memory)f(v)o(ersion)h(of)h(LAP)l(A)o(CK)-57 1067 y(curren)o(tly)18 b(under)h(dev)o(elopmen)o(t.)26 b(The)20 
b(imp)q(ortance)d(of)j(blo)q(c)o(k-partitioned)e(algorithms)h(in)g(reducing)f (the)-57 1133 y(frequency)c(of)h(data)h(mo)o(v)o(em)o(en)o(t)11 b(b)q(et)o(w)o(een)k(di\013eren)o(t)f(lev)o(els)f(of)i(hierarc)o(hical)e (memory)f(is)j(stressed.)21 b(The)15 b(use)-57 1199 y(of)22 b(suc)o(h)e(algorithms)h(helps)g(reduce)f(the)h(message)g(startup)h(costs)f (on)h(distributed)e(memory)f(concurren)o(t)-57 1266 y(computers.)36 b(Other)21 b(k)o(ey)f(ideas)i(in)f(our)h(approac)o(h)g(are)g(the)f(use)g(of)h (distributed)f(v)o(ersions)g(of)h(the)f(Lev)o(el)-57 1332 y(3)d(Basic)f (Linear)h(Algebra)g(Subprograms)g(\(BLAS\))f(as)i(computational)e(building)g (blo)q(c)o(ks,)g(and)h(the)g(use)g(of)-57 1398 y(Basic)13 b(Linear)g(Algebra) g(Comm)o(unication)e(Subprograms)j(\(BLA)o(CS\))f(as)h(comm)o(uni)o(cation)d (building)i(blo)q(c)o(ks.)-57 1464 y(T)l(ogether)g(the)g(distributed)g(BLAS)g (and)g(the)g(BLA)o(CS)g(can)g(b)q(e)g(used)h(to)f(construct)g(higher-lev)o (el)e(algorithms,)-57 1530 y(and)17 b(hide)f(man)o(y)e(details)i(of)h(the)f (parallelism)d(from)j(the)g(application)g(dev)o(elop)q(er.)-57 1618 y(The)e(blo)q(c)o(k-cyclic)e(data)j(distribution)f(is)g(describ)q(ed,)f (and)i(adopted)g(as)g(a)f(go)q(o)q(d)i(w)o(a)o(y)e(of)h(distributing)e(blo)q (c)o(k-)-57 1684 y(partitioned)j(matrices.)21 b(Blo)q(c)o(k-partitioned)16 b(v)o(ersions)g(of)h(the)g(Cholesky)f(and)i(LU)e(factorizations)h(are)g(pre-) -57 1750 y(sen)o(ted,)d(and)g(optimization)f(issues)h(asso)q(ciated)h(with)f (the)g(implem)o(en)o(tation)d(of)k(the)f(LU)g(factorization)g(algo-)-57 1817 y(rithm)f(on)h(distributed)g(memory)d(concurren)o(t)j(computers)f(are)h (discussed,)g(together)h(with)f(its)g(p)q(erformance)-57 1883 y(on)j(the)f(In)o(tel)f(Delta)h(system.)j(Finally)l(,)c(approac)o(hes)i(to)f (the)g(design)h(of)f(library)g(in)o(terfaces)f(are)h(review)o(ed.)-57 2020 y Fi(Key)i(w)n(ords:)49 b Fm(P)o(arallel)15 b(computing,)g(linear)g (algebra)i(soft)o(w)o(are)f(libraries,)f(LAP)l(A)o(CK,)h(ScaLAP)l(A)o(CK)-57 2120 y 
Fi(AMS)j(sub)s(ject)g(classi\014cations:)48 b Fm(65Y05,)16 b(65Y10,)h(65F05)943 2825 y(iii)p eop %%Page: 1 3 2 bop -57 125 a Fh(1)83 b(In)n(tro)r(duction)-57 262 y Fm(The)15 b(increasing)g(a)o(v)m(ailabilit)o(y)f(of)h(adv)m(anced-arc)o(hitecture)g (computers)f(is)h(ha)o(ving)h(a)f(v)o(ery)f(signi\014can)o(t)i(e\013ect)-57 328 y(on)h(all)f(spheres)h(of)g(scien)o(ti\014c)e(computation,)h(including)f (algorithm)h(researc)o(h)g(and)h(soft)o(w)o(are)g(dev)o(elopmen)o(t)-57 394 y(in)23 b(n)o(umerical)e(linear)i(algebra.)43 b(Linear)24 b(algebra|in)f(particular,)i(the)e(solution)h(of)g(linear)e(systems)h(of)-57 460 y(equations|lies)15 b(at)i(the)f(heart)h(of)g(most)e(calculations)h(in)g (scien)o(ti\014c)f(computing.)20 b(This)d(c)o(hapter)f(discusses)-57 527 y(some)c(of)h(the)f(recen)o(t)g(dev)o(elopmen)o(ts)e(in)i(linear)g (algebra)h(designed)g(to)g(exploit)f(these)g(adv)m(anced-arc)o(hitecture)-57 593 y(computers.)19 b(P)o(articular)12 b(atten)o(tion)g(will)f(b)q(e)i(paid)g (to)g(dense)f(factorization)h(routines,)f(suc)o(h)h(as)g(the)f(Cholesky)-57 659 y(and)h(LU)g(factorizations,)g(and)g(these)f(will)g(b)q(e)g(used)h(as)g (examples)e(to)i(highligh)o(t)f(the)g(most)g(imp)q(ortan)o(t)f(factors)-57 725 y(that)h(m)o(ust)e(b)q(e)i(considered)f(in)h(designing)f(linear)g (algebra)h(soft)o(w)o(are)g(for)g(adv)m(anced-arc)o(hitecture)f(computers.) 
-57 791 y(W)l(e)k(use)h(these)g(factorization)f(routines)h(for)g(illustrativ) o(e)e(purp)q(oses)j(not)f(only)g(b)q(ecause)f(they)h(are)g(relativ)o(ely)-57 858 y(simple,)10 b(but)j(also)g(b)q(ecause)f(of)h(their)e(imp)q(ortance)g(in) h(sev)o(eral)g(scien)o(ti\014c)e(and)j(engineering)e(applications)i(that)-57 924 y(mak)o(e)j(use)i(of)g(b)q(oundary)h(elemen)o(t)c(metho)q(ds.)25 b(These)18 b(applications)f(include)g(electromagnetic)e(scattering)-57 990 y(and)i(computational)e(\015uid)h(dynamics)f(problems,)g(as)h(discussed)h (in)f(more)e(detail)i(in)g(Section)g(4.1.)-57 1078 y(Muc)o(h)i(of)g(the)g(w)o (ork)g(in)g(dev)o(eloping)g(linear)f(algebra)i(soft)o(w)o(are)f(for)h(adv)m (anced-arc)o(hitecture)e(computers)g(is)-57 1144 y(motiv)m(ated)11 b(b)o(y)i(the)f(need)g(to)h(solv)o(e)f(large)h(problems)e(on)j(the)e(fastest) h(computers)f(a)o(v)m(ailable.)19 b(In)12 b(this)h(c)o(hapter,)-57 1210 y(w)o(e)i(fo)q(cus)h(on)g(four)g(basic)g(issues:)21 b(\(1\))16 b(the)f(motiv)m(ation)g(for)g(the)h(w)o(ork;)f(\(2\))h(the)f(dev)o(elopmen)o (t)e(of)j(standards)-57 1276 y(for)k(use)f(in)h(linear)f(algebra)h(and)g(the) f(building)g(blo)q(c)o(ks)g(for)h(a)g(library;)g(\(3\))g(asp)q(ects)h(of)e (algorithm)g(design)-57 1343 y(and)e(parallel)e(implem)o(en)o(tation;)e(and)k (\(4\))f(future)g(directions)g(for)g(researc)o(h.)-57 1430 y(F)l(or)g(the)f(past)i(15)f(y)o(ears)g(or)g(so,)g(there)f(has)h(b)q(een)g(a) g(great)g(deal)g(of)g(activit)o(y)e(in)h(the)g(area)i(of)f(algorithms)e(and) -57 1496 y(soft)o(w)o(are)i(for)h(solving)f(linear)g(algebra)h(problems.)j (The)c(linear)g(algebra)h(comm)o(uni)o(t)o(y)c(has)k(long)g(recognized)-57 1563 y(the)j(need)g(for)h(help)f(in)g(dev)o(eloping)f(algorithms)h(in)o(to)g (soft)o(w)o(are)g(libraries,)g(and)h(sev)o(eral)e(y)o(ears)h(ago,)j(as)e(a) -57 1629 y(comm)o(unit)n(y)15 b(e\013ort,)k(put)f(together)h(a)f Fg(de)i(facto)e Fm(standard)i(for)e(iden)o(tifying)f(basic)h(op)q(erations)h (required)e(in)-57 1695 
y(linear)i(algebra)h(algorithms)f(and)h(soft)o(w)o (are.)32 b(The)19 b(hop)q(e)i(w)o(as)f(that)g(the)f(routines)h(making)f(up)h (this)f(stan-)-57 1761 y(dard,)d(kno)o(wn)g(collectiv)o(ely)c(as)17 b(the)e(Basic)g(Linear)h(Algebra)g(Subprograms)g(\(BLAS\),)f(w)o(ould)h(b)q (e)g(e\016cien)o(tly)-57 1828 y(implem)o(en)o(te)o(d)c(on)j(adv)m(anced-arc)o (hitecture)f(computers)f(b)o(y)h(man)o(y)f(man)o(ufacturers,)g(making)h(it)g (p)q(ossible)h(to)-57 1894 y(reap)f(the)f(p)q(ortabilit)o(y)g(b)q(ene\014ts)g (of)h(ha)o(ving)f(them)f(e\016cien)o(tly)f(implem)o(en)o(te)o(d)g(on)j(a)g (wide)f(range)h(of)f(mac)o(hines.)-57 1960 y(This)j(goal)h(has)g(b)q(een)f (largely)g(realized.)-57 2048 y(The)f(k)o(ey)e(insigh)o(t)h(of)h(our)g (approac)o(h)g(to)g(designing)g(linear)f(algebra)h(algorithms)f(for)h(adv)m (anced)g(arc)o(hitecture)-57 2114 y(computers)23 b(is)h(that)h(the)f (frequency)f(with)i(whic)o(h)e(data)i(are)g(mo)o(v)o(ed)d(b)q(et)o(w)o(een)i (di\013eren)o(t)f(lev)o(els)g(of)h(the)-57 2180 y(memory)19 b(hierarc)o(h)o(y)i(m)o(ust)g(b)q(e)i(minim)o(iz)o(ed)c(in)j(order)h(to)f (attain)h(high)g(p)q(erformance.)38 b(Th)o(us,)24 b(our)e(main)-57 2246 y(algorithmic)13 b(approac)o(h)i(for)g(exploiting)e(b)q(oth)j(v)o (ectorization)d(and)i(parallelism)d(in)i(our)h(impleme)o(n)o(tations)d(is)-57 2312 y(the)i(use)g(of)g(blo)q(c)o(k-partitioned)g(algorithms,)f(particularly) g(in)h(conjunction)g(with)g(highly-tuned)g(k)o(ernels)f(for)-57 2379 y(p)q(erforming)k(matrix-v)o(ector)f(and)j(matrix-matrix)d(op)q (erations)j(\(the)f(Lev)o(el)f(2)i(and)g(3)g(BLAS\).)f(In)g(general,)-57 2445 y(the)23 b(use)g(of)g(blo)q(c)o(k-partitioned)f(algorithms)g(requires)g (data)i(to)f(b)q(e)h(mo)o(v)o(ed)c(as)k(blo)q(c)o(ks,)g(rather)f(than)g(as) -57 2511 y(v)o(ectors)18 b(or)h(scalars,)g(so)g(that)g(although)g(the)g (total)f(amoun)o(t)g(of)h(data)g(mo)o(v)o(ed)d(is)j(unc)o(hanged,)g(the)f (latency)-57 2577 y(\(or)h(startup)h(cost\))f(asso)q(ciated)h(with)f(the)g 
(mo)o(v)o(eme)o(n)o(t)d(is)j(greatly)g(reduced)f(b)q(ecause)h(few)o(er)f (messages)h(are)-57 2644 y(needed)d(to)g(mo)o(v)o(e)e(the)i(data.)951 2825 y(1)p eop %%Page: 2 4 3 bop -57 125 a Fm(A)18 b(second)h(k)o(ey)f(idea)g(is)h(that)g(the)f(p)q (erformance)g(of)h(an)g(algorithm)f(can)h(b)q(e)g(tuned)f(b)o(y)h(a)g(user)g (b)o(y)f(v)m(arying)-57 191 y(the)e(parameters)g(that)h(sp)q(ecify)f(the)g (data)h(la)o(y)o(out.)22 b(On)16 b(shared)h(memory)d(mac)o(hines,)g(this)j (is)f(con)o(trolled)g(b)o(y)-57 257 y(the)h(blo)q(c)o(k)g(size,)f(while)h(on) h(distributed)e(memory)f(mac)o(hines)g(it)i(is)g(con)o(trolled)g(b)o(y)g(the) g(blo)q(c)o(k)g(size)f(and)i(the)-57 323 y(con\014guration)f(of)g(the)f (logical)g(pro)q(cess)h(mesh,)d(as)j(describ)q(ed)f(in)g(more)e(detail)i(in)g (Section)g(5.)-57 408 y(In)h(Section)h(1,)g(w)o(e)f(\014rst)h(giv)o(e)f(an)h (o)o(v)o(erview)e(of)i(some)f(of)h(the)f(ma)s(jor)g(soft)o(w)o(are)h(pro)s (jects)f(aimed)f(at)i(solving)-57 474 y(dense)13 b(linear)f(algebra)h (problems.)19 b(Next,)12 b(w)o(e)g(describ)q(e)g(the)g(t)o(yp)q(es)h(of)g (mac)o(hine)e(that)i(b)q(ene\014t)g(most)f(from)f(the)-57 540 y(use)j(of)g(blo)q(c)o(k-partitioned)g(algorithms,)f(and)h(discuss)g(what)h (is)f(mean)o(t)e(b)o(y)h(high-qualit)o(y)l(,)g(reusable)h(soft)o(w)o(are)-57 607 y(for)g(adv)m(anced-arc)o(hitecture)f(computers.)19 b(Section)14 b(2)g(discusses)g(the)g(role)f(of)h(the)g(BLAS)g(in)f(p)q(ortabilit)o(y)g (and)-57 673 y(p)q(erformance)19 b(on)i(high-p)q(erformance)f(computers.)33 b(W)l(e)20 b(discuss)g(the)h(design)f(of)h(these)f(building)g(blo)q(c)o(ks,) -57 739 y(and)d(their)f(use)h(in)f(blo)q(c)o(k-partitioned)g(algorithms,)f (in)i(Section)f(3.)22 b(Section)16 b(4)h(fo)q(cuses)h(on)f(the)f(design)h(of) g(a)-57 805 y(blo)q(c)o(k-partitioned)h(algorithm)g(for)i(LU)f (factorization,)g(and)g(Sections)g(5,)h(6,)f(and)h(7)f(use)g(this)g(example)e (to)-57 872 y(illustrate)g(the)i(most)f(imp)q(ortan)o(t)f(factors)i(in)f 
(impleme)o(n)o(ting)e(dense)i(linear)g(algebra)h(routines)f(on)h(MIMD,)-57 938 y(distributed)e(mem)o(ory)l(,)e(concurren)o(t)h(computers.)23 b(Section)16 b(5)i(deals)f(with)g(the)g(issue)g(of)h(mapping)e(the)h(data)-57 1004 y(on)o(to)h(the)g(hierarc)o(hical)d(memory)g(of)j(a)g(concurren)o(t)f (computer.)24 b(The)17 b(la)o(y)o(out)h(of)f(an)i(application's)e(data)h(is) -57 1070 y(crucial)d(in)h(determining)e(the)h(p)q(erformance)g(and)i (scalabilit)o(y)d(of)j(the)f(parallel)f(co)q(de.)21 b(In)16 b(Sections)g(6)g(and)h(7,)-57 1136 y(details)g(of)g(the)g(parallel)f(implem)o (en)o(tation)e(and)k(optimization)d(issues)i(are)h(discussed.)23 b(Section)17 b(8)g(presen)o(ts)-57 1203 y(some)e(future)h(directions)g(for)g (in)o(v)o(estigation.)-57 1370 y Fo(1.1)70 b(Dense)22 b(Linear)g(Algebra)h (Libraries)-57 1487 y Fm(Ov)o(er)13 b(the)h(past)h(t)o(w)o(en)o(t)o(y-\014v)o (e)d(y)o(ears,)h(the)h(\014rst)h(author)g(has)g(b)q(een)f(directly)e(in)o(v)o (olv)o(ed)g(in)i(the)g(dev)o(elopmen)o(t)d(of)-57 1553 y(sev)o(eral)18 b(imp)q(ortan)o(t)g(pac)o(k)m(ages)h(of)g(dense)g(linear)f(algebra)h(soft)o (w)o(are:)27 b(EISP)l(A)o(CK,)17 b(LINP)l(A)o(CK,)h(LAP)l(A)o(CK,)-57 1619 y(and)h(the)f(BLAS.)f(In)h(addition,)g(b)q(oth)h(authors)h(are)e(curren) o(tly)e(in)o(v)o(olv)o(ed)g(in)i(the)g(dev)o(elopmen)o(t)d(of)k(ScaLA-)-57 1685 y(P)l(A)o(CK,)g(a)i(scalable)f(v)o(ersion)g(of)h(LAP)l(A)o(CK)f(for)g (distributed)g(memory)e(concurren)o(t)h(computers.)33 b(In)20 b(this)-57 1751 y(section,)15 b(w)o(e)g(giv)o(e)g(a)h(brief)f(review)g(of)h (these)f(pac)o(k)m(ages|their)g(history)l(,)h(their)e(adv)m(an)o(tages,)j (and)f(their)f(limi-)-57 1818 y(tations)i(on)g(high-p)q(erformance)e (computers.)-57 1969 y Fi(1.1.1)55 b(EISP)-5 b(A)n(CK)-57 2086 y Fm(EISP)l(A)o(CK)17 b(is)h(a)g(collection)e(of)i(F)l(ortran)g(subroutines)h (that)f(compute)e(the)i(eigen)o(v)m(alues)e(and)j(eigen)o(v)o(ectors)-57 2152 y(of)g(nine)g(classes)g(of)g(matrices:)25 b(complex)17 
b(general,)i(complex)e(Hermitian,)g(real)i(general,)g(real)f(symmetric)o(,) -57 2218 y(real)c(symmetri)o(c)d(banded,)k(real)e(symmetric)e(tridiagonal,)j (sp)q(ecial)g(real)f(tridiagonal,)i(generalized)e(real,)g(and)-57 2284 y(generalized)19 b(real)h(symmetric)d(matrices.)32 b(In)20 b(addition,)i(t)o(w)o(o)e(routines)g(are)h(included)e(that)i(use)g(singular) -57 2351 y(v)m(alue)16 b(decomp)q(osition)f(to)i(solv)o(e)e(certain)h (least-squares)h(problems.)-57 2435 y(EISP)l(A)o(CK)k(is)h(primarily)d(based) j(on)g(a)h(collection)d(of)i(Algol)f(pro)q(cedures)h(dev)o(elop)q(ed)f(in)g (the)h(1960s)h(and)-57 2501 y(collected)12 b(b)o(y)i(J.)f(H.)g(Wilkinson)g (and)i(C.)e(Reinsc)o(h)g(in)h(a)g(v)o(olume)e(en)o(titled)g Fg(Line)n(ar)i(A)o(lgebr)n(a)h Fm(in)e(the)h Fg(Handb)n(o)n(ok)-57 2568 y(for)20 b(A)o(utomatic)h(Computation)f Fm([57])g(series.)31 b(This)20 b(v)o(olume)d(w)o(as)j(not)g(designed)g(to)g(co)o(v)o(er)f(ev)o (ery)f(p)q(ossible)-57 2634 y(metho)q(d)j(of)g(solution;)j(rather,)e (algorithms)f(w)o(ere)f(c)o(hosen)h(on)h(the)f(basis)h(of)g(their)f (generalit)o(y)l(,)f(elegance,)-57 2700 y(accuracy)l(,)15 b(sp)q(eed,)h(or)h (econom)o(y)d(of)j(storage.)951 2825 y(2)p eop %%Page: 3 5 4 bop -57 125 a Fm(Since)21 b(the)h(release)f(of)h(EISP)l(A)o(CK)g(in)g (1972,)i(o)o(v)o(er)d(ten)h(thousand)h(copies)f(of)g(the)g(collection)f(ha)o (v)o(e)g(b)q(een)-57 191 y(distributed)16 b(w)o(orldwide.)-57 348 y Fi(1.1.2)55 b(LINP)-5 b(A)n(CK)-57 468 y Fm(LINP)l(A)o(CK)21 b(is)h(a)g(collection)f(of)h(F)l(ortran)h(subroutines)f(that)h(analyze)e(and) i(solv)o(e)e(linear)g(equations)h(and)-57 534 y(linear)f(least-squares)g (problems.)36 b(The)21 b(pac)o(k)m(age)h(solv)o(es)f(linear)g(systems)f (whose)i(matrices)d(are)j(general,)-57 601 y(banded,)h(symmetri)o(c)18 b(inde\014nite,)k(symmetri)o(c)c(p)q(ositiv)o(e)j(de\014nite,)h(triangular,)g (and)g(tridiagonal)g(square.)-57 667 y(In)g(addition,)h(the)f(pac)o(k)m(age)h 
(computes)e(the)h(QR)g(and)h(singular)g(v)m(alue)f(decomp)q(ositions)f(of)i (rectangular)-57 733 y(matrices)14 b(and)j(applies)f(them)f(to)h (least-squares)h(problems.)-57 821 y(LINP)l(A)o(CK)i(is)h(organized)g(around) g(four)h(matrix)d(factorizations:)28 b(LU)20 b(factorization,)g(piv)o(oted)f (Cholesky)-57 887 y(factorization,)h(QR)g(factorization,)g(and)g(singular)g (v)m(alue)g(decomp)q(osition.)31 b(The)19 b(term)f(LU)i(factorization)-57 953 y(is)f(used)h(here)f(in)g(a)h(v)o(ery)f(general)g(sense)h(to)g(mean)e (the)i(factorization)f(of)h(a)g(square)g(matrix)e(in)o(to)h(a)h(lo)o(w)o(er) -57 1019 y(triangular)f(part)g(and)h(an)f(upp)q(er)g(triangular)g(part,)g(p)q (erhaps)h(with)e(piv)o(oting.)29 b(These)18 b(factorizations)h(will)-57 1086 y(b)q(e)h(treated)g(at)g(greater)g(length)f(later,)h(when)g(the)g (actual)f(LINP)l(A)o(CK)g(subroutines)i(are)f(discussed.)31 b(But)-57 1152 y(\014rst)17 b(a)f(digression)h(on)f(organization)h(and)g (factors)g(in\015uencing)f(LINP)l(A)o(CK's)f(e\016ciency)f(is)i(necessary)l (.)-57 1239 y(LINP)l(A)o(CK)f(uses)i(column-orien)o(ted)d(algorithms)i(to)g (increase)g(e\016ciency)e(b)o(y)i(preserving)g(lo)q(calit)o(y)f(of)i(refer-) -57 1306 y(ence.)j(This)14 b(means)f(that)i(if)f(a)g(program)g(references)f (an)i(item)d(in)i(a)g(particular)g(blo)q(c)o(k,)g(the)g(next)g(reference)e (is)-57 1372 y(lik)o(ely)g(to)j(b)q(e)g(in)g(the)f(same)g(blo)q(c)o(k.)20 b(By)14 b(column)g(orien)o(tation)g(w)o(e)h(mean)e(that)i(the)g(LINP)l(A)o (CK)f(co)q(des)h(alw)o(a)o(ys)-57 1438 y(reference)k(arra)o(ys)h(do)o(wn)h (columns,)e(not)i(across)g(ro)o(ws.)34 b(This)20 b(w)o(orks)h(b)q(ecause)f(F) l(ortran)h(stores)g(arra)o(ys)f(in)-57 1504 y(column)c(ma)s(jor)g(order.)24 b(Th)o(us,)17 b(as)g(one)h(pro)q(ceeds)f(do)o(wn)h(a)f(column)f(of)h(an)h (arra)o(y)l(,)e(the)h(memory)d(references)-57 1570 y(pro)q(ceed)j(sequen)o (tially)d(in)j(memory)l(.)i(On)e(the)f(other)h(hand,)g(as)g(one)g(pro)q (ceeds)g(across)h(a)f(ro)o(w,)f(the)h(memory)-57 1637 
y(references)i(jump)g (across)j(memory)l(,)c(the)i(length)h(of)f(the)h(jump)e(b)q(eing)i(prop)q (ortional)g(to)g(the)f(length)h(of)g(a)-57 1703 y(column.)g(The)16 b(e\013ects)g(of)h(column)e(orien)o(tation)i(are)f(quite)g(dramatic:)k(on)d (systems)f(with)g(virtual)g(or)h(cac)o(he)-57 1769 y(memories,)11 b(the)j(LINP)l(A)o(CK)f(co)q(des)i(will)e(signi\014can)o(tly)g(outp)q(erform) h(co)q(des)g(that)h(are)f(not)h(column)d(orien)o(ted.)-57 1835 y(W)l(e)k(note,)g(ho)o(w)o(ev)o(er,)e(that)j(textb)q(o)q(ok)g(examples)d(of)j (matrix)d(algorithms)i(are)g(seldom)f(column)g(orien)o(ted.)-57 1923 y(Another)f(imp)q(ortan)o(t)g(factor)h(in\015uencing)f(the)h (e\016ciency)d(of)j(LINP)l(A)o(CK)f(is)h(the)f(use)h(of)g(the)f(Lev)o(el)g(1) h(BLAS;)-57 1989 y(there)h(are)g(three)g(e\013ects.)-57 2077 y(First,)c(the)g(o)o(v)o(erhead)g(en)o(tailed)f(in)h(calling)g(the)g(BLAS)g (reduces)g(the)g(e\016ciency)e(of)j(the)f(co)q(de.)20 b(This)12 b(reduction)-57 2143 y(is)i(negligible)f(for)i(large)f(matrices,)e(but)j(it)f (can)g(b)q(e)h(quite)e(signi\014can)o(t)h(for)h(small)d(matrices.)19 b(The)14 b(matrix)f(size)-57 2209 y(at)h(whic)o(h)f(it)g(b)q(ecomes)f(unimp)q (ortan)o(t)h(v)m(aries)g(from)g(system)f(to)i(system;)e(for)i(square)g (matrices)e(it)h(is)g(t)o(ypically)-57 2275 y(b)q(et)o(w)o(een)20 b Ff(n)j Fm(=)g(25)f(and)g Ff(n)h Fm(=)f(100.)38 b(If)21 b(this)g(seems)f (lik)o(e)g(an)i(unacceptably)f(large)g(o)o(v)o(erhead,)h(remem)n(b)q(er)-57 2342 y(that)d(on)g(man)o(y)e(mo)q(dern)h(systems)g(the)g(solution)h(of)g(a)g (system)e(of)i(order)g(25)g(or)g(less)f(is)h(itself)e(a)i(negligible)-57 2408 y(calculation.)h(Nonetheless,)14 b(it)g(cannot)h(b)q(e)g(denied)f(that)h (a)g(p)q(erson)g(whose)h(programs)e(dep)q(end)h(critically)e(on)-57 2474 y(solving)19 b(small)f(matrix)g(problems)g(in)h(inner)f(lo)q(ops)j(will) d(b)q(e)h(b)q(etter)g(o\013)i(with)e(BLAS-less)g(v)o(ersions)g(of)g(the)-57 2540 y(LINP)l(A)o(CK)13 b(co)q(des.)21 b(F)l(ortunately)l(,)13 
b(the)g(BLAS)h(can)g(b)q(e)g(remo)o(v)o(ed)d(from)h(the)i(smaller,)e(more)g (frequen)o(tly)g(used)-57 2607 y(program)k(in)g(a)h(short)g(editing)e (session.)-57 2694 y(Second,)26 b(the)d(BLAS)h(impro)o(v)o(e)e(the)h (e\016ciency)f(of)j(programs)f(when)g(they)g(are)g(run)g(on)h(nonoptimizing) 951 2825 y(3)p eop %%Page: 4 6 5 bop -57 125 a Fm(compilers.)18 b(This)d(is)f(b)q(ecause)h(doubly)f (subscripted)g(arra)o(y)g(references)f(in)h(the)g(inner)g(lo)q(op)h(of)g(the) f(algorithm)-57 191 y(are)k(replaced)e(b)o(y)h(singly)h(subscripted)f(arra)o (y)h(references)e(in)h(the)g(appropriate)i(BLAS.)d(The)i(e\013ect)f(can)h(b)q (e)-57 257 y(seen)e(for)g(matrices)f(of)h(quite)g(small)e(order,)i(and)h(for) g(large)f(orders)g(the)g(sa)o(vings)h(are)f(quite)g(signi\014can)o(t.)-57 345 y(Finally)l(,)g(impro)o(v)o(ed)f(e\016ciency)h(can)h(b)q(e)h(ac)o(hiev)o (ed)e(b)o(y)h(co)q(ding)h(a)g(set)f(of)h(BLAS)g([17])f(to)h(tak)o(e)f(adv)m (an)o(tage)i(of)-57 411 y(the)14 b(sp)q(ecial)f(features)h(of)h(the)e (computers)g(on)i(whic)o(h)e(LINP)l(A)o(CK)g(is)h(b)q(eing)g(run.)21 b(F)l(or)14 b(most)f(computers,)g(this)-57 477 y(simply)h(means)g(pro)q (ducing)j(mac)o(hine-language)d(v)o(ersions.)21 b(Ho)o(w)o(ev)o(er,)13 b(the)j(co)q(de)g(can)g(also)g(tak)o(e)f(adv)m(an)o(tage)-57 544 y(of)i(more)d(exotic)i(arc)o(hitectural)f(features,)g(suc)o(h)h(as)h(v)o (ector)f(op)q(erations.)-57 631 y(F)l(urther)g(details)g(ab)q(out)h(the)f (BLAS)g(are)g(presen)o(ted)g(in)g(Section)f(2.)-57 789 y Fi(1.1.3)55 b(LAP)-5 b(A)n(CK)-57 908 y Fm(LAP)l(A)o(CK)10 b([14)q(])g(pro)o(vides)h (routines)f(for)i(solving)e(systems)g(of)h(sim)o(ultaneous)f(linear)g (equations,)i(least-squares)-57 975 y(solutions)17 b(of)g(linear)f(systems)f (of)i(equations,)f(eigen)o(v)m(alue)f(problems,)g(and)i(singular)g(v)m(alue)f (problems.)21 b(The)-57 1041 y(asso)q(ciated)14 b(matrix)c(factorizations)j (\(LU,)f(Cholesky)l(,)h(QR,)e(SVD,)h(Sc)o(h)o(ur,)h(generalized)e(Sc)o(h)o (ur\))h(are)g(also)h(pro-)-57 1107 
y(vided,)h(as)i(are)g(related)f (computations)g(suc)o(h)g(as)h(reordering)f(of)h(the)f(Sc)o(h)o(ur)g (factorizations)g(and)i(estimating)-57 1173 y(condition)i(n)o(um)o(b)q(ers.) 29 b(Dense)19 b(and)h(banded)g(matrices)d(are)j(handled,)f(but)h(not)f (general)g(sparse)h(matrices.)-57 1239 y(In)e(all)f(areas,)i(similar)d (functionalit)o(y)g(is)i(pro)o(vided)f(for)h(real)g(and)h(complex)c (matrices,)h(in)i(b)q(oth)h(single)e(and)-57 1306 y(double)f(precision.)-57 1393 y(The)21 b(original)h(goal)g(of)f(the)h(LAP)l(A)o(CK)f(pro)s(ject)f(w)o (as)i(to)g(mak)o(e)d(the)j(widely)e(used)h(EISP)l(A)o(CK)g(and)h(LIN-)-57 1460 y(P)l(A)o(CK)c(libraries)f(run)h(e\016cien)o(tly)d(on)k(shared-memory)d (v)o(ector)h(and)i(parallel)e(pro)q(cessors.)28 b(On)18 b(these)g(ma-)-57 1526 y(c)o(hines,)d(LINP)l(A)o(CK)g(and)h(EISP)l(A)o(CK)f(are)h(ine\016cien)o (t)e(b)q(ecause)i(their)f(memory)e(access)j(patterns)h(disregard)-57 1592 y(the)h(m)o(ultila)o(y)n(ere)o(d)d(memory)g(hierarc)o(hies)i(of)h(the)g (mac)o(hines,)e(thereb)o(y)g(sp)q(ending)j(to)q(o)g(m)o(uc)o(h)d(time)g(mo)o (ving)-57 1658 y(data)j(instead)f(of)h(doing)g(useful)e(\015oating-p)q(oin)o (t)j(op)q(erations.)28 b(LAP)l(A)o(CK)18 b(addresses)h(this)f(problem)e(b)o (y)i(re-)-57 1724 y(organizing)g(the)f(algorithms)g(to)h(use)f(blo)q(c)o(k)g (matrix)f(op)q(erations,)j(suc)o(h)e(as)h(matrix)e(m)o(ultiplic)o(ation,)f (in)i(the)-57 1791 y(innermost)12 b(lo)q(ops)h([3,)g(14)q(].)19 b(These)13 b(blo)q(c)o(k)f(op)q(erations)i(can)f(b)q(e)g(optimized)e(for)i (eac)o(h)g(arc)o(hitecture)e(to)i(accoun)o(t)-57 1857 y(for)18 b(the)g(memory)e(hierarc)o(h)o(y)g([2],)i(and)h(so)g(pro)o(vide)e(a)i(transp) q(ortable)g(w)o(a)o(y)f(to)h(ac)o(hiev)o(e)d(high)j(e\016ciency)d(on)-57 1923 y(div)o(erse)f(mo)q(dern)h(mac)o(hines.)k(Here)15 b(w)o(e)h(use)h(the)f (term)f(\\transp)q(ortable")j(instead)f(of)f(\\p)q(ortable")i(b)q(ecause,)-57 1989 y(for)h(fastest)h(p)q(ossible)f(p)q(erformance,)f(LAP)l(A)o(CK)h 
(requires)f(that)i(highly)e(optimized)f(blo)q(c)o(k)h(matrix)g(op)q(era-)-57 2055 y(tions)h(b)q(e)g(already)g(implem)o(en)n(ted)d(on)k(eac)o(h)e(mac)o (hine.)27 b(In)18 b(other)h(w)o(ords,)h(the)f(correctness)f(of)h(the)g(co)q (de)g(is)-57 2122 y(p)q(ortable,)d(but)h(high)f(p)q(erformance)f(is)h(not|if) g(w)o(e)g(limit)e(ourselv)o(es)h(to)i(a)f(single)g(F)l(ortran)h(source)f(co)q (de.)-57 2209 y(LAP)l(A)o(CK)j(can)g(b)q(e)g(regarded)g(as)h(a)f(successor)h (to)f(LINP)l(A)o(CK)f(and)i(EISP)l(A)o(CK.)e(It)g(has)i(virtually)e(all)g (the)-57 2275 y(capabilities)12 b(of)i(these)g(t)o(w)o(o)f(pac)o(k)m(ages)i (and)f(m)o(uc)o(h)d(more)i(b)q(esides.)20 b(LAP)l(A)o(CK)13 b(impro)o(v)o(es)e(on)k(LINP)l(A)o(CK)d(and)-57 2342 y(EISP)l(A)o(CK)g(in)g (four)h(main)e(resp)q(ects:)19 b(sp)q(eed,)13 b(accuracy)l(,)f(robustness)i (and)f(functionalit)o(y)l(.)18 b(While)12 b(LINP)l(A)o(CK)-57 2408 y(and)20 b(EISP)l(A)o(CK)f(are)h(based)g(on)h(the)e(v)o(ector)g(op)q (eration)i(k)o(ernels)d(of)i(the)g(Lev)o(el)e(1)i(BLAS,)f(LAP)l(A)o(CK)h(w)o (as)-57 2474 y(designed)c(at)h(the)f(outset)h(to)g(exploit)e(the)i(Lev)o(el)e (3)i(BLAS)f(|a)h(set)f(of)h(sp)q(eci\014cations)f(for)h(F)l(ortran)g(subpro-) -57 2540 y(grams)d(that)h(do)g(v)m(arious)h(t)o(yp)q(es)e(of)h(matrix)e(m)o (ultiplic)o(ation)f(and)k(the)e(solution)h(of)g(triangular)g(systems)e(with) -57 2607 y(m)o(ultiple)k(righ)o(t-hand)k(sides.)34 b(Because)19 b(of)i(the)f(coarse)h(gran)o(ularit)o(y)f(of)g(the)g(Lev)o(el)f(3)i(BLAS)f (op)q(erations,)-57 2673 y(their)c(use)h(tends)g(to)g(promote)f(high)i (e\016ciency)c(on)k(man)o(y)d(high-p)q(erformance)h(computers,)g (particularly)g(if)951 2825 y(4)p eop %%Page: 5 7 6 bop -57 125 a Fm(sp)q(ecially)15 b(co)q(ded)i(implem)o(e)o(n)o(tations)d (are)i(pro)o(vided)g(b)o(y)g(the)g(man)o(ufacturer.)-57 276 y Fi(1.1.4)55 b(ScaLAP)-5 b(A)n(CK)-57 393 y Fm(The)16 b(ScaLAP)l(A)o(CK)g (soft)o(w)o(are)g(library)l(,)e(sc)o(heduled)h(for)i(completion)d(b)o(y)h 
(the)h(end)g(of)g(1994,)h(will)e(extend)g(the)-57 459 y(LAP)l(A)o(CK)g (library)g(to)g(run)h(scalably)f(on)h(MIMD,)e(distributed)h(memory)l(,)d (concurren)o(t)j(computers)f([10,)h(11)q(].)-57 525 y(F)l(or)c(suc)o(h)g(mac) o(hines)f(the)h(memory)d(hierarc)o(h)o(y)i(includes)g(the)h(o\013-pro)q (cessor)i(memory)8 b(of)k(other)f(pro)q(cessors,)i(in)-57 591 y(addition)g(to)g(the)f(hierarc)o(h)o(y)f(of)i(registers,)f(cac)o(he,)g(and)h (lo)q(cal)g(memory)c(on)k(eac)o(h)f(pro)q(cessor.)21 b(Lik)o(e)12 b(LAP)l(A)o(CK,)-57 658 y(the)20 b(ScaLAP)l(A)o(CK)g(routines)g(are)g(based)h (on)g(blo)q(c)o(k-partitioned)e(algorithms)h(in)g(order)g(to)h(minim)o(iz)o (e)c(the)-57 724 y(frequency)d(of)h(data)h(mo)o(v)o(em)o(en)o(t)c(b)q(et)o(w) o(een)i(di\013eren)o(t)g(lev)o(els)g(of)h(the)g(memory)d(hierarc)o(h)o(y)l(.) 19 b(The)c(fundamen)o(tal)-57 790 y(building)i(blo)q(c)o(ks)g(of)h(the)g (ScaLAP)l(A)o(CK)f(library)g(are)h(distributed)f(memory)d(v)o(ersions)k(of)g (the)f(Lev)o(el)f(2)i(and)-57 856 y(Lev)o(el)12 b(3)i(BLAS,)f(and)h(a)f(set)h (of)f(Basic)g(Linear)h(Algebra)f(Comm)o(unication)e(Subprograms)i(\(BLA)o (CS\))g([16)q(,)g(26])-57 923 y(for)21 b(comm)o(unic)o(ation)e(tasks)i(that)g (arise)g(frequen)o(tly)e(in)h(parallel)g(linear)g(algebra)h(computations.)35 b(In)20 b(the)-57 989 y(ScaLAP)l(A)o(CK)i(routines,)i(all)f(in)o(terpro)q (cessor)f(comm)o(unication)e(o)q(ccurs)j(within)g(the)f(distributed)g(BLAS) -57 1055 y(and)17 b(the)f(BLA)o(CS,)g(so)h(the)f(source)g(co)q(de)h(of)g(the) f(top)h(soft)o(w)o(are)f(la)o(y)o(er)f(of)i(ScaLAP)l(A)o(CK)f(lo)q(oks)h(v)o (ery)e(similar)-57 1121 y(to)i(that)f(of)h(LAP)l(A)o(CK.)-57 1206 y(W)l(e)h(en)o(visage)f(a)h(n)o(um)o(b)q(er)f(of)h(user)g(in)o(terfaces) f(to)h(ScaLAP)l(A)o(CK.)f(Initially)l(,)f(the)i(in)o(terface)e(will)h(b)q(e)h (similar)-57 1272 y(to)24 b(that)g(of)g(LAP)l(A)o(CK,)e(with)i(some)e (additional)i(argumen)o(ts)f(passed)h(to)g(eac)o(h)f(routine)g(to)h(sp)q (ecify)f(the)-57 1338 y(data)c(la)o(y)o(out.)26 b(Once)18 
b(this)g(is)g(in)g (place,)f(w)o(e)h(in)o(tend)f(to)i(mo)q(dify)d(the)i(in)o(terface)f(so)i(the) f(argumen)o(ts)f(to)i(eac)o(h)-57 1404 y(ScaLAP)l(A)o(CK)c(routine)g(are)h (the)f(same)g(as)h(in)f(LAP)l(A)o(CK.)g(This)g(will)f(require)h(information)f (ab)q(out)j(the)e(data)-57 1471 y(distribution)h(of)g(eac)o(h)g(matrix)f(and) i(v)o(ector)e(to)i(b)q(e)f(hidden)g(from)f(the)i(user.)k(This)16 b(ma)o(y)f(b)q(e)i(done)f(b)o(y)g(means)-57 1537 y(of)i(a)g(ScaLAP)l(A)o(CK)g (initialization)e(routine.)25 b(This)18 b(in)o(terface)e(will)h(b)q(e)h (fully)f(compatible)e(with)j(LAP)l(A)o(CK.)-57 1603 y(Pro)o(vided)13 b(\\dumm)o(y")f(v)o(ersions)h(of)h(the)g(ScaLAP)l(A)o(CK)f(initialization)g (routine)g(and)i(the)e(BLA)o(CS)h(are)g(added)-57 1669 y(to)h(LAP)l(A)o(CK,)e (there)h(will)f(b)q(e)i(no)g(distinction)f(b)q(et)o(w)o(een)f(LAP)l(A)o(CK)h (and)h(ScaLAP)l(A)o(CK)f(at)h(the)f(application)-57 1735 y(lev)o(el,)k (though)k(eac)o(h)d(will)g(link)g(to)h(di\013eren)o(t)g(v)o(ersions)f(of)i (the)e(BLAS)h(and)h(BLA)o(CS.)e(F)l(ollo)o(wing)h(on)g(from)-57 1802 y(this,)e(w)o(e)g(will)f(exp)q(erimen)o(t)f(with)i(ob)s(ject-based)h(in) o(terfaces)e(for)i(LAP)l(A)o(CK)e(and)i(ScaLAP)l(A)o(CK,)f(with)g(the)-57 1868 y(goal)f(of)g(dev)o(eloping)e(in)o(terfaces)g(compatible)f(with)i(F)l (ortran)h(90)g([10])f(and)h(C++)f([24)q(].)-57 2035 y Fo(1.2)70 b(T)-6 b(arget)23 b(Arc)n(hitectures)-57 2152 y Fm(The)18 b(EISP)l(A)o(CK)g (and)h(LINP)l(A)o(CK)e(soft)o(w)o(are)h(libraries)g(w)o(ere)f(designed)h(for) h(sup)q(ercomputers)e(used)h(in)g(the)-57 2218 y(1970s)f(and)f(early)f (1980s,)i(suc)o(h)f(as)g(the)f(CDC-7600,)j(Cyb)q(er)d(205,)i(and)f(Cra)o (y-1.)21 b(These)16 b(mac)o(hines)d(featured)-57 2284 y(m)o(ultiple)j (functional)k(units)g(pip)q(elined)e(for)i(go)q(o)q(d)i(p)q(erformance)c([43) q(].)31 b(The)19 b(CDC-7600)k(w)o(as)d(basically)f(a)-57 2351 y(high-p)q(erformance)13 b(scalar)h(computer,)e(while)h(the)g(Cyb)q(er)h(205) h(and)f(Cra)o(y-1)g(w)o(ere)f(early)g(v)o(ector)g(computers.)-57 2435 
y(The)18 b(dev)o(elopmen)o(t)c(of)k(LAP)l(A)o(CK)f(in)g(the)h(late)f (1980s)i(w)o(as)f(in)o(tended)f(to)h(mak)o(e)e(the)h(EISP)l(A)o(CK)g(and)h (LIN-)-57 2501 y(P)l(A)o(CK)k(libraries)f(run)i(e\016cien)o(tly)c(on)k (shared)g(memory)l(,)e(v)o(ector)g(sup)q(ercomputers.)39 b(The)22 b(ScaLAP)l(A)o(CK)-57 2568 y(soft)o(w)o(are)d(library)g(will)f(extend)h(the)g (use)g(of)h(LAP)l(A)o(CK)e(to)i(distributed)f(memory)d(concurren)o(t)j(sup)q (ercom-)-57 2634 y(puters.)i(The)15 b(dev)o(elopmen)o(t)d(of)k(ScaLAP)l(A)o (CK)e(b)q(egan)j(in)d(1991)j(and)f(is)f(exp)q(ected)f(to)h(b)q(e)h(completed) d(b)o(y)h(the)-57 2700 y(end)i(of)h(1994.)951 2825 y(5)p eop %%Page: 6 8 7 bop -57 125 a Fm(The)21 b(underlying)f(concept)h(of)g(b)q(oth)h(the)e(LAP)l (A)o(CK)h(and)g(ScaLAP)l(A)o(CK)g(libraries)f(is)g(the)h(use)g(of)g(blo)q(c)o (k-)-57 191 y(partitioned)15 b(algorithms)f(to)h(minimi)o(ze)c(data)16 b(mo)o(v)o(eme)o(n)o(t)c(b)q(et)o(w)o(een)i(di\013eren)o(t)g(lev)o(els)f(in)i (hierarc)o(hical)e(mem-)-57 257 y(ory)l(.)29 b(Th)o(us,)19 b(the)g(ideas)g(discussed)g(in)f(this)h(c)o(hapter)f(for)h(dev)o(eloping)f(a) i(library)e(for)h(dense)f(linear)h(algebra)-57 323 y(computations)14 b(are)g(applicable)f(to)h(an)o(y)g(computer)e(with)i(a)h(hierarc)o(hical)d (memory)f(that)k(\(1\))f(imp)q(oses)f(a)i(suf-)-57 390 y(\014cien)o(tly)d (large)h(startup)i(cost)e(on)i(the)e(mo)o(v)o(em)o(en)o(t)d(of)k(data)h(b)q (et)o(w)o(een)d(di\013eren)o(t)h(lev)o(els)f(in)h(the)g(hierarc)o(h)o(y)l(,)f (and)-57 456 y(for)g(whic)o(h)f(\(2\))h(the)f(cost)h(of)g(a)g(con)o(text)e (switc)o(h)h(is)h(to)q(o)g(great)g(to)g(mak)o(e)e(\014ne)h(grain)h(size)f(m)o (ultithreading)e(w)o(orth-)-57 522 y(while.)30 b(Our)19 b(target)h(mac)o (hines)d(are,)j(therefore,)f(medium)d(and)k(large)g(grain)f(size)g(adv)m (anced-arc)o(hitecture)-57 588 y(computers.)g(These)12 b(include)g (\\traditional")h(shared)h(memory)l(,)c(v)o(ector)h(sup)q(ercomputers,)i(suc) o(h)f(as)i(the)e(Cra)o(y)-57 654 y(Y-MP)19 b(and)g(C90,)h(and)f(MIMD)f 
(distributed)h(memory)d(concurren)o(t)i(sup)q(ercomputers,)g(suc)o(h)g(as)i (the)e(In)o(tel)-57 721 y(P)o(aragon,)g(and)g(Thinking)f(Mac)o(hines')e (CM-5,)j(and)f(the)g(more)f(recen)o(tly)f(announced)j(IBM)e(SP1)h(and)h(Cra)o (y)-57 787 y(T3D)c(concurren)o(t)f(systems.)19 b(Since)12 b(these)h(mac)o (hines)e(ha)o(v)o(e)i(only)g(v)o(ery)f(recen)o(tly)f(b)q(ecome)h(a)o(v)m (ailable,)g(most)h(of)-57 853 y(the)h(ongoing)h(dev)o(elopmen)o(t)10 b(of)15 b(the)e(ScaLAP)l(A)o(CK)h(library)f(is)g(b)q(eing)h(done)h(on)f(a)g (128-no)q(de)i(In)o(tel)c(iPSC/860)-57 919 y(h)o(yp)q(ercub)q(e)k(and)h(on)f (the)g(520-no)q(de)j(In)o(tel)14 b(Delta)j(system.)-57 1005 y(The)c(In)o(tel)e(P)o(aragon)j(sup)q(ercomputer)e(can)h(ha)o(v)o(e)e(up)i (to)g(2000)i(no)q(des,)e(eac)o(h)g(consisting)f(of)h(an)h(i860)f(pro)q (cessor)-57 1071 y(and)19 b(a)f(comm)o(unic)o(ations)e(pro)q(cessor.)27 b(The)18 b(no)q(des)h(eac)o(h)e(ha)o(v)o(e)g(at)i(least)f(16)g(Mb)o(ytes)f (of)i(mem)o(ory)l(,)c(and)k(are)-57 1137 y(connected)11 b(b)o(y)h(a)g (high-sp)q(eed)h(net)o(w)o(ork)e(with)h(the)g(top)q(ology)h(of)f(a)h(t)o(w)o (o-dimensional)d(mesh.)19 b(The)12 b(CM-5)g(from)-57 1203 y(Thinking)i(Mac)o (hines)f(Corp)q(oration)j([53)q(])d(supp)q(orts)j(b)q(oth)f(SIMD)f(and)h (MIMD)e(programming)g(mo)q(dels,)g(and)-57 1270 y(ma)o(y)g(ha)o(v)o(e)h(up)h (to)f(16k)i(pro)q(cessors,)f(though)h(the)e(largest)h(CM-5)g(curren)o(tly)e (installed)h(has)h(1024)h(pro)q(cessors.)-57 1336 y(Eac)o(h)i(CM-5)h(no)q(de) f(is)g(a)h(Sparc)f(pro)q(cessor)h(and)g(up)f(to)g(4)h(asso)q(ciated)g(v)o (ector)e(pro)q(cessors.)28 b(P)o(oin)o(t-to-p)q(oin)o(t)-57 1402 y(comm)o(unic)o(ation)14 b(b)q(et)o(w)o(een)h(no)q(des)h(is)g(supp)q (orted)h(b)o(y)e(a)i(data)f(net)o(w)o(ork)g(with)f(the)h(top)q(ology)h(of)f (a)h(\\fat)f(tree")-57 1468 y([46].)31 b(Global)20 b(comm)o(unic)o(ation)d (op)q(erations,)k(suc)o(h)f(as)g(sync)o(hronization)f(and)h(reduction,)f(are) h(supp)q(orted)-57 1534 y(b)o(y)e(a)h(separate)h(con)o(trol)e(net)o(w)o(ork.) 
28 b(The)19 b(IBM)e(SP1)j(system)d(is)h(based)i(on)f(the)f(same)g(RISC)h(c)o (hip)e(used)i(in)-57 1601 y(the)f(IBM)g(RS/6000)j(w)o(orkstations)e(and)g (uses)g(a)g(m)o(ultistage)e(switc)o(h)h(to)h(connect)g(pro)q(cessors.)29 b(The)19 b(Cra)o(y)-57 1667 y(T3D)d(uses)f(the)g(Alpha)g(c)o(hip)f(from)g (Digital)h(Equipmen)o(t)e(Corp)q(oration,)j(and)g(connects)f(the)g(pro)q (cessors)h(in)f(a)-57 1733 y(three-dimensional)f(torus.)-57 1818 y(F)l(uture)19 b(adv)m(ances)i(in)e(compiler)e(and)j(hardw)o(are)h(tec)o (hnologies)e(in)g(the)g(mid)f(to)j(late)e(1990s)i(are)f(exp)q(ected)-57 1885 y(to)g(mak)o(e)d(m)o(ultithreading)g(a)i(viable)f(approac)o(h)i(for)g (masking)e(comm)o(unic)o(ation)f(costs.)30 b(Since)18 b(the)h(blo)q(c)o(ks) -57 1951 y(in)e(a)g(blo)q(c)o(k-partitioned)f(algorithm)g(can)h(b)q(e)g (regarded)g(as)h(separate)f(threads,)g(our)h(approac)o(h)f(will)f(still)g(b)q (e)-57 2017 y(applicable)f(on)i(mac)o(hines)d(that)j(exploit)e(medium)e(and)k (coarse)g(grain)f(size)g(m)o(ultithreading.)-57 2186 y Fo(1.3)70 b(High-Qualit)n(y)-6 b(,)21 b(Reusable,)g(Mathematical)g(Soft)n(w)n(are)-57 2304 y Fm(In)g(dev)o(eloping)g(a)h(library)f(of)h(high-qualit)o(y)f (subroutines)h(for)g(dense)f(linear)g(algebra)h(computations)f(the)-57 2370 y(design)16 b(goals)i(fall)d(in)o(to)h(three)g(broad)h(classes:)16 2484 y Fl(\017)24 b Fm(p)q(erformance)16 2587 y Fl(\017)g Fm(ease-of-use)16 2690 y Fl(\017)g Fm(range-of-use)951 2825 y(6)p eop %%Page: 7 9 8 bop -57 125 a Fi(1.3.1)55 b(P)n(erformance)-57 244 y Fm(Tw)o(o)23 b(imp)q(ortan)o(t)f(p)q(erformance)f(metrics)g(are)h Fg(c)n(oncurr)n(ent)i (e\016ciency)g Fm(and)g Fg(sc)n(alability)p Fm(.)41 b(W)l(e)22 b(seek)g(go)q(o)q(d)-57 311 y(p)q(erformance)15 b(c)o(haracteristics)f(in)h (our)h(algorithms)f(b)o(y)h(elimi)o(nating,)d(as)k(m)o(uc)o(h)c(as)k(p)q (ossible,)e(o)o(v)o(erhead)g(due)-57 377 y(to)c(load)h(im)o(balance,)d(data)j (mo)o(v)o(em)o(en)o(t,)c(and)k(algorithm)e(restructuring.)19 b(The)11 
b(w)o(a)o(y)f(the)h(data)h(are)f(distributed)-57 443 y(\(or)i(decomp)q(osed\))g(o)o(v)o(er)f(the)h(memory)d(hierarc)o(h)o(y)i(of)h (a)h(computer)e(is)h(of)g(fundamen)o(tal)f(imp)q(ortance)g(to)h(these)-57 509 y(factors.)25 b(Concurren)o(t)17 b(e\016ciency)l(,)e Ff(\017)p Fm(,)h(is)i(de\014ned)f(as)h(the)f(concurren)o(t)f(sp)q(eedup)i(p)q(er)f(pro) q(cessor)h([32)q(],)e(where)-57 575 y(the)21 b(concurren)o(t)f(sp)q(eedup)h (is)f(the)h(execution)f(time,)f Ff(T)996 582 y Fe(seq)1046 575 y Fm(,)j(for)f(the)f(b)q(est)h(sequen)o(tial)f(algorithm)g(running)-57 642 y(on)f(one)f(pro)q(cessor)h(of)f(the)g(concurren)o(t)f(computer,)g (divided)g(b)o(y)g(the)h(execution)f(time,)f Ff(T)7 b Fm(,)18 b(of)g(the)g(parallel)-57 708 y(algorithm)d(running)h(on)g Ff(N)452 715 y Fd(p)488 708 y Fm(pro)q(cessors.)22 b(When)16 b(direct)f(metho)q(ds)g(are)h(used,)f(as)i(in)e(LU)h(factorization,)f(the)-57 774 y(concurren)o(t)i(e\016ciency)f(dep)q(ends)j(on)f(the)g(problem)e(size)i (and)g(the)g(n)o(um)o(b)q(er)e(of)j(pro)q(cessors,)g(so)g(on)f(a)h(giv)o(en) -57 840 y(parallel)13 b(computer)g(and)i(for)f(a)h(\014xed)e(n)o(um)o(b)q(er) g(of)h(pro)q(cessors,)i(the)d(running)i(time)d(should)i(not)h(v)m(ary)f (greatly)-57 906 y(for)j(problems)d(of)j(the)f(same)f(size.)21 b(Th)o(us,)16 b(w)o(e)f(ma)o(y)g(write,)703 1039 y Ff(\017)p Fm(\()p Ff(N)r(;)8 b(N)844 1046 y Fd(p)864 1039 y Fm(\))14 b(=)971 1005 y(1)p 954 1027 59 2 v 954 1073 a Ff(N)993 1080 y Fd(p)1040 1005 y Ff(T)1069 1012 y Fe(seq)1119 1005 y Fm(\()p Ff(N)5 b Fm(\))p 1022 1027 196 2 v 1022 1073 a Ff(T)i Fm(\()p Ff(N)r(;)h(N)1179 1080 y Fd(p)1199 1073 y Fm(\))1921 1039 y(\(1\))-57 1173 y(where)17 b Ff(N)22 b Fm(represen)o(ts)16 b(the)g(problem)g(size.)22 b(In)17 b(dense)f(linear)g(algebra)i(computations,)e(the)g(execution)g(time) -57 1240 y(is)e(usually)f(dominated)g(b)o(y)h(the)f(\015oating-p)q(oin)o(t)i (op)q(eration)g(coun)o(t,)f(so)g(the)g(concurren)o(t)f(e\016ciency)f(is)i (related)-57 1306 y(to)j(the)f(p)q(erformance,)e Ff(G)p 
Fm(,)j(measured)e(in) g(\015oating-p)q(oin)o(t)j(op)q(erations)f(p)q(er)g(second)f(b)o(y)l(,)692 1435 y Ff(G)p Fm(\()p Ff(N)r(;)8 b(N)851 1442 y Fd(p)872 1435 y Fm(\))14 b(=)972 1401 y Ff(N)1011 1408 y Fd(p)p 962 1423 79 2 v 962 1469 a Ff(t)980 1476 y Fe(calc)1054 1435 y Ff(\017)p Fm(\()p Ff(N)r(;)8 b(N)1195 1442 y Fd(p)1215 1435 y Fm(\))687 b(\(2\))-57 1563 y(where)11 b Ff(t)97 1570 y Fe(calc)170 1563 y Fm(is)g(the)h(time)e(for)i(one)g(\015oating-p)q(oin)o(t)h(op)q(eration.)20 b(F)l(or)12 b(iterativ)o(e)e(routines,)j(suc)o(h)e(as)i(eigensolv)o(ers,)-57 1629 y(the)f(n)o(um)o(b)q(er)e(of)i(iterations,)g(and)g(hence)f(the)h (execution)f(time,)f(dep)q(ends)i(not)h(only)e(on)i(the)e(problem)f(size,)i (but)-57 1695 y(also)18 b(on)g(other)g(c)o(haracteristics)e(of)i(the)f(input) h(data,)g(suc)o(h)f(as)h(condition)g(n)o(um)o(b)q(er.)23 b(A)17 b(parallel)g(algorithm)-57 1762 y(is)h(said)g(to)h(b)q(e)f(scalable)g([37])g (if)g(the)g(concurren)o(t)f(e\016ciency)f(dep)q(ends)i(on)h(the)f(problem)f (size)g(and)i(n)o(um)o(b)q(er)-57 1828 y(of)d(pro)q(cessors)h(only)e(through) i(their)d(ratio.)22 b(This)15 b(ratio)h(is)g(simply)d(the)i(problem)f(size)h (p)q(er)h(pro)q(cessor,)g(often)-57 1894 y(referred)c(to)h(as)g(the)g(gran)o (ularit)o(y)l(.)19 b(Th)o(us,)13 b(for)g(a)h(scalable)e(algorithm,)g(the)g (concurren)o(t)g(e\016ciency)f(is)i(constan)o(t)-57 1960 y(as)19 b(the)f(n)o(um)o(b)q(er)f(of)i(pro)q(cessors)h(increases)e(while)f(k)o (eeping)g(the)i(gran)o(ularit)o(y)f(\014xed.)27 b(Alternativ)o(ely)l(,)16 b(Eq.)i(2)-57 2027 y(sho)o(ws)h(that)f(this)f(is)h(equiv)m(alen)o(t)e(to)i (sa)o(ying)g(that,)g(for)g(a)h(scalable)e(algorithm,)g(the)g(p)q(erformance)g (dep)q(ends)-57 2093 y(linearly)e(on)i(the)f(n)o(um)o(b)q(er)e(of)j(pro)q (cessors)g(for)g(\014xed)e(gran)o(ularit)o(y)l(.)-57 2249 y Fi(1.3.2)55 b(Ease-Of-Use)-57 2369 y Fm(Ease-of-use)21 b(is)f(concerned)f (with)h(factors)g(suc)o(h)g(as)h(p)q(ortabilit)o(y)e(and)h(the)g(user)g(in)o (terface)e(to)j(the)e(library)l(.)-57 2435 
y(P)o(ortabilit)o(y)l(,)e(in)h (its)g(most)f(inclusiv)o(e)f(sense,)i(means)g(that)g(the)g(co)q(de)h(is)f (written)g(in)f(a)i(standard)g(language,)-57 2501 y(suc)o(h)e(as)g(F)l (ortran,)g(and)g(that)h(the)e(source)h(co)q(de)g(can)g(b)q(e)g(compiled)d(on) k(an)f(arbitrary)g(mac)o(hine)d(to)k(pro)q(duce)-57 2568 y(a)h(program)f (that)h(will)e(run)i(correctly)l(.)27 b(W)l(e)18 b(call)f(this)i(the)f (\\mail-order)f(soft)o(w)o(are")i(mo)q(del)e(of)i(p)q(ortabilit)o(y)l(,)-57 2634 y(since)c(it)g(re\015ects)g(the)g(mo)q(del)f(used)i(b)o(y)f(soft)o(w)o (are)g(serv)o(ers)g(suc)o(h)g(as)h Fg(netlib)i Fm([20].)i(This)c(notion)g(of) g(p)q(ortabilit)o(y)-57 2700 y(is)21 b(quite)f(demanding.)35 b(It)21 b(requires)f(that)h(all)g(relev)m(an)o(t)f(prop)q(erties)h(of)g(the)g (computer's)f(arithmetic)e(and)951 2825 y(7)p eop %%Page: 8 10 9 bop -57 125 a Fm(arc)o(hitecture)20 b(b)q(e)h(disco)o(v)o(ered)f(at)h(run)o (time)e(within)i(the)g(con\014nes)h(of)f(a)h(F)l(ortran)g(co)q(de.)36 b(F)l(or)22 b(example,)e(if)-57 191 y(it)f(is)h(imp)q(ortan)o(t)e(to)i(kno)o (w)g(the)g(o)o(v)o(er\015o)o(w)f(threshold)g(for)h(scaling)g(purp)q(oses,)h (it)e(m)o(ust)g(b)q(e)h(determined)d(at)-57 257 y(run)o(time)h Fg(without)k(over\015owing)p Fm(,)g(since)d(o)o(v)o(er\015o)o(w)h(is)g (generally)f(fatal.)34 b(Suc)o(h)20 b(demands)f(ha)o(v)o(e)h(resulted)f(in) -57 323 y(quite)d(large)h(and)h(sophisticated)f(programs)g([28,)g(44])g(whic) o(h)f(m)o(ust)g(b)q(e)h(mo)q(di\014ed)f(frequen)o(tly)f(to)i(deal)g(with)-57 390 y(new)j(arc)o(hitectures)f(and)i(soft)o(w)o(are)f(releases.)32 b(This)20 b(\\mail-order")g(notion)g(of)h(soft)o(w)o(are)f(p)q(ortabilit)o(y) f(also)-57 456 y(means)14 b(that)i(co)q(des)g(generally)e(m)o(ust)g(b)q(e)h (written)g(for)h(the)f(w)o(orst)g(p)q(ossible)h(mac)o(hine)d(exp)q(ected)h (to)i(b)q(e)f(used,)-57 522 y(thereb)o(y)10 b(often)h(degrading)h(p)q (erformance)e(on)i(all)f(others.)20 b(Ease-of-use)12 b(is)f(also)h(enhanced)f (if)g(implem)o(en)o(tation)-57 588 
y(details)h(are)h(largely)f(hidden)h(from) e(the)i(user,)g(for)g(example,)e(through)j(the)e(use)h(of)g(an)g(ob)s (ject-based)g(in)o(terface)-57 654 y(to)k(the)f(library)f([24].)-57 812 y Fi(1.3.3)55 b(Range-Of-Use)-57 932 y Fm(Range-of-use)17 b(ma)o(y)c(b)q(e)j(gauged)g(b)o(y)f(ho)o(w)h(n)o(umericall)o(y)d(stable)i (the)g(algorithms)f(are)i(o)o(v)o(er)e(a)i(range)g(of)f(input)-57 998 y(problems,)20 b(and)g(the)h(range)f(of)h(data)g(structures)f(the)g (library)g(will)f(supp)q(ort.)35 b(F)l(or)20 b(example,)f(LINP)l(A)o(CK)-57 1064 y(and)i(EISP)l(A)o(CK)e(deal)h(with)g(dense)g(matrices)e(stored)j(in)e (a)i(rectangular)f(arra)o(y)l(,)h(pac)o(k)o(ed)e(matrices)f(where)-57 1130 y(only)g(the)g(upp)q(er)h(or)f(lo)o(w)o(er)g(half)g(of)g(a)h(symmetri)o (c)c(matrix)i(is)h(stored,)g(and)h(banded)g(matrices)d(where)i(only)-57 1197 y(the)d(nonzero)g(bands)h(are)f(stored.)21 b(In)14 b(addition,)h(some)f (sp)q(ecial)g(formats)h(suc)o(h)g(as)g(Householder)g(v)o(ectors)f(are)-57 1263 y(used)j(in)o(ternally)e(to)i(represen)o(t)f(orthogonal)j(matrices.)i (There)c(are)g(also)g(sparse)h(matrices,)c(whic)o(h)j(ma)o(y)e(b)q(e)-57 1329 y(stored)20 b(in)g(man)o(y)e(di\013eren)o(t)h(w)o(a)o(ys;)j(but)e(in)f (this)h(pap)q(er)h(w)o(e)e(fo)q(cus)i(on)f(dense)g(and)g(banded)h(matrices,)d (the)-57 1395 y(mathematical)13 b(t)o(yp)q(es)j(addressed)h(b)o(y)f(LINP)l(A) o(CK,)f(EISP)l(A)o(CK,)g(and)i(LAP)l(A)o(CK.)-57 1592 y Fh(2)83 b(The)27 b(BLAS)g(as)g(the)h(Key)e(to)i(P)n(ortabilit)n(y)-57 1729 y Fm(A)o(t)15 b(least)i(three)e(factors)i(a\013ect)f(the)g(p)q (erformance)f(of)i(p)q(ortable)g(F)l(ortran)g(co)q(de.)3 1858 y(1.)24 b Fi(V)-5 b(ectorization.)21 b Fm(Designing)c(v)o(ectorizable)e (algorithms)g(in)i(linear)f(algebra)h(is)f(usually)g(straigh)o(tfor-)65 1924 y(w)o(ard.)21 b(Indeed,)15 b(for)i(man)o(y)d(computations)i(there)f(are) h(sev)o(eral)f(v)m(arian)o(ts,)h(all)f(v)o(ectorizable,)f(but)i(with)65 1991 y(di\013eren)o(t)f(c)o(haracteristics)h(in)g(p)q(erformance)f(\(see,)g 
(for)i(example,)d([15]\).)21 b(Linear)c(algebra)g(algorithms)65 2057 y(can)i(approac)o(h)g(the)g(p)q(eak)g(p)q(erformance)e(of)i(man)o(y)f (mac)o(hines|princi)o(pally)d(b)q(ecause)k(p)q(eak)g(p)q(erfor-)65 2123 y(mance)14 b(dep)q(ends)h(on)h(some)e(form)g(of)h(c)o(haining)g(of)g(v)o (ector)f(addition)i(and)f(m)o(ultiplication)d(op)q(erations,)65 2189 y(and)20 b(this)f(is)g(just)g(what)h(the)f(algorithms)g(require.)29 b(Ho)o(w)o(ev)o(er,)17 b(when)j(the)f(algorithms)f(are)h(realized)65 2256 y(in)h(straigh)o(tforw)o(ard)g(F)l(ortran)h(77)f(co)q(de,)h(the)f(p)q (erformance)e(ma)o(y)h(fall)g(w)o(ell)g(short)h(of)g(the)g(exp)q(ected)65 2322 y(lev)o(el,)11 b(usually)i(b)q(ecause)h(v)o(ectorizing)d(F)l(ortran)j (compilers)d(fail)i(to)h(minim)o(ize)c(the)j(n)o(um)o(b)q(er)e(of)j(memory)65 2388 y(references|that)h(is,)h(the)g(n)o(um)o(b)q(er)e(of)j(v)o(ector)e(load) i(and)f(store)h(op)q(erations.)3 2496 y(2.)24 b Fi(Data)13 b(mo)n(v)n(emen)n(t.)18 b Fm(What)12 b(often)f(limits)e(the)j(actual)f(p)q (erformance)g(of)h(a)g(v)o(ector,)f(or)h(scalar,)g(\015oating-)65 2562 y(p)q(oin)o(t)i(unit)g(is)g(the)g(rate)g(of)h(transfer)f(of)h(data)g(b)q (et)o(w)o(een)e(di\013eren)o(t)h(lev)o(els)e(of)j(mem)o(ory)d(in)i(the)g(mac) o(hine.)65 2628 y(Examples)e(include)g(the)h(transfer)g(of)h(v)o(ector)e(op)q (erands)j(in)e(and)h(out)g(of)f(v)o(ector)g(registers,)g(the)g(transfer)65 2694 y(of)24 b(scalar)g(op)q(erands)h(in)e(and)i(out)f(of)g(a)g(high-sp)q (eed)g(scalar)g(pro)q(cessor,)i(the)e(mo)o(v)o(em)o(en)n(t)d(of)j(data)951 2825 y(8)p eop %%Page: 9 11 10 bop 65 125 a Fm(b)q(et)o(w)o(een)20 b(main)g(memory)f(and)i(a)h(high-sp)q (eed)g(cac)o(he)e(or)i(lo)q(cal)f(memory)l(,)e(paging)k(b)q(et)o(w)o(een)d (actual)65 191 y(memory)15 b(and)j(disk)f(storage)h(in)f(a)h(virtual)f (memory)e(system,)h(and)i(in)o(terpro)q(cessor)f(comm)o(unication)65 257 y(on)g(a)f(distributed)g(memory)d(concurren)o(t)j(computer.)3 365 y(3.)24 b Fi(P)n(arallelism.)42 b Fm(The)24 b(nested)g(lo)q(op)h 
(structure)e(of)i(most)e(linear)g(algebra)i(algorithms)e(o\013ers)i(con-)65 431 y(siderable)19 b(scop)q(e)h(for)g(lo)q(op-based)i(parallelism.)30 b(This)20 b(is)f(the)h(principal)f(t)o(yp)q(e)g(of)h(parallelism)e(that)65 497 y(LAP)l(A)o(CK)d(and)i(ScaLAP)l(A)o(CK)e(presen)o(tly)g(aim)f(to)i (exploit.)k(On)c(shared)g(memory)d(concurren)o(t)i(com-)65 564 y(puters,)j(this)g(t)o(yp)q(e)g(of)g(parallelism)e(can)i(sometimes)d(b)q (e)j(generated)g(automatically)f(b)o(y)g(a)i(compiler,)65 630 y(but)j(often)g(requires)e(the)i(insertion)f(of)h(compiler)e(directiv)o(es.) 35 b(On)22 b(distributed)f(memory)e(concur-)65 696 y(ren)o(t)d(computers,)f (data)j(m)o(ust)d(b)q(e)i(mo)o(v)o(ed)e(b)q(et)o(w)o(een)h(pro)q(cessors.)24 b(This)17 b(is)f(usually)h(done)g(b)o(y)f(explicit)65 762 y(calls)h(to)g (message)g(passing)h(routines,)f(although)i(parallel)d(language)i(extensions) f(suc)o(h)g(as)h(Coheren)o(t)65 828 y(P)o(arallel)d(C)i([31])f(and)h(Split-C) f([13])g(do)g(the)g(message)g(passing)h(implicitl)o(y)l(.)-57 958 y(The)d(question)h(arises,)f(\\Ho)o(w)g(can)h(w)o(e)f(ac)o(hiev)o(e)e (su\016cien)o(t)h(con)o(trol)h(o)o(v)o(er)g(these)g(three)g(factors)h(to)f (obtain)h(the)-57 1024 y(lev)o(els)f(of)j(p)q(erformance)e(that)i(mac)o (hines)d(can)i(o\013er?")23 b(The)16 b(answ)o(er)h(is)f(through)h(use)f(of)h (the)f(BLAS.)-57 1111 y(There)g(are)g(no)o(w)h(three)e(lev)o(els)g(of)h (BLAS:)-57 1253 y Fi(Lev)n(el)h(1)i(BLAS)f([45]:)24 b Fm(for)16 b(v)o(ector)f(op)q(erations,)i(suc)o(h)f(as)h Ff(y)f Fl( )d Ff(\013x)e Fm(+)g Ff(y)-57 1361 y Fi(Lev)n(el)17 b(2)i(BLAS)f([18]:)24 b Fm(for)16 b(matrix-v)o(ector)e(op)q(erations,)j(suc)o(h)f(as)h Ff(y)e Fl( )f Ff(\013Ax)c Fm(+)h Ff(\014)s(y)-57 1469 y Fi(Lev)n(el)17 b(3)i(BLAS)f([17]:)24 b Fm(for)16 b(matrix-matrix)d(op)q(erations,)k(suc)o(h) f(as)h Ff(C)g Fl( )d Ff(\013AB)f Fm(+)e Ff(\014)s(C)t Fm(.)-57 1610 y(Here,)k Ff(A)p Fm(,)g Ff(B)k Fm(and)e Ff(C)i Fm(are)e(matrices,)d Ff(x)i Fm(and)g Ff(y)i Fm(are)f(v)o(ectors,)e(and)i Ff(\013)f Fm(and)h 
Ff(\014)i Fm(are)d(scalars.)-57 1698 y(The)f(Lev)o(el)f(1)i(BLAS)f (are)g(used)g(in)g(LAP)l(A)o(CK,)g(but)g(for)g(con)o(v)o(enience)e(rather)j (than)f(for)h(p)q(erformance:)k(they)-57 1764 y(p)q(erform)15 b(an)h(insigni\014can)o(t)f(fraction)h(of)g(the)f(computation,)g(and)i(they)e (cannot)h(ac)o(hiev)o(e)e(high)i(e\016ciency)e(on)-57 1830 y(most)h(mo)q(dern)h(sup)q(ercomputers.)-57 1918 y(The)22 b(Lev)o(el)e(2)i (BLAS)f(can)h(ac)o(hiev)o(e)e(near-p)q(eak)i(p)q(erformance)e(on)j(man)o(y)d (v)o(ector)g(pro)q(cessors,)k(suc)o(h)d(as)i(a)-57 1984 y(single)16 b(pro)q(cessor)i(of)e(a)h(CRA)l(Y)f(X-MP)g(or)h(Y-MP)l(,)f(or)h(Con)o(v)o(ex) f(C-2)h(mac)o(hine.)j(Ho)o(w)o(ev)o(er,)14 b(on)j(other)g(v)o(ector)-57 2050 y(pro)q(cessors)d(suc)o(h)e(as)h(a)g(CRA)l(Y-2)g(or)f(an)h(IBM)f(3090)i (VF,)d(the)i(p)q(erformance)e(of)i(the)f(Lev)o(el)f(2)i(BLAS)g(is)f(limited) -57 2117 y(b)o(y)k(the)g(rate)g(of)h(data)g(mo)o(v)o(em)o(en)n(t)d(b)q(et)o (w)o(een)h(di\013eren)o(t)g(lev)o(els)g(of)h(memory)l(.)-57 2204 y(The)h(Lev)o(el)e(3)i(BLAS)g(o)o(v)o(ercome)d(this)j(limitation.)j (This)d(third)g(lev)o(el)d(of)j(BLAS)g(p)q(erforms)f Ff(O)q Fm(\()p Ff(n)1751 2186 y Fe(3)1771 2204 y Fm(\))h(\015oating-)-57 2270 y(p)q(oin)o(t)i(op)q(erations)h(on)f Ff(O)q Fm(\()p Ff(n)469 2252 y Fe(2)490 2270 y Fm(\))g(data,)h(whereas)f(the)f(Lev)o(el)g(2)h(BLAS)g (p)q(erform)f(only)g Ff(O)q Fm(\()p Ff(n)1633 2252 y Fe(2)1654 2270 y Fm(\))h(op)q(erations)h(on)-57 2337 y Ff(O)q Fm(\()p Ff(n)29 2319 y Fe(2)49 2337 y Fm(\))13 b(data.)21 b(The)13 b(Lev)o(el)f(3)i(BLAS)e(also)i(allo)o(w)f(us)g(to)g(exploit)f(parallelism)f (in)i(a)g(w)o(a)o(y)g(that)g(is)g(transparen)o(t)h(to)-57 2403 y(the)f(soft)o(w)o(are)g(that)h(calls)f(them.)19 b(While)12 b(the)h(Lev)o(el)f(2)i(BLAS)f(o\013er)h(some)e(scop)q(e)i(for)g(exploiting)e (parallelism,)-57 2469 y(greater)k(scop)q(e)h(is)f(pro)o(vided)g(b)o(y)f(the) h(Lev)o(el)f(3)i(BLAS,)e(as)i(T)l(able)f(1)h(illustrates.)951 2825 y(9)p eop %%Page: 10 12 11 bop -57 154 a 
Fm(T)l(able)20 b(1:)30 b(Sp)q(eed)20 b(\(Mega\015ops\))h(of) g(Lev)o(el)e(2)i(and)f(Lev)o(el)f(3)i(BLAS)f(Op)q(erations)h(on)g(a)f(CRA)l (Y)g(Y-MP)l(.)f(All)-57 220 y(matrices)14 b(are)j(of)f(order)h(500;)g Ff(U)k Fm(is)16 b(upp)q(er)h(triangular.)p 398 290 1131 2 v 397 356 2 67 v 423 336 a(Num)o(b)q(er)d(of)j(pro)q(cessors:)p 987 356 V 165 w(1)p 1110 356 V 99 w(2)p 1233 356 V 124 w(4)p 1380 356 V 123 w(8)p 1527 356 V 398 357 1131 2 v 398 367 V 397 434 2 67 v 423 414 a(Lev)o(el)e(2:)21 b Ff(y)16 b Fl( )d Ff(\013Ax)e Fm(+)g Ff(\014)s(y)p 987 434 V 88 w Fm(311)p 1110 434 V 51 w(611)p 1233 434 V 51 w(1197)p 1380 434 V 52 w(2285)p 1527 434 V 398 435 1131 2 v 397 501 2 67 v 423 482 a(Lev)o(el)k(3:)21 b Ff(C)d Fl( )13 b Ff(\013AB)h Fm(+)d Ff(\014)s(C)p 987 501 V 52 w Fm(312)p 1110 501 V 51 w(623)p 1233 501 V 51 w(1247)p 1380 501 V 52 w(2425)p 1527 501 V 398 503 1131 2 v 398 513 V 397 579 2 67 v 423 559 a(Lev)o(el)k(2:)21 b Ff(x)14 b Fl( )g Ff(U)5 b(x)p 987 579 V 230 w Fm(293)p 1110 579 V 51 w(544)p 1233 579 V 76 w(898)p 1380 579 V 51 w(1613)p 1527 579 V 398 581 1131 2 v 397 647 2 67 v 423 627 a(Lev)o(el)15 b(3:)21 b Ff(B)c Fl( )c Ff(U)5 b(B)p 987 647 V 210 w Fm(310)p 1110 647 V 51 w(620)p 1233 647 V 51 w(1240)p 1380 647 V 52 w(2425)p 1527 647 V 398 649 1131 2 v 398 659 V 397 725 2 67 v 423 705 a(Lev)o(el)15 b(2:)21 b Ff(x)14 b Fl( )g Ff(U)754 687 y Fc(\000)p Fe(1)801 705 y Ff(x)p 987 725 V 183 w Fm(272)p 1110 725 V 51 w(374)p 1233 725 V 76 w(479)p 1380 725 V 75 w(584)p 1527 725 V 398 727 1131 2 v 397 793 2 67 v 423 773 a(Lev)o(el)h(3:)21 b Ff(B)c Fl( )c Ff(U)765 755 y Fc(\000)p Fe(1)813 773 y Ff(B)p 987 793 V 162 w Fm(309)p 1110 793 V 51 w(618)p 1233 793 V 51 w(1235)p 1380 793 V 52 w(2398)p 1527 793 V 398 795 1131 2 v -57 981 a Fh(3)83 b(Blo)r(c)n(k)26 b(Algorithms)i(and)f(Their)g(Deriv)-5 b(ation)-57 1118 y Fm(It)12 b(is)g(comparativ)o(ely)d(straigh)o(tforw)o(ard)k (to)g(reco)q(de)f(man)o(y)f(of)h(the)g(algorithms)g(in)g(LINP)l(A)o(CK)f(and) i(EISP)l(A)o(CK)-57 1185 
y(so)j(that)g(they)g(call)e(Lev)o(el)h(2)h(BLAS.)f (Indeed,)g(in)g(the)g(simplest)f(cases)i(the)f(same)g(\015oating-p)q(oin)o(t) i(op)q(erations)-57 1251 y(are)h(done,)h(p)q(ossibly)g(ev)o(en)e(in)h(the)g (same)g(order:)26 b(it)17 b(is)i(just)f(a)h(matter)e(of)i(reorganizing)f(the) h(soft)o(w)o(are.)27 b(T)l(o)-57 1317 y(illustrate)20 b(this)i(p)q(oin)o(t,)g (w)o(e)f(consider)g(the)g(Cholesky)g(factorization)h(algorithm)e(used)i(in)f (the)g(LINP)l(A)o(CK)-57 1383 y(routine)14 b(SPOF)-5 b(A,)13 b(whic)o(h)g(factorizes)h(a)g(symmetric)d(p)q(ositiv)o(e)i(de\014nite)g (matrix)g(as)i Ff(A)e Fm(=)h Ff(U)1627 1365 y Fd(T)1655 1383 y Ff(U)5 b Fm(.)20 b(W)l(e)14 b(consider)-57 1449 y(Cholesky)j(factorization) h(b)q(ecause)f(the)h(algorithm)e(is)i(simple,)d(and)j(no)g(piv)o(oting)f(is)g (required.)24 b(In)18 b(Section)-57 1516 y(4)f(w)o(e)e(shall)i(consider)e (the)h(sligh)o(tly)f(more)g(complicated)f(example)g(of)j(LU)f(factorization.) -57 1603 y(Supp)q(ose)g(that)g(after)f Ff(j)c Fl(\000)e Fm(1)15 b(steps)g(the)g(blo)q(c)o(k)g Ff(A)839 1610 y Fe(00)891 1603 y Fm(in)f(the)h(upp)q(er)h(lefthand)e(corner)h(of)h Ff(A)e Fm(has)i(b)q(een)f(factored)-57 1669 y(as)f Ff(A)37 1676 y Fe(00)87 1669 y Fm(=)g Ff(U)177 1651 y Fd(T)172 1682 y Fe(00)218 1669 y Ff(U)251 1676 y Fe(00)288 1669 y Fm(.)21 b(The)13 b(next)f(ro)o(w)h (and)h(column)d(of)i(the)g(factorization)g(can)g(then)g(b)q(e)g(computed)f(b) o(y)g(writing)-57 1736 y Ff(A)h Fm(=)h Ff(U)83 1718 y Fd(T)111 1736 y Ff(U)22 b Fm(as)323 1829 y Fb(0)323 1902 y(B)323 1927 y(B)323 1954 y(@)380 1874 y Ff(A)417 1881 y Fe(00)506 1874 y Ff(b)527 1881 y Fd(j)597 1874 y Ff(A)634 1881 y Fe(02)410 1940 y Ff(:)72 b(a)522 1947 y Fd(j)r(j)610 1940 y Ff(c)631 1922 y Fd(T)631 1953 y(j)410 2007 y Ff(:)95 b(:)64 b(A)634 2014 y Fe(22)692 1829 y Fb(1)692 1902 y(C)692 1927 y(C)692 1954 y(A)742 1940 y Fm(=)794 1829 y Fb(0)794 1902 y(B)794 1927 y(B)794 1954 y(@)851 1874 y Ff(U)889 1856 y Fd(T)884 1886 y Fe(00)982 1874 y Fm(0)84 b(0)860 1940 y Ff(v)886 1922 y 
Fd(T)884 1953 y(j)963 1940 y Ff(u)991 1947 y Fd(j)r(j)1090 1940 y Fm(0)851 2007 y Ff(U)889 1988 y Fd(T)884 2019 y Fe(02)967 2007 y Ff(w)1002 2014 y Fd(j)1067 2007 y Ff(U)1105 1988 y Fd(T)1100 2019 y Fe(22)1158 1829 y Fb(1)1158 1902 y(C)1158 1927 y(C)1158 1954 y(A)1202 1829 y(0)1202 1902 y(B)1202 1927 y(B)1202 1954 y(@)1260 1874 y Ff(U)1293 1881 y Fe(00)1382 1874 y Ff(v)1406 1881 y Fd(j)1475 1874 y Ff(U)1508 1881 y Fe(02)1283 1940 y Fm(0)65 b Ff(u)1400 1947 y Fd(j)r(j)1479 1940 y Ff(w)1515 1922 y Fd(T)1514 1953 y(j)1283 2007 y Fm(0)83 b(0)61 b Ff(U)1508 2014 y Fe(22)1567 1829 y Fb(1)1567 1902 y(C)1567 1927 y(C)1567 1954 y(A)-57 2092 y Fm(where)14 b Ff(b)103 2099 y Fd(j)121 2092 y Fm(,)h Ff(c)171 2099 y Fd(j)189 2092 y Fm(,)g Ff(v)242 2099 y Fd(j)260 2092 y Fm(,)f(and)h Ff(w)416 2099 y Fd(j)449 2092 y Fm(are)g(column)e(v)o(ectors)h (of)h(length)g Ff(j)c Fl(\000)d Fm(1,)14 b(and)i Ff(a)1314 2099 y Fd(j)r(j)1362 2092 y Fm(and)g Ff(u)1484 2099 y Fd(j)r(j)1533 2092 y Fm(are)e(scalars.)22 b(Equating)-57 2159 y(co)q(e\016cien)o(ts)15 b(of)h(the)g Ff(j)352 2141 y Fd(th)404 2159 y Fm(column,)e(w)o(e)i(obtain)778 2275 y Ff(b)799 2282 y Fd(j)859 2275 y Fm(=)41 b Ff(U)976 2254 y Fd(T)971 2287 y Fe(00)1009 2275 y Ff(v)1033 2282 y Fd(j)757 2353 y Ff(a)783 2360 y Fd(j)r(j)859 2353 y Fm(=)g Ff(v)964 2333 y Fd(T)962 2366 y(j)991 2353 y Ff(v)1015 2360 y Fd(j)1044 2353 y Fm(+)11 b Ff(u)1121 2333 y Fe(2)1121 2366 y Fd(j)r(j)1155 2353 y Ff(:)-57 2469 y Fm(Since)k Ff(U)103 2476 y Fe(00)157 2469 y Fm(has)i(already)f(b)q(een)g(computed,)f(w)o(e)h(can)g(compute)f Ff(v)1153 2476 y Fd(j)1187 2469 y Fm(and)i Ff(u)1310 2476 y Fd(j)r(j)1360 2469 y Fm(from)e(the)h(equations)732 2585 y Ff(U)770 2565 y Fd(T)765 2598 y Fe(00)802 2585 y Ff(v)826 2592 y Fd(j)886 2585 y Fm(=)41 b Ff(b)986 2592 y Fd(j)782 2664 y Ff(u)810 2643 y Fe(2)810 2676 y Fd(j)r(j)886 2664 y Fm(=)g Ff(a)991 2671 y Fd(j)r(j)1036 2664 y Fl(\000)11 b Ff(v)1112 2643 y Fd(T)1110 2676 y(j)1139 2664 y Ff(v)1163 2671 y Fd(j)1181 2664 y 
Ff(:)939 2825 y Fm(10)p eop %%Page: 11 13 12 bop -57 125 a Fm(The)19 b(b)q(o)q(dy)h(of)g(the)f(co)q(de)g(of)h(the)f (LINP)l(A)o(CK)f(routine)h(SPOF)-5 b(A)18 b(that)i(implem)o(en)n(ts)d(the)i (ab)q(o)o(v)o(e)g(metho)q(d)f(is)-57 191 y(sho)o(wn)d(in)f(Figure)f(1.)21 b(The)14 b(same)f(computation)g(reco)q(ded)h(in)g(\\LAP)l(A)o(CK-st)o(yle")g (to)g(use)g(the)g(Lev)o(el)f(2)h(BLAS)-57 257 y(routine)k(STRSV)g(\(whic)o(h) f(solv)o(es)h(a)h(triangular)f(system)f(of)h(equations\))h(is)f(sho)o(wn)h (in)f(Figure)f(2.)28 b(The)18 b(call)-57 323 y(to)g(STRSV)g(has)g(replaced)f (the)h(lo)q(op)g(o)o(v)o(er)f(K)h(whic)o(h)f(made)g(sev)o(eral)f(calls)h(to)i (the)e(Lev)o(el)g(1)h(BLAS)f(routine)-57 390 y(SDOT.)d(\(F)l(or)h(reasons)g (giv)o(en)f(b)q(elo)o(w,)g(this)g(is)g(not)h(the)f(actual)h(co)q(de)f(used)h (in)f(LAP)l(A)o(CK)g(|)g(hence)g(the)g(term)-57 456 y(\\LAP)l(A)o(CK-st)o (yle".\))-57 542 y(This)d(c)o(hange)g(b)o(y)g(itself)f(is)g(su\016cien)o(t)g (to)h(result)g(in)g(large)g(gains)g(in)g(p)q(erformance)f(on)h(a)h(n)o(um)o (b)q(er)d(of)i(mac)o(hines|)-57 608 y(for)19 b(example,)f(from)g(72)i(to)f (251)i(mega\015ops)e(for)g(a)h(matrix)d(of)j(order)f(500)h(on)g(one)g(pro)q (cessor)g(of)f(a)h(CRA)l(Y)-57 674 y(Y-MP)l(.)14 b(Since)h(this)g(is)g (81\045)h(of)f(the)g(p)q(eak)h(sp)q(eed)f(of)h(matrix-matrix)c(m)o(ultipli)o (cation)h(on)i(this)g(pro)q(cessor,)h(w)o(e)-57 740 y(cannot)h(hop)q(e)g(to)f (do)h(v)o(ery)e(m)o(uc)o(h)f(b)q(etter)i(b)o(y)g(using)h(Lev)o(el)e(3)h (BLAS.)-57 826 y(W)l(e)i(can,)g(ho)o(w)o(ev)o(er,)f(restructure)h(the)g (algorithm)f(at)i(a)f(deep)q(er)g(lev)o(el)e(to)j(exploit)e(the)h(faster)g (sp)q(eed)h(of)f(the)-57 893 y(Lev)o(el)c(3)i(BLAS.)f(This)g(restructuring)g (in)o(v)o(olv)o(es)e(recasting)j(the)f(algorithm)f(as)i(a)g Fi(blo)r(c)n(k)h(algorithm)p Fm(|that)-57 959 y(is,)f(an)g(algorithm)g(that)g (op)q(erates)h(on)g Fi(blo)r(c)n(ks)f Fm(or)g(submatrices)f(of)i(the)f (original)g(matrix.)-57 1129 y Fo(3.1)70 b(Deriving)21 b(a)i(Blo)r(c)n(k)f 
(Algorithm)-57 1247 y Fm(T)l(o)c(deriv)o(e)e(a)i(blo)q(c)o(k)g(form)e(of)i (Cholesky)g(factorization,)f(w)o(e)g(partition)h(the)g(matrices)d(as)k(sho)o (wn)f(in)f(Figure)-57 1313 y(4,)i(in)f(whic)o(h)g(the)h(diagonal)g(blo)q(c)o (ks)f(of)h Ff(A)f Fm(and)h Ff(U)24 b Fm(are)19 b(square,)g(but)g(of)f (di\013ering)h(sizes.)27 b(W)l(e)19 b(assume)f(that)-57 1379 y(the)g(\014rst)g(blo)q(c)o(k)f(has)i(already)e(b)q(een)h(factored)g(as)g Ff(A)928 1386 y Fe(00)982 1379 y Fm(=)e Ff(U)1074 1361 y Fd(T)1069 1392 y Fe(00)1107 1379 y Ff(U)1140 1386 y Fe(00)1177 1379 y Fm(,)i(and)h(that)f(w)o(e)f(no)o(w)i(w)o(an)o(t)e(to)i(determine)-57 1445 y(the)c(second)g(blo)q(c)o(k)g(column)f(of)h Ff(U)21 b Fm(consisting)15 b(of)h(the)f(blo)q(c)o(ks)g Ff(U)1130 1452 y Fe(01)1183 1445 y Fm(and)h Ff(U)1310 1452 y Fe(11)1347 1445 y Fm(.)21 b(Equating)16 b(submatrices)d(in)i(the)-57 1512 y(second)h(blo)q(c) o(k)g(of)h(columns,)d(w)o(e)i(obtain)688 1616 y Ff(A)725 1623 y Fe(01)803 1616 y Fm(=)42 b Ff(U)921 1596 y Fd(T)916 1628 y Fe(00)953 1616 y Ff(U)986 1623 y Fe(01)688 1695 y Ff(A)725 1702 y Fe(11)803 1695 y Fm(=)g Ff(U)921 1674 y Fd(T)916 1707 y Fe(01)953 1695 y Ff(U)986 1702 y Fe(01)1035 1695 y Fm(+)11 b Ff(U)1122 1674 y Fd(T)1117 1707 y Fe(11)1154 1695 y Ff(U)1187 1702 y Fe(11)1225 1695 y Ff(:)-57 1799 y Fm(Hence,)i(since)g Ff(U)250 1806 y Fe(00)301 1799 y Fm(has)h(already)g(b)q(een)g(computed,)e(w)o (e)i(can)g(compute)e Ff(U)1289 1806 y Fe(01)1340 1799 y Fm(as)j(the)e (solution)h(to)h(the)e(equation)781 1904 y Ff(U)819 1883 y Fd(T)814 1916 y Fe(00)852 1904 y Ff(U)885 1911 y Fe(01)936 1904 y Fm(=)h Ff(A)1025 1911 y Fe(01)-57 2008 y Fm(b)o(y)i(a)g(call)g(to)g (the)g(Lev)o(el)f(3)i(BLAS)f(routine)g(STRSM;)g(and)h(then)f(w)o(e)g(can)g (compute)f Ff(U)1564 2015 y Fe(11)1618 2008 y Fm(from)674 2112 y Ff(U)712 2092 y Fd(T)707 2125 y Fe(11)744 2112 y Ff(U)777 2119 y Fe(11)828 2112 y Fm(=)f Ff(A)917 2119 y Fe(11)965 2112 y Fl(\000)d Ff(U)1053 2092 y Fd(T)1048 2125 y Fe(01)1085 2112 y Ff(U)1118 2119 
y Fe(01)1156 2112 y Ff(:)-57 2237 y Fm(This)16 b(in)o(v)o(olv)o(es)d(\014rst)j(up)q(dating)g(the)f(symmetric)d(submatrix)i Ff(A)1130 2244 y Fe(11)1183 2237 y Fm(b)o(y)h(a)g(call)g(to)h(the)f(Lev)o(el) f(3)i(BLAS)f(routine)-57 2303 y(SSYRK,)f(and)i(then)f(computing)f(its)g (Cholesky)h(factorization.)21 b(Since)14 b(F)l(ortran)i(do)q(es)f(not)h(allo) o(w)f(recursion,)-57 2369 y(a)20 b(separate)f(routine)g(m)o(ust)f(b)q(e)h (called)f(\(using)i(Lev)o(el)e(2)i(BLAS)e(rather)i(than)f(Lev)o(el)f(3\),)i (named)e(SPOTF2)-57 2435 y(in)g(Figure)f(3.)27 b(In)17 b(this)h(w)o(a)o(y)l (,)f(successiv)o(e)g(blo)q(c)o(ks)g(of)i(columns)d(of)i Ff(U)24 b Fm(are)18 b(computed.)24 b(The)18 b(LAP)l(A)o(CK-st)o(yle)-57 2501 y(co)q(de)g(for)h(the)e(blo)q(c)o(k)h(algorithm)f(is)h(sho)o(wn)h(in)e (Figure)h(3.)27 b(This)18 b(co)q(de)g(runs)h(at)f(49)h(mega\015ops)f(on)h(an) f(IBM)-57 2568 y(3090,)e(more)e(than)i(double)f(the)g(sp)q(eed)g(of)h(the)f (LINP)l(A)o(CK)f(co)q(de.)21 b(On)15 b(a)h(CRA)l(Y)e(Y-MP)l(,)g(the)h(use)g (of)h(Lev)o(el)e(3)-57 2634 y(BLAS)j(squeezes)f(a)h(little)f(more)f(p)q (erformance)h(out)i(of)f(one)h(pro)q(cessor,)f(but)h(mak)o(es)d(a)j(large)f (impro)o(v)o(em)o(e)o(n)o(t)-57 2700 y(when)f(using)h(all)f(8)g(pro)q (cessors.)939 2825 y(11)p eop %%Page: 12 14 13 bop 107 77 1713 2 v 107 1178 2 1101 v 129 144 a Fk(do)25 b(j)g(=)g(0,)g(n-1)180 210 y(info)f(=)i(j)f(+)g(1)180 276 y(s)h(=)f(0.0e0)180 342 y(jm1)g(=)g(j)180 409 y(if)g(\(jm1)f(.ge.)50 b(1\))25 b(then)232 475 y(do)f(k)i(=)f(0,)g(jm1)f(-)i(1)283 541 y(t)f(=)g(a\(k,j\))f(-)h (sdot\(k,a\(0)o(,k\))o(,1,)o(a\()o(0,j)o(\),1)o(\))283 607 y(t)g(=)g(t/a\(k,k\))283 673 y(a\(k,j\))e(=)i(t)283 740 y(s)g(=)g(s)h(+)f (t*t)232 806 y(end)f(do)180 872 y(end)h(if)180 938 y(s)h(=)f(a\(j,j\))e(-)i (s)180 1004 y(if)g(\(s)g(.le.)50 b(0.0e0\))23 b(go)i(to)g(40)180 1071 y(a\(j,j\))f(=)h(sqrt\(s\))180 1137 y(end)g(do)p 1818 1178 V 107 1180 1713 2 v 101 1287 a Fm(Figure)16 b(1:)21 b(The)c(b)q(o)q(dy)g 
(of)f(the)g(LINP)l(A)o(CK)g(routine)g(SPOF)-5 b(A)15 b(for)i(Cholesky)f (factorization.)p 107 1377 V 107 1882 2 505 v 129 1445 a Fk(do)25 b(j)g(=)g(0,)g(n)h(-)f(1)180 1511 y(call)f(strsv\()g('upper',)f('transpos)o (e',)f('non-unit)o(',)g(j,)j(a,)g(lda,)129 1577 y(a\(0,j\),)e(1)i(\))180 1643 y(s)h(=)f(a\(j,j\))e(-)i(sdot\()f(j,)h(a\(0,j\),)e(1,)i(a\(0,j\),)e(1)i (\))180 1709 y(if)g(\()g(s)h(.le.)49 b(zero)25 b(\))g(go)g(to)g(20)180 1776 y(a\(j,j\))f(=)h(sqrt\()f(s)h(\))129 1842 y(end)g(do)p 1818 1882 V 107 1884 1713 2 v 28 1992 a Fm(Figure)16 b(2:)22 b(The)16 b(b)q(o)q(dy)h(of)g(the)f(\\LAP)l(A)o(CK-st)o(yle")f(routine)h(SPOF) -5 b(A)16 b(for)g(Cholesky)g(factorization.)p 107 2082 V 107 2852 2 770 v 129 2149 a Fk(do)25 b(j)g(=)g(0,)g(n-1,)f(nb)180 2216 y(jb)h(=)g(min\()g(nb,)f(n-j)h(\))180 2282 y(call)f(strsm\()g('left',)f ('upper',)g('transpos)o(e',)f('non-unit')o(,)g(j,)j(jb,)129 2348 y(one,)539 2414 y(a,)g(lda,)f(a\(0,j\),)f(lda)i(\))180 2480 y(call)f(ssyrk\()g('upper',)f('transpos)o(e',)f(jb,)i(j,)h(-one,)f (a\(0,j\),)f(lda,)129 2547 y(one,)539 2613 y(a\(j,j\),)g(lda)i(\))180 2679 y(call)f(spotf2\()f('upper',)g(jb,)i(a\(j,j\),)e(lda,)h(info)g(\))180 2745 y(if\()h(info)f(.ne.)50 b(0)25 b(\))g(go)g(to)g(20)129 2811 y(end)g(do)p 1818 2852 V 107 2854 1713 2 v -57 2961 a Fm(Figure)14 b(3:)20 b(The)14 b(b)q(o)q(dy)i(of)e(the)g(\\LAP)l(A)o(CK-st)o (yle")g(routine)g(SPOF)-5 b(A)13 b(for)h(blo)q(c)o(k)g(Cholesky)g (factorization.)20 b(In)-57 3028 y(this)c(co)q(de)h(fragmen)o(t,)d Fk(nb)i Fm(denotes)g(the)g(width)g(of)h(the)f(blo)q(c)o(ks.)939 2825 y(12)p eop %%Page: 13 15 14 bop -57 615 a @beginspecial @setspecial %%BeginDocument: cholesky.ps 1.5 setlinewidth 2 setlinecap 0 setlinejoin /w1 70 def /w2 35 def /w3 105 def /box w1 w2 w3 add add def /offset 8 def 5.7 72 mul box 3 mul 40 2 mul add .5780282 mul sub 2 div 0 translate /bigfont {/Helvetica findfont 20 scalefont setfont} def /smallfont {/Helvetica findfont 12 scalefont setfont} def /mathfont {/Symbol findfont 20 
scalefont setfont} def /Label_Block{ /ywid exch def /xwid exch def /ycorner exch def /xcorner exch def /superscript exch def /subscript exch def /label exch def xcorner ycorner moveto xwid smallfont subscript stringwidth pop bigfont label stringwidth pop add 1 add sub 2 div ywid 0.5 mul offset sub rmoveto label show currentpoint 1 -6 rmoveto smallfont subscript show moveto 1 10 rmoveto superscript show } def .5780282 dup scale 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath w1 0 moveto 0 box rlineto w1 w2 add 0 moveto 0 box rlineto 0 w3 moveto box 0 rlineto 0 w3 w2 add moveto box 0 rlineto stroke (A) (00) () 0.0 w2 w3 add w1 w1 Label_Block (A) (01) () w1 w2 w3 add w2 w1 Label_Block (A) (02) () w1 w2 add w2 w3 add w3 w1 Label_Block (A) (01) (T) 0.0 w3 w1 w2 Label_Block (A) (11) () w1 w3 w2 w2 Label_Block (A) (12) () w1 w2 add w3 w3 w2 Label_Block (A) (02) (T) 0.0 0.0 w1 w3 Label_Block (A) (12) (T) w1 0.0 w2 w3 Label_Block (A) (22) () w1 w2 add 0.0 w3 w3 Label_Block mathfont (=) dup stringwidth pop box 20 add exch .5 mul sub box .5 mul offset sub moveto show box 40 add 0 translate 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath w1 0 moveto 0 box rlineto w1 w2 add 0 moveto 0 box rlineto 0 w3 moveto box 0 rlineto 0 w3 w2 add moveto box 0 rlineto stroke (U) (00) (T) 0.0 w2 w3 add w1 w1 Label_Block (0) () () w1 w2 w3 add w2 w1 Label_Block (0) () () w1 w2 add w2 w3 add w3 w1 Label_Block (U) (01) (T) 0.0 w3 w1 w2 Label_Block (U) (11) (T) w1 w3 w2 w2 Label_Block (0) () () w1 w2 add w3 w3 w2 Label_Block (U) (02) (T) 0.0 0.0 w1 w3 Label_Block (U) (12) (T) w1 0.0 w2 w3 Label_Block (U) (22) (T) w1 w2 add 0.0 w3 w3 Label_Block mathfont (*) dup stringwidth pop box 20 add exch .5 mul sub box .5 mul offset sub moveto show box 40 add 0 translate 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath w1 0 moveto 0 box rlineto w1 w2 add 0 moveto 0 box rlineto 0 w3 moveto box 0 rlineto 0 w3 w2 add moveto box 0 rlineto stroke (U) (00) () 
0.0 w2 w3 add w1 w1 Label_Block (U) (01) () w1 w2 w3 add w2 w1 Label_Block (U) (02) () w1 w2 add w2 w3 add w3 w1 Label_Block (0) () () 0.0 w3 w1 w2 Label_Block (U) (11) () w1 w3 w2 w2 Label_Block (U) (12) () w1 w2 add w3 w3 w2 Label_Block (0) () () 0.0 0.0 w1 w3 Label_Block (0) () () w1 0.0 w2 w3 Label_Block (U) (22) () w1 w2 add 0.0 w3 w3 Label_Block %%EndDocument @endspecial 108 x Fm(Figure)13 b(4:)21 b(P)o(artitioning)14 b(of)g Ff(A)p Fm(,)f Ff(U)579 705 y Fd(T)607 723 y Fm(,)h(and)h Ff(U)k Fm(in)o(to)13 b(blo)q(c)o(ks.)20 b(It)14 b(is)g(assumed)f(that)h(the)g (\014rst)g(blo)q(c)o(k)g(has)g(already)-57 789 y(b)q(een)k(factored)g(as)h Ff(A)349 796 y Fe(00)404 789 y Fm(=)e Ff(U)497 771 y Fd(T)492 801 y Fe(00)529 789 y Ff(U)562 796 y Fe(00)600 789 y Fm(,)h(and)h(w)o(e)f (next)f(w)o(an)o(t)i(to)f(determine)e(the)i(blo)q(c)o(k)g(column)e (consisting)j(of)-57 855 y Ff(U)-24 862 y Fe(01)30 855 y Fm(and)e Ff(U)158 862 y Fe(11)195 855 y Fm(.)k(Note)16 b(that)h(the)f(diagonal)h(blo)q (c)o(ks)f(of)g Ff(A)g Fm(and)h Ff(U)22 b Fm(are)16 b(square)g(matrices.)146 1005 y(T)l(able)g(2:)22 b(Sp)q(eed)16 b(\(Mega\015ops\))h(of)g(Cholesky)f(F)l (actorization)g Ff(A)d Fm(=)h Ff(U)1450 987 y Fd(T)1478 1005 y Ff(U)22 b Fm(for)16 b Ff(n)g Fm(=)h(500)p 88 1077 1750 2 v 87 1144 2 67 v 786 1144 V 811 1124 a(IBM)e(3090)j(VF,)p 1144 1144 V 49 w(CRA)l(Y)d(Y-MP)l(,)p 1491 1144 V 49 w(CRA)l(Y)h(Y-MP)l(,)p 1837 1144 V 87 1210 V 786 1210 V 972 1190 a(1)h(pro)q(c.)p 1144 1210 V 199 w(1)g(pro)q(c.)p 1491 1210 V 198 w(8)g(pro)q(c.)p 1837 1210 V 88 1212 1750 2 v 87 1278 2 67 v 113 1258 a Ff(j)s Fm(-v)m(arian)o(t:)k(LINP)l(A)o(CK)p 786 1278 V 447 w(23)p 1144 1278 V 299 w(72)p 1491 1278 V 298 w(72)p 1837 1278 V 87 1344 V 113 1324 a Ff(j)s Fm(-v)m(arian)o(t:)g(using)c(Lev)o(el)e(2)i(BLAS)p 786 1344 V 244 w(24)p 1144 1344 V 274 w(251)p 1491 1344 V 275 w(378)p 1837 1344 V 87 1410 V 113 1390 a Ff(j)s Fm(-v)m(arian)o(t:)k(using)c (Lev)o(el)e(3)i(BLAS)p 786 1410 V 244 w(49)p 1144 1410 V 274 w(287)p 1491 
1410 V 250 w(1225)p 1837 1410 V 87 1476 V 113 1457 a Ff(i)p Fm(-v)m(arian)o(t:)k(using)c(Lev)o(el)e(3)h(BLAS)p 786 1476 V 251 w(50)p 1144 1476 V 274 w(290)p 1491 1476 V 250 w(1414)p 1837 1476 V 88 1478 1750 2 v -57 1665 a(But)d(that)h(is)f(not)h(the) f(end)h(of)g(the)f(story)l(,)h(and)g(the)f(co)q(de)h(giv)o(en)e(ab)q(o)o(v)o (e)i(is)f(not)h(the)f(co)q(de)h(actually)f(used)g(in)g(the)-57 1731 y(LAP)l(A)o(CK)20 b(routine)g(SPOTRF.)g(W)l(e)g(men)o(tioned)e(earlier)h (that)h(for)h(man)o(y)d(linear)i(algebra)g(computations)-57 1797 y(there)g(are)h(sev)o(eral)e(algorithmic)g(v)m(arian)o(ts,)i(often)g (referred)e(to)i(as)g Ff(i)p Fm(-,)h Ff(j)s Fm(-,)f(and)g Ff(k)r Fm(-v)m(arian)o(ts,)h(according)f(to)-57 1863 y(a)e(con)o(v)o(en)o(tion)f(in) o(tro)q(duced)g(in)h([15])g(and)g(used)h(in)e([36].)29 b(The)19 b(same)f(is)h(true)g(of)g(the)g(corresp)q(onding)h(blo)q(c)o(k)-57 1930 y(algorithms.)-57 2017 y(It)15 b(turns)g(out)h(that)g(the)f Ff(j)s Fm(-v)m(arian)o(t)g(c)o(hosen)h(for)f(LINP)l(A)o(CK,)f(and)i(used)f (in)g(the)g(ab)q(o)o(v)o(e)g(examples,)e(is)j(not)f(the)-57 2084 y(fastest)k(on)h(man)o(y)d(mac)o(hines,)g(b)q(ecause)i(it)g(p)q(erforms) f(most)g(of)h(the)g(w)o(ork)g(in)f(solving)h(triangular)g(systems)-57 2150 y(of)g(equations,)f(whic)o(h)g(can)g(b)q(e)h(signi\014can)o(tly)e(slo)o (w)o(er)h(than)g(matrix-matrix)e(m)o(ultipli)o(cation.)25 b(The)18 b(v)m(arian)o(t)-57 2216 y(actually)c(used)h(in)f(LAP)l(A)o(CK)h(is)f(the)h Ff(i)p Fm(-v)m(arian)o(t,)f(whic)o(h)g(relies)g(on)h(matrix-matrix)c(m)o (ultiplication)h(for)j(most)-57 2282 y(of)i(the)f(w)o(ork.)-57 2370 y(T)l(able)g(2)h(summarizes)c(the)j(results.)939 2825 y(13)p eop %%Page: 14 16 15 bop 76 154 a Fm(T)l(able)17 b(3:)k(Sp)q(eed)16 b(\(Mega\015ops\))i(of)e (SGETRF/DGETRF)i(for)e(Square)h(Matrices)e(of)i(Order)f Ff(n)p 82 226 1763 2 v 81 292 2 67 v 106 272 a Fm(Mac)o(hine)p 736 292 V 516 w(No.)21 b(of)p 1002 292 V 91 w(Blo)q(c)o(k)p 1172 292 V 1181 292 V 244 w(V)l(alues)16 b(of)g Ff(n)p 1844 292 V 1181 294 664 2 
v 81 359 2 67 v 736 359 V 762 339 a Fm(pro)q(cessors)p 1002 359 V 73 w(size)p 1172 359 V 1181 359 V 79 w(100)p 1303 359 V 51 w(200)p 1426 359 V 51 w(300)p 1549 359 V 75 w(400)p 1696 359 V 75 w(500)p 1844 359 V 82 360 1763 2 v 81 426 2 67 v 106 407 a(IBM)g(RISC/6000-530)p 736 426 V 314 w(1)p 1002 426 V 181 w(32)p 1172 426 V 1181 426 V 119 w(19)p 1303 426 V 75 w(25)p 1426 426 V 75 w(29)p 1549 426 V 99 w(31)p 1696 426 V 100 w(33)p 1844 426 V 81 493 V 106 473 a(Allian)o(t)f(FX/8)p 736 493 V 472 w(8)p 1002 493 V 181 w(16)p 1172 493 V 1181 493 V 144 w(9)p 1303 493 V 74 w(26)p 1426 493 V 75 w(32)p 1549 493 V 99 w(46)p 1696 493 V 100 w(57)p 1844 493 V 81 559 V 106 539 a(IBM)h(3090J)h(VF)p 736 559 V 431 w(1)p 1002 559 V 181 w(64)p 1172 559 V 1181 559 V 119 w(23)p 1303 559 V 75 w(41)p 1426 559 V 75 w(52)p 1549 559 V 99 w(58)p 1696 559 V 100 w(63)p 1844 559 V 81 625 V 106 605 a(Con)o(v)o(ex)f(C-240)p 736 625 V 455 w(4)p 1002 625 V 181 w(64)p 1172 625 V 1181 625 V 119 w(31)p 1303 625 V 75 w(60)p 1426 625 V 75 w(82)p 1549 625 V 75 w(100)p 1696 625 V 75 w(112)p 1844 625 V 81 691 V 106 671 a(CRA)l(Y)g(Y-MP)p 736 691 V 464 w(1)p 1002 691 V 193 w(1)p 1172 691 V 1181 691 V 107 w(132)p 1303 691 V 51 w(219)p 1426 691 V 51 w(254)p 1549 691 V 75 w(272)p 1696 691 V 75 w(283)p 1844 691 V 81 757 V 106 738 a(CRA)l(Y-2)p 736 757 V 571 w(1)p 1002 757 V 181 w(64)p 1172 757 V 1181 757 V 95 w(110)p 1303 757 V 51 w(211)p 1426 757 V 51 w(292)p 1549 757 V 75 w(318)p 1696 757 V 75 w(358)p 1844 757 V 81 824 V 106 804 a(Siemens/F)l(ujitsu)f(VP)h (400-EX)p 736 824 V 147 w(1)p 1002 824 V 181 w(64)p 1172 824 V 1181 824 V 119 w(46)p 1303 824 V 51 w(132)p 1426 824 V 51 w(222)p 1549 824 V 75 w(309)p 1696 824 V 75 w(397)p 1844 824 V 81 890 V 106 870 a(NEC)h(SX2)p 736 890 V 542 w(1)p 1002 890 V 193 w(1)p 1172 890 V 1181 890 V 107 w(118)p 1303 890 V 51 w(274)p 1426 890 V 51 w(412)p 1549 890 V 75 w(504)p 1696 890 V 75 w(577)p 1844 890 V 81 956 V 106 936 a(CRA)l(Y)f(Y-MP)p 736 956 V 464 w(8)p 1002 956 V 181 
w(64)p 1172 956 V 1181 956 V 95 w(195)p 1303 956 V 51 w(556)p 1426 956 V 51 w(920)p 1549 956 V 51 w(1188)p 1696 956 V 51 w(1408)p 1844 956 V 82 958 1763 2 v -57 1145 a Fo(3.2)70 b(Examples)22 b(of)h(Blo)r(c)n(k)f(Algorithms)f (in)i(LAP)-6 b(A)n(CK)-57 1264 y Fm(Ha)o(ving)21 b(discussed)h(in)g(detail)f (the)h(deriv)m(ation)f(of)i(one)f(particular)f(blo)q(c)o(k)h(algorithm,)g(w)o (e)f(no)o(w)i(describ)q(e)-57 1331 y(examples)14 b(of)i(the)g(p)q(erformance) f(ac)o(hiev)o(ed)g(with)g(t)o(w)o(o)h(w)o(ell-kno)o(wn)g(blo)q(c)o(k)f (algorithms:)21 b(LU)16 b(and)g(Cholesky)-57 1397 y(factorizations.)40 b(No)22 b(extra)g(\015oating-p)q(oin)o(t)h(op)q(erations)h(nor)f(extra)f(w)o (orking)g(storage)h(are)g(required)e(for)-57 1463 y(either)f(of)g(these)h (simple)d(blo)q(c)o(k)i(algorithms.)33 b(\(See)20 b(Galliv)m(an)h(et)f(al.)g ([33])g(and)i(Dongarra)g(et)e(al.)g([19)q(])g(for)-57 1529 y(surv)o(eys)c(of)g(algorithms)g(for)g(dense)g(linear)g(algebra)g(on)h (high-p)q(erformance)f(computers.\))-57 1617 y(T)l(able)23 b(3)h(illustrates)e(the)h(sp)q(eed)h(of)f(the)g(LAP)l(A)o(CK)g(routine)g(for) h(LU)f(factorization)g(of)h(a)f(real)g(matrix,)-57 1683 y(SGETRF)16 b(in)f(single)h(precision)e(on)i(CRA)l(Y)f(mac)o(hines,)f(and)i(DGETRF)g(in)f (double)h(precision)f(on)h(all)f(other)-57 1749 y(mac)o(hines.)21 b(Th)o(us,)c(64-bit)g(\015oating-p)q(oin)o(t)i(arithmetic)14 b(is)j(used)g(on)g(all)f(mac)o(hines)f(tested.)23 b(A)16 b(blo)q(c)o(k)h (size)f(of)-57 1815 y(1)h(means)g(that)g(the)g(un)o(blo)q(c)o(k)o(ed)f (algorithm)g(is)h(used,)f(since)h(it)f(is)h(faster)g(than)h({)f(or)h(at)f (least)g(as)h(fast)f(as)h({)g(a)-57 1882 y(blo)q(c)o(k)e(algorithm.)-57 1969 y(LAP)l(A)o(CK)c(is)g(designed)h(to)g(giv)o(e)e(high)i(e\016ciency)d(on) j(v)o(ector)f(pro)q(cessors,)i(high-p)q(erformance)e(\\sup)q(erscalar")-57 2036 y(w)o(orkstations,)22 b(and)f(shared)f(memory)e(m)o(ultipro)q(cessors.) 
32 b(LAP)l(A)o(CK)20 b(in)g(its)g(presen)o(t)f(form)g(is)i(less)f(lik)o(ely) -57 2102 y(to)f(giv)o(e)f(go)q(o)q(d)j(p)q(erformance)d(on)i(other)f(t)o(yp)q (es)g(of)g(parallel)f(arc)o(hitectures)g(\(for)h(example,)e(massiv)o(ely)f (par-)-57 2168 y(allel)i(SIMD)i(mac)o(hines,)d(or)j(MIMD)f(distributed)g (memory)e(mac)o(hines\),)h(but)h(the)h(ScaLAP)l(A)o(CK)f(pro)s(ject,)-57 2234 y(describ)q(ed)d(in)g(Section)g(1.1.4,)g(is)g(in)o(tended)g(to)g(adapt)i (LAP)l(A)o(CK)e(to)h(these)f(new)g(arc)o(hitectures.)k(LAP)l(A)o(CK)-57 2300 y(can)c(also)h(b)q(e)g(used)f(satisfactorily)g(on)h(all)e(t)o(yp)q(es)h (of)h(scalar)f(mac)o(hines)f(\(PCs,)h(w)o(orkstations,)h(mainframes\).)-57 2388 y(T)l(able)f(4)h(giv)o(es)e(similar)f(results)i(for)h(Cholesky)f (factorization,)g(extending)f(the)h(results)g(giv)o(en)f(in)h(T)l(able)h(2.) -57 2476 y(LAP)l(A)o(CK,)d(lik)o(e)g(LINP)l(A)o(CK,)g(pro)o(vides)h(LU)g(and) h(Cholesky)f(factorizations)g(of)h(band)g(matrices.)j(The)c(LIN-)-57 2542 y(P)l(A)o(CK)e(algorithms)g(can)g(easily)g(b)q(e)h(restructured)f(to)g (use)h(Lev)o(el)e(2)i(BLAS,)f(though)h(restructuring)g(has)g(little)-57 2608 y(e\013ect)j(on)h(p)q(erformance)f(for)h(matrices)e(of)i(v)o(ery)e (narro)o(w)i(bandwidth.)27 b(It)17 b(is)g(also)i(p)q(ossible)e(to)h(use)g (Lev)o(el)f(3)-57 2674 y(BLAS,)i(at)i(the)f(price)f(of)h(doing)h(some)e (extra)h(w)o(ork)g(with)g(zero)f(elemen)o(ts)f(outside)i(the)g(band)g([22)q (].)32 b(This)939 2825 y(14)p eop %%Page: 15 17 16 bop -57 154 a Fm(T)l(able)16 b(4:)21 b(Sp)q(eed)16 b(\(Mega\015ops\))g(of) g(SPOTRF/DPOTRF)h(for)f(Matrices)e(of)i(Order)g Ff(n)p Fm(.)21 b(Here)14 b(UPLO)i(=)g(`U',)-57 220 y(so)h(the)f(factorization)g(is)g(of)h (the)f(form)f Ff(A)e Fm(=)h Ff(U)814 202 y Fd(T)842 220 y Ff(U)5 b Fm(.)p 82 280 1763 2 v 81 346 2 67 v 106 326 a(Mac)o(hine)p 736 346 V 516 w(No.)21 b(of)p 1002 346 V 91 w(Blo)q(c)o(k)p 1172 346 V 1181 346 V 244 w(V)l(alues)16 b(of)g Ff(n)p 1844 346 V 1181 348 664 2 v 81 412 2 67 v 736 412 V 762 392 a 
Fm(pro)q(cessors)p 1002 412 V 73 w(size)p 1172 412 V 1181 412 V 79 w(100)p 1303 412 V 51 w(200)p 1426 412 V 51 w(300)p 1549 412 V 75 w(400)p 1696 412 V 75 w(500)p 1844 412 V 82 414 1763 2 v 81 480 2 67 v 106 460 a(IBM)g(RISC/6000-530)p 736 480 V 314 w(1)p 1002 480 V 181 w(32)p 1172 480 V 1181 480 V 119 w(21)p 1303 480 V 75 w(29)p 1426 480 V 75 w(34)p 1549 480 V 99 w(36)p 1696 480 V 100 w(38)p 1844 480 V 81 546 V 106 526 a(Allian)o(t)f(FX/8)p 736 546 V 472 w(8)p 1002 546 V 181 w(16)p 1172 546 V 1181 546 V 119 w(10)p 1303 546 V 75 w(27)p 1426 546 V 75 w(40)p 1549 546 V 99 w(49)p 1696 546 V 100 w(52)p 1844 546 V 81 613 V 106 593 a(IBM)h(3090J)h(VF)p 736 613 V 431 w(1)p 1002 613 V 181 w(48)p 1172 613 V 1181 613 V 119 w(26)p 1303 613 V 75 w(43)p 1426 613 V 75 w(56)p 1549 613 V 99 w(62)p 1696 613 V 100 w(67)p 1844 613 V 81 679 V 106 659 a(Con)o(v)o(ex)f(C-240)p 736 679 V 455 w(4)p 1002 679 V 181 w(64)p 1172 679 V 1181 679 V 119 w(32)p 1303 679 V 75 w(63)p 1426 679 V 75 w(82)p 1549 679 V 99 w(96)p 1696 679 V 75 w(103)p 1844 679 V 81 745 V 106 725 a(CRA)l(Y)g(Y-MP)p 736 745 V 464 w(1)p 1002 745 V 193 w(1)p 1172 745 V 1181 745 V 107 w(126)p 1303 745 V 51 w(219)p 1426 745 V 51 w(257)p 1549 745 V 75 w(275)p 1696 745 V 75 w(285)p 1844 745 V 81 811 V 106 791 a(CRA)l(Y-2)p 736 811 V 571 w(1)p 1002 811 V 181 w(64)p 1172 811 V 1181 811 V 95 w(109)p 1303 811 V 51 w(213)p 1426 811 V 51 w(294)p 1549 811 V 75 w(318)p 1696 811 V 75 w(362)p 1844 811 V 81 877 V 106 858 a(Siemens/F)l(ujitsu)f(VP)h (400-EX)p 736 877 V 147 w(1)p 1002 877 V 193 w(1)p 1172 877 V 1181 877 V 131 w(53)p 1303 877 V 51 w(145)p 1426 877 V 51 w(237)p 1549 877 V 75 w(312)p 1696 877 V 75 w(369)p 1844 877 V 81 944 V 106 924 a(NEC)h(SX2)p 736 944 V 542 w(1)p 1002 944 V 193 w(1)p 1172 944 V 1181 944 V 107 w(155)p 1303 944 V 51 w(387)p 1426 944 V 51 w(589)p 1549 944 V 75 w(719)p 1696 944 V 75 w(819)p 1844 944 V 81 1010 V 106 990 a(CRA)l(Y)f(Y-MP)p 736 1010 V 464 w(8)p 1002 1010 V 181 w(32)p 1172 1010 V 1181 1010 V 95 
w(146)p 1303 1010 V 51 w(479)p 1426 1010 V 51 w(845)p 1549 1010 V 51 w(1164)p 1696 1010 V 51 w(1393)p 1844 1010 V 82 1011 1763 2 v -57 1197 a(pro)q(cess)h(b)q(ecomes)e(w)o(orth)o(while)g(for) i(large)f(matrices)e(and)j(semi-bandwidth)e(greater)i(than)g(100)g(or)g(so.) -57 1394 y Fh(4)83 b(LU)27 b(F)-7 b(actorization)-57 1531 y Fm(In)21 b(this)g(section,)h(w)o(e)e(\014rst)i(discuss)f(the)g(uses)h(of)f (dense)g(LU)h(factorization)f(in)g(sev)o(eral)f(\014elds.)36 b(W)l(e)21 b(next)-57 1597 y(dev)o(elop)f(a)h(blo)q(c)o(k-partitioned)g(v)o (ersion)f(of)h(the)g Ff(k)r Fm(,)h(or)f(righ)o(t-lo)q(oking,)h(v)m(arian)o(t) f(of)g(the)g(LU)g(factorization)-57 1663 y(algorithm.)j(In)18 b(subsequen)o(t)f(sections,)g(the)g(parallelization)g(of)h(this)f(algorithm)g (is)g(describ)q(ed)g(in)h(detail)e(in)-57 1729 y(order)e(to)g(highligh)o(t)f (the)g(issues)h(and)g(considerations)g(that)g(m)o(ust)f(b)q(e)g(tak)o(en)h (in)o(to)f(accoun)o(t)h(in)f(dev)o(eloping)g(an)-57 1795 y(e\016cien)o(t,)f (scalable,)i(and)g(transp)q(ortable)h(dense)f(linear)f(algebra)i(library)e (for)h(MIMD,)f(distributed)g(memory)l(,)-57 1862 y(concurren)o(t)j (computers.)-57 2034 y Fo(4.1)70 b(Uses)22 b(of)h(LU)h(F)-6 b(actorization)22 b(in)g(Science)f(and)i(Engineering)-57 2154 y Fm(A)13 b(ma)s(jor)g(source)h(of)h(large)e(dense)h(linear)f(systems)g(is)h (problems)e(in)o(v)o(olving)h(the)g(solution)h(of)h(b)q(oundary)g(in)o(te-) -57 2220 y(gral)h(equations.)21 b(These)16 b(are)f(in)o(tegral)g(equations)h (de\014ned)f(on)i(the)e(b)q(oundary)i(of)f(a)g(region)g(of)g(in)o(terest.)j (All)-57 2286 y(examples)13 b(of)h(practical)g(in)o(terest)f(compute)h(some)f (in)o(termediate)e(quan)o(tit)o(y)j(on)h(a)g(t)o(w)o(o-dimensional)e(b)q (ound-)-57 2353 y(ary)21 b(and)h(then)e(use)h(this)g(information)f(to)h (compute)f(the)g(\014nal)h(desired)g(quan)o(tit)o(y)e(in)i(three-dimensional) -57 2419 y(space.)35 b(The)21 b(price)f(one)h(pa)o(ys)g(for)g(replacing)f 
(three)h(dimensions)e(with)i(t)o(w)o(o)f(is)h(that)g(what)h(started)f(as)h(a) -57 2485 y(sparse)17 b(problem)e(in)g Ff(O)q Fm(\()p Ff(n)422 2467 y Fe(3)443 2485 y Fm(\))h(v)m(ariables)g(is)g(replaced)g(b)o(y)f(a)i (dense)f(problem)f(in)h Ff(O)q Fm(\()p Ff(n)1496 2467 y Fe(2)1516 2485 y Fm(\).)-57 2573 y(Dense)g(systems)f(of)i(linear)e(equations)i(are)f (found)h(in)f(n)o(umerous)f(applications,)h(including:)16 2700 y Fl(\017)24 b Fm(airplane)16 b(wing)g(design;)939 2825 y(15)p eop %%Page: 16 18 17 bop 16 125 a Fl(\017)24 b Fm(radar)17 b(cross-section)g(studies;)16 231 y Fl(\017)24 b Fm(\015o)o(w)17 b(around)g(ships)f(and)h(other)g (o\013-shore)g(constructions;)16 338 y Fl(\017)24 b Fm(di\013usion)17 b(of)f(solid)g(b)q(o)q(dies)h(in)f(a)h(liquid;)16 445 y Fl(\017)24 b Fm(noise)16 b(reduction;)f(and)16 552 y Fl(\017)24 b Fm(di\013usion)17 b(of)f(ligh)o(t)g(through)h(small)e(particles.)-57 677 y(The)j (electromagnetics)d(comm)o(unit)n(y)g(is)j(a)g(ma)s(jor)f(user)g(of)h(dense)g (linear)f(systems)f(solv)o(ers.)26 b(Of)17 b(particular)-57 743 y(in)o(terest)h(to)h(this)f(comm)o(unit)o(y)d(is)k(the)f(solution)h(of)h (the)e(so-called)h(radar)g(cross-section)g(problem.)28 b(In)18 b(this)-57 810 y(problem,)c(a)i(signal)f(of)h(\014xed)f(frequency)f(b)q (ounces)j(o\013)f(an)g(ob)s(ject;)f(the)g(goal)h(is)g(to)g(determine)d(the)i (in)o(tensit)o(y)-57 876 y(of)j(the)g(re\015ected)e(signal)j(in)e(all)g(p)q (ossible)i(directions.)25 b(The)18 b(underlying)f(di\013eren)o(tial)f (equation)i(ma)o(y)e(v)m(ary)l(,)-57 942 y(dep)q(ending)k(on)h(the)e(sp)q (eci\014c)h(problem.)31 b(In)19 b(the)h(design)g(of)g(stealth)g(aircraft,)h (the)e(principal)g(equation)h(is)-57 1008 y(the)15 b(Helmholtz)d(equation.)21 b(T)l(o)16 b(solv)o(e)f(this)g(equation,)g(researc)o(hers)f(use)h(the)h Fg(metho)n(d)g(of)g(moments)g Fm([38,)f(56)q(].)-57 1074 y(In)21 b(the)f(case)h(of)g(\015uid)g(\015o)o(w,)h(the)e(problem)g(often)g(in)o(v)o (olv)o(es)f(solving)i(the)g(Laplace)g(or)g(P)o(oisson)h(equation.)-57 1141 
y(Here,)16 b(the)h(b)q(oundary)i(in)o(tegral)e(solution)h(is)f(kno)o(wn) h(as)g(the)f Fg(p)n(anel)i(metho)n(d)e Fm([40)q(,)g(41],)g(so)h(named)f(from) f(the)-57 1207 y(quadrilaterals)g(that)h(discretize)d(and)k(appro)o(ximate)d (a)h(structure)h(suc)o(h)f(as)h(an)g(airplane.)k(Generally)l(,)15 b(these)-57 1273 y(metho)q(ds)h(are)g(called)f Fg(b)n(oundary)i(element)j (metho)n(ds)p Fm(.)-57 1360 y(Use)g(of)h(these)f(metho)q(ds)g(pro)q(duces)h (a)f(dense)h(linear)e(system)g(of)i(size)e Ff(O)q Fm(\()p Ff(N)5 b Fm(\))22 b(b)o(y)e Ff(O)q Fm(\()p Ff(N)5 b Fm(\),)21 b(where)f Ff(N)26 b Fm(is)20 b(the)-57 1426 y(n)o(um)o(b)q(er)g(of)h(b)q(oundary)i(p)q (oin)o(ts)f(\(or)f(panels\))h(b)q(eing)f(used.)37 b(It)21 b(is)g(not)h(un)o (usual)g(to)f(see)g(size)g(3)p Ff(N)27 b Fm(b)o(y)21 b(3)p Ff(N)5 b Fm(,)-57 1493 y(b)q(ecause)16 b(of)h(three)f(ph)o(ysical)f(quan)o (tities)g(of)i(in)o(terest)e(at)h(ev)o(ery)f(b)q(oundary)i(elemen)o(t.)-57 1580 y(A)12 b(t)o(ypical)g(approac)o(h)i(to)f(solving)g(suc)o(h)f(systems)g (is)g(to)i(use)f(LU)f(factorization.)20 b(Eac)o(h)13 b(en)o(try)f(of)h(the)g (matrix)e(is)-57 1646 y(computed)h(as)i(an)g(in)o(teraction)f(of)h(t)o(w)o(o) f(b)q(oundary)i(elemen)o(ts.)j(Often,)13 b(man)o(y)f(in)o(tegrals)h(m)o(ust)f (b)q(e)i(computed.)-57 1712 y(In)g(man)o(y)g(instances,)g(the)h(time)d (required)i(to)h(compute)e(the)h(matrix)f(is)i(considerably)f(larger)g(than)i (the)e(time)-57 1778 y(for)j(solution.)-57 1865 y(Only)d(the)g(builders)f(of) i(stealth)f(tec)o(hnology)g(who)h(are)f(in)o(terested)f(in)h(radar)h (cross-sections)g(are)f(considering)-57 1932 y(using)k(direct)f(Gaussian)h (elimination)d(metho)q(ds)i(for)h(solving)g(dense)f(linear)g(systems.)24 b(These)18 b(systems)e(are)-57 1998 y(alw)o(a)o(ys)g(symmetric)c(and)17 b(complex,)d(but)i(not)h(Hermitian.)-57 2085 y(F)l(or)f(further)g (information)f(on)i(v)m(arious)f(metho)q(ds)g(for)g(solving)g(large)g(dense)g (linear)f(algebra)i(problems)e(that)-57 2151 y(arise)h(in)g(computational)f 
(\015uid)i(dynamics,)d(see)i(the)g(rep)q(ort)g(b)o(y)g(Alan)g(Edelman)f ([30].)-57 2323 y Fo(4.2)70 b(Deriv)l(ation)21 b(of)j(a)f(Blo)r(c)n(k)f (Algorithm)f(for)j(LU)f(F)-6 b(actorization)-57 2443 y Fm(Supp)q(ose)19 b(the)e Ff(M)h Fl(\002)12 b Ff(N)23 b Fm(matrix)16 b Ff(A)h Fm(is)h(partitioned)f(as)i(sho)o(wn)f(in)g(Figure)f(5,)h(and)g(w)o(e)g(seek)f (a)h(factorization)-57 2509 y Ff(A)13 b Fm(=)h Ff(LU)5 b Fm(,)16 b(where)g(the)g(partitioning)h(of)f Ff(L)h Fm(and)f Ff(U)22 b Fm(is)16 b(also)h(sho)o(wn)g(in)f(Figure)g(5.)21 b(Then)c(w)o(e)e(ma)o(y)g (write,)896 2621 y Ff(L)929 2628 y Fe(00)966 2621 y Ff(U)999 2628 y Fe(00)1078 2621 y Fm(=)42 b Ff(A)1195 2628 y Fe(00)1921 2621 y Fm(\(3\))896 2700 y Ff(L)929 2707 y Fe(10)966 2700 y Ff(U)999 2707 y Fe(00)1078 2700 y Fm(=)g Ff(A)1195 2707 y Fe(10)1921 2700 y Fm(\(4\))939 2825 y(16)p eop %%Page: 17 19 18 bop -57 675 a @beginspecial @setspecial %%BeginDocument: matrix_A.ps 1.5 setlinewidth 2 setlinecap 0 setlinejoin /panel 50 def /box 150 def /offset 8 def /bigfont {/Helvetica findfont 25 scalefont setfont} def /smallfont {/Helvetica findfont 15 scalefont setfont} def /mathfont {/Symbol findfont 25 scalefont setfont} def .7743396 .7743396 scale 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath panel 0 moveto 0 box rlineto box panel sub 0 exch moveto box 0 rlineto stroke smallfont (00) stringwidth pop bigfont (A) stringwidth pop add 1 add panel exch sub 2 div box panel .5 mul sub offset sub moveto (A) show 1 -6 rmoveto smallfont (00) show smallfont (10) stringwidth pop bigfont (A) stringwidth pop add 1 add panel exch sub 2 div box panel sub .5 mul offset sub moveto (A) show 1 -6 rmoveto smallfont (10) show smallfont (11) stringwidth pop bigfont (A) stringwidth pop add 1 add panel box add exch sub .5 mul box panel sub .5 mul offset sub moveto (A) show 1 -6 rmoveto smallfont (11) show smallfont (01) stringwidth pop bigfont (A) stringwidth pop add 1 add panel box add exch sub .5 mul box panel .5 mul sub 
offset sub moveto (A) show 1 -6 rmoveto smallfont (01) show 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath panel 0 moveto 0 box rlineto box panel sub 0 exch moveto box 0 rlineto stroke mathfont (=) dup stringwidth pop box 20 add exch .5 mul sub box .5 mul offset sub moveto show box 40 add 0 translate smallfont (00) stringwidth pop bigfont (L) stringwidth pop add 1 add panel exch sub 2 div box panel .5 mul sub offset sub moveto (L) show 1 -6 rmoveto smallfont (00) show smallfont (10) stringwidth pop bigfont (L) stringwidth pop add 1 add panel exch sub 2 div box panel sub .5 mul offset sub moveto (L) show 1 -6 rmoveto smallfont (10) show smallfont (11) stringwidth pop bigfont (L) stringwidth pop add 1 add panel box add exch sub .5 mul box panel sub .5 mul offset sub moveto (L) show 1 -6 rmoveto smallfont (11) show bigfont (0) stringwidth pop panel box add exch sub .5 mul box panel .5 mul sub offset sub moveto (0) show 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath panel 0 moveto 0 box rlineto box panel sub 0 exch moveto box 0 rlineto stroke mathfont (*) dup stringwidth pop box 20 add exch .5 mul sub box .5 mul offset sub moveto show box 40 add 0 translate smallfont (00) stringwidth pop bigfont (U) stringwidth pop add 1 add panel exch sub 2 div box panel .5 mul sub offset sub moveto (U) show 1 -6 rmoveto smallfont (00) show smallfont (11) stringwidth pop bigfont (U) stringwidth pop add 1 add panel box add exch sub .5 mul box panel sub .5 mul offset sub moveto (U) show 1 -6 rmoveto smallfont (11) show bigfont (0) stringwidth pop panel exch sub 2 div box panel sub .5 mul offset sub moveto (0) show smallfont (01) stringwidth pop bigfont (U) stringwidth pop add 1 add panel box add exch sub .5 mul box panel .5 mul sub offset sub moveto (U) show 1 -6 rmoveto smallfont (01) show 0 0 moveto box 0 rlineto 0 box rlineto box neg 0 rlineto closepath panel 0 moveto 0 box rlineto box panel sub 0 exch moveto box 0 rlineto stroke %%EndDocument 
@endspecial 108 x Fm(Figure)17 b(5:)23 b(Blo)q(c)o(k)17 b(LU)g (factorization)g(of)g(the)g(partitioned)g(matrix)f Ff(A)p Fm(.)24 b Ff(A)1331 790 y Fe(00)1385 783 y Fm(is)17 b Ff(r)c Fl(\002)f Ff(r)q Fm(,)17 b Ff(A)1612 790 y Fe(01)1666 783 y Fm(is)g Ff(r)c Fl(\002)f Fm(\()p Ff(N)17 b Fl(\000)11 b Ff(r)q Fm(\),)-57 849 y Ff(A)-20 856 y Fe(10)32 849 y Fm(is)j(\()p Ff(M)f Fl(\000)8 b Ff(r)q Fm(\))g Fl(\002)g Ff(r)q Fm(,)14 b(and)h Ff(A)483 856 y Fe(11)535 849 y Fm(is)f(\()p Ff(M)g Fl(\000)8 b Ff(r)q Fm(\))g Fl(\002)g Fm(\()p Ff(N)k Fl(\000)c Ff(r)q Fm(\).)20 b Ff(L)1032 856 y Fe(00)1085 849 y Fm(and)15 b Ff(L)1211 856 y Fe(11)1263 849 y Fm(are)g(lo)o(w)o(er)e(triangular)i(matrices)e(with)-57 915 y(1's)j(on)h(the)f(main)f(diagonal,)i(and)g Ff(U)627 922 y Fe(00)680 915 y Fm(and)g Ff(U)808 922 y Fe(11)862 915 y Fm(are)f(upp)q(er)h (triangular)f(matrices.)896 1054 y Ff(L)929 1061 y Fe(00)966 1054 y Ff(U)999 1061 y Fe(01)1078 1054 y Fm(=)42 b Ff(A)1195 1061 y Fe(01)1921 1054 y Fm(\(5\))694 1132 y Ff(L)727 1139 y Fe(10)765 1132 y Ff(U)798 1139 y Fe(01)847 1132 y Fm(+)11 b Ff(L)929 1139 y Fe(11)966 1132 y Ff(U)999 1139 y Fe(11)1078 1132 y Fm(=)42 b Ff(A)1195 1139 y Fe(11)1921 1132 y Fm(\(6\))-57 1241 y(where)17 b Ff(A)122 1248 y Fe(00)177 1241 y Fm(is)g Ff(r)c Fl(\002)f Ff(r)q Fm(,)17 b Ff(A)404 1248 y Fe(01)459 1241 y Fm(is)g Ff(r)c Fl(\002)f Fm(\()p Ff(N)17 b Fl(\000)12 b Ff(r)q Fm(\),)17 b Ff(A)831 1248 y Fe(10)886 1241 y Fm(is)g(\()p Ff(M)g Fl(\000)12 b Ff(r)q Fm(\))g Fl(\002)f Ff(r)q Fm(,)18 b(and)g Ff(A)1362 1248 y Fe(11)1417 1241 y Fm(is)f(\()p Ff(M)g Fl(\000)12 b Ff(r)q Fm(\))g Fl(\002)g Fm(\()p Ff(N)17 b Fl(\000)11 b Ff(r)q Fm(\).)25 b Ff(L)1945 1248 y Fe(00)-57 1307 y Fm(and)17 b Ff(L)71 1314 y Fe(11)125 1307 y Fm(are)g(lo)o(w)o(er)f(triangular)h (matrices)d(with)j(1s)g(on)g(the)f(main)g(diagonal,)h(and)g Ff(U)1542 1314 y Fe(00)1596 1307 y Fm(and)g Ff(U)1724 1314 y Fe(11)1778 1307 y Fm(are)g(upp)q(er)-57 1374 y(triangular)g(matrices.)-57 1460 
y(Equations)e(3)f(and)h(4)f(tak)o(en)f(together)i(p)q(erform)e(an)h(LU)g (factorization)g(on)g(the)g(\014rst)g Ff(M)e Fl(\002)6 b Ff(r)16 b Fm(panel)d(of)i Ff(A)e Fm(\(i.e.,)-57 1526 y Ff(A)-20 1533 y Fe(00)34 1526 y Fm(and)k Ff(A)166 1533 y Fe(10)203 1526 y Fm(\).)23 b(Once)16 b(this)h(is)f(completed,)e(the)j(matrices)e Ff(L)1090 1533 y Fe(00)1127 1526 y Fm(,)i Ff(L)1191 1533 y Fe(10)1228 1526 y Fm(,)g(and)g Ff(U)1387 1533 y Fe(00)1442 1526 y Fm(are)f(kno)o(wn,)h(and)g(the)g(lo)o(w)o(er)-57 1593 y(triangular)g(system)d(in)i(Eq.)g(5)h(can)f(b)q(e)g(solv)o(ed)g(to)h(giv)o (e)e Ff(U)1014 1600 y Fe(01)1051 1593 y Fm(.)22 b(Finally)l(,)14 b(w)o(e)i(rearrange)h(Eq.)e(6)i(as,)652 1701 y Ff(A)689 1681 y Fc(0)689 1714 y Fe(11)740 1701 y Fm(=)d Ff(A)829 1708 y Fe(11)877 1701 y Fl(\000)c Ff(L)959 1708 y Fe(10)997 1701 y Ff(U)1030 1708 y Fe(01)1081 1701 y Fm(=)k Ff(L)1166 1708 y Fe(11)1204 1701 y Ff(U)1237 1708 y Fe(11)1921 1701 y Fm(\(7\))-57 1810 y(>F)l(rom)j(this)h(equation)g(w)o(e)f(see)h(that)h(the)f(problem)e(of)i (\014nding)h Ff(L)1188 1817 y Fe(11)1243 1810 y Fm(and)g Ff(U)1373 1817 y Fe(11)1429 1810 y Fm(reduces)e(to)i(\014nding)f(the)g(LU)-57 1876 y(factorization)j(of)g(the)g(\()p Ff(M)f Fl(\000)14 b Ff(r)q Fm(\))g Fl(\002)g Fm(\()p Ff(N)20 b Fl(\000)14 b Ff(r)q Fm(\))21 b(matrix)f Ff(A)1023 1858 y Fc(0)1023 1889 y Fe(11)1059 1876 y Fm(.)36 b(This)21 b(can)g(b)q(e)g(done)h(b)o(y)e(applying)h(the)g (steps)-57 1943 y(outlined)16 b(ab)q(o)o(v)o(e)g(to)g Ff(A)367 1925 y Fc(0)367 1955 y Fe(11)421 1943 y Fm(instead)g(of)g(to)h Ff(A)p Fm(.)k(Rep)q(eating)16 b(these)g(steps)h Ff(K)j Fm(times,)14 b(where)696 2052 y Ff(K)k Fm(=)c(min)6 b(\()p Fl(d)p Ff(M)r(=r)q Fl(e)p Ff(;)i Fl(d)p Ff(N)q(=r)q Fl(e)p Fm(\))694 b(\(8\))-57 2160 y(w)o(e)18 b(obtain)h(the)f(LU)g(factorization)g(of)h(the)f(original)g Ff(M)g Fl(\002)12 b Ff(N)23 b Fm(matrix)17 b Ff(A)p Fm(.)27 b(F)l(or)18 b(an)h(in-place)e(algorithm,)g Ff(A)-57 2227 y Fm(is)j(o)o(v)o(erwritten)f(b)o(y)h Ff(L)h Fm(and)g Ff(U)26 b 
Fm({)20 b(the)h(1s)g(on)g(the)f(diagonal)h(of)g Ff(L)f Fm(do)h(not)g(need)f (to)h(b)q(e)g(stored)f(explicitly)l(.)-57 2293 y(Similarly)l(,)13 b(when)j Ff(A)g Fm(is)g(up)q(dated)h(b)o(y)f(Eq.)g(7)g(this)g(ma)o(y)f(also)i (b)q(e)f(done)h(in)f(place.)-57 2379 y(After)c Ff(k)j Fm(of)f(these)e Ff(K)17 b Fm(steps,)d(the)f(\014rst)g Ff(k)r(r)i Fm(columns)c(of)j Ff(L)f Fm(and)h(the)e(\014rst)i Ff(k)r(r)g Fm(ro)o(ws)g(of)f Ff(U)18 b Fm(ha)o(v)o(e)13 b(b)q(een)g(ev)m(aluated,)-57 2446 y(and)i(matrix)e Ff(A)h Fm(has)h(b)q(een)f(up)q(dated)h(to)g(the)f(form)f (sho)o(wn)j(in)e(Figure)g(6,)g(in)g(whic)o(h)g(panel)g Ff(B)j Fm(is)d(\()p Ff(M)f Fl(\000)7 b Ff(k)r(r)q Fm(\))g Fl(\002)g Ff(r)-57 2512 y Fm(and)17 b Ff(C)j Fm(is)c Ff(r)c Fl(\002)f Fm(\()p Ff(N)16 b Fl(\000)11 b Fm(\()p Ff(k)i Fl(\000)e Fm(1\))p Ff(r)q Fm(\).)22 b(Step)16 b Ff(k)d Fm(+)e(1)17 b(then)f(pro)q(ceeds)g(as)h (follo)o(ws,)3 2634 y(1.)24 b(factor)c Ff(B)i Fm(to)d(form)g(the)g(next)g (panel)g(of)h Ff(L)p Fm(,)g(p)q(erforming)e(partial)h(piv)o(oting)g(o)o(v)o (er)g(ro)o(ws)g(if)g(necessary)65 2700 y(\(see)d(Figure)g(14\).)21 b(This)c(ev)m(aluates)f(the)g(matrices)f Ff(L)1054 2707 y Fe(0)1073 2700 y Fm(,)h Ff(L)1136 2707 y Fe(1)1156 2700 y Fm(,)g(and)h Ff(U)1314 2707 y Fe(0)1350 2700 y Fm(in)f(Figure)g(6.)939 2825 y(17)p eop %%Page: 18 20 19 bop -57 825 a @beginspecial @setspecial %%BeginDocument: stepk.ps /arrowdict 13 dict def % Local storage for the procedure % ``arrow.'' /arrow % The procedure ``arrow'' adds an { arrowdict begin % arrow shape to the current path. 
% ---------------------------------------------------------------------------
% arrow (body) -- appends a closed arrow-shaped outline to the current path.
%
% Operands, bottom to top of the operand stack:
%     tailx taily tipx tipy thickness headthickness headlength
%
%   tailx taily    point on the arrow's center line where the tail starts
%   tipx  tipy     point on the center line where the arrowhead ends
%   thickness      full width of the shaft (stored halved)
%   headthickness  full width of the arrowhead at its widest (stored halved)
%   headlength     length of the arrowhead measured along the center line
%
% The procedure only builds the path; the caller paints it (stroke/fill).
% All working values live in arrowdict, which the opening of this procedure
% pushes on the dictionary stack.  Exactly 13 names are defined here, which
% matches the "13 dict" capacity requested for arrowdict.
%
% Each statement sits on its own line so that every "%" comment ends at the
% line break and cannot swallow the statement that follows it.
% ---------------------------------------------------------------------------

% Unpack the operands, topmost first.
/headlength exch def
/halfheadthickness exch 2 div def
/halfthickness exch 2 div def
/tipy exch def
/tipx exch def
/taily exch def
/tailx exch def

% Vector from tail to tip, its length, and its direction in degrees.
% NOTE(review): atan is undefined when dx and dy are both 0, so tail and tip
% are presumably required to be distinct points -- confirm with callers.
/dx tipx tailx sub def
/dy tipy taily sub def
/arrowlength dx dx mul dy dy mul add sqrt def
/angle dy dx atan def

% Distance from the tail to the point where the arrowhead begins.
/base arrowlength headlength sub def

% Remember the CTM so the translate/rotate below stay local to this call.
/savematrix matrix currentmatrix def

% Work in a frame whose origin is the tail and whose x-axis is the center line.
tailx taily translate
angle rotate

% Outline, starting at the lower tail corner and walking around the tip.
0 halfthickness neg moveto
base halfthickness neg lineto
base halfheadthickness neg lineto
arrowlength 0 lineto
base halfheadthickness lineto
base halfthickness lineto
0 halfthickness lineto
closepath

% Restore the caller's coordinate system; the path itself is kept.
savematrix setmatrix
end } def   % pop arrowdict and finish the /arrow definition started above

% ---------------------------------------------------------------------------
% Helper procedures for the stepk.ps figure.
% All of them keep their working values as names in the current dictionary
% (no private dictionary), so they are not reentrant with respect to the
% names they define (x, y, xpos, ypos, delx, dely, nx, ny, ...).
% Each definition is on its own lines so "%" comments end at a line break.
% ---------------------------------------------------------------------------

% length height Box -
% Appends a closed rectangle, starting at the current point, to the path.
/Box {
  /height exch def
  /length exch def
  length 0 rlineto
  0 height rlineto
  length neg 0 rlineto
  closepath
} def

% delx dely nx ny Grid -
% Strokes a grid of nx by ny cells (cell size delx by dely) whose lower-left
% corner is the current point: ny+1 horizontal and nx+1 vertical lines.
/Grid {
  /ny exch def
  /nx exch def
  /dely exch def
  /delx exch def
  /leny { ny dely mul } def
  /lenx { nx delx mul } def
  currentpoint /ypos exch def /xpos exch def
  /y ypos def
  /x xpos def
  0 1 ny { pop
    x y moveto lenx 0 rlineto stroke
    /y y dely add def
  } for
  /y ypos def
  /x xpos def
  0 1 nx { pop
    x y moveto 0 leny rlineto stroke
    /x x delx add def
  } for
} def

% xpos ypos delx dely nx ny Gridbox -
% Draws a dashed Grid at (xpos, ypos) and then a solid outline around it.
/Gridbox {
  /ny exch def
  /nx exch def
  /dely exch def
  /delx exch def
  /ypos exch def
  /xpos exch def
  /leny { ny dely mul } def
  /lenx { nx delx mul } def
  xpos ypos moveto
  [2 2] 0 setdash
  delx dely nx ny Grid
  newpath
  xpos ypos moveto
  [] 0 setdash
  lenx leny Box stroke
} def

% xcen ycen radius Circle -
% Appends a full circle to the current path.
/Circle { 0 360 arc } def

% ndots csep crad Ndots -
% Fills ndots discs of radius crad, the first centered on the current point
% and each following one csep further along x.
/Ndots {
  /crad exch def
  /csep exch def
  /ndots exch def
  currentpoint /ymid exch def /xmid exch def
  1 1 ndots {
    newpath xmid ymid crad Circle fill
    /xmid xmid csep add def
  } for
} def

% delx dely nx ny crad Cgrid -
% Fills an nx by ny lattice of discs of radius crad with spacing (delx, dely),
% the first disc centered on the current point.
% Note: the loop index of each "for" is consumed by the "pop" (ny and nx
% passes respectively, indices 0 .. n-1).
/Cgrid {
  /crad exch def
  /ny exch def
  /nx exch def
  /dely exch def
  /delx exch def
  currentpoint /ypos exch def /xpos exch def
  /y ypos def
  /x xpos def
  0 1 ny 1 sub { pop
    0 1 nx 1 sub { pop
      newpath x y crad Circle fill
      /x x delx add def
    } for
    /x xpos def
    /y y dely add def
  } for
} def

% string xmid ymid crad lh PaintCircle -
% Draws a white-filled, stroked circle of radius crad at (xmid, ymid) and
% shows string horizontally centered on xmid, lowered by lh.
/PaintCircle {
  /lh exch def
  /crad exch def
  /ymid exch def
  /xmid exch def
  newpath xmid ymid crad Circle
  gsave 1 setgray fill grestore
  stroke
  xmid ymid moveto
  dup stringwidth pop 2 div neg lh neg rmoveto
  show
} def

% delx dely nx ny isymbol GridSym -
% Draws one symbol in every cell of an nx by ny grid of delx by dely cells
% whose lower-left corner is the current point.  isymbol selects the symbol:
%    1 open circle      2 plus             3 cross
%    4 open triangle    5 filled circle    6 filled triangle
%    7 plus and cross   8 open rectangle   9 filled rectangle
%   10 circle shaded with the gray level held in the name "gray"
/GridSym {
  /isymbol exch def
  /ny exch def
  /nx exch def
  /dely exch def
  /delx exch def
  /leny { ny dely mul } def
  /lenx { nx delx mul } def
  currentpoint /ypos exch def /xpos exch def
  /y ypos def
  /x xpos def
  /dx3 delx 3 div def
  /dy3 dely 3 div def
  1 1 nx { pop
    /y ypos def
    1 1 ny { pop
      isymbol 1 eq {
        newpath x delx 2 div add y dely 2 div add delx 3 div Csym
        gsave 1.0 setgray fill grestore stroke
      } if
      isymbol 2 eq { x y PlusSym } if
      isymbol 3 eq { x y CrossSym } if
      isymbol 4 eq {
        x dx3 2 div add y dy3 2 div add dx3 2 mul dy3 2 mul TriSym
        gsave 1.0 setgray fill grestore stroke
      } if
      isymbol 5 eq {
        newpath x delx 2 div add y dely 2 div add delx 3 div Csym fill
      } if
      isymbol 6 eq {
        x dx3 2 div add y dy3 2 div add dx3 2 mul dy3 2 mul TriSym fill
      } if
      % Fix: push the cell origin before duplicating it.  The original
      % "2 copy PlusSym CrossSym" had nothing of its own on the stack, so it
      % consumed four caller operands per cell (or raised stackunderflow).
      isymbol 7 eq { x y 2 copy PlusSym CrossSym } if
      isymbol 8 eq { x y RectSym gsave 1.0 setgray fill grestore stroke } if
      isymbol 9 eq { x y RectSym fill } if
      % NOTE(review): "gray" is not an operand of GridSym; it must already be
      % defined when symbol 10 is requested -- confirm where callers set it.
      isymbol 10 eq {
        newpath x delx 2 div add y dely 2 div add delx 5 div gray Cfillsym stroke
      } if
      /y y dely add def
    } for
    /x x delx add def
  } for
} def

% x y PlusSym -
% Strokes a plus sign spanning the delx by dely cell whose lower-left corner
% is (x, y).  Reads delx/dely from the current dictionary.
/PlusSym {
  newpath moveto
  delx 2 div 0 rmoveto
  0 dely rlineto
  delx 2 div neg dely 2 div neg rmoveto
  delx 0 rlineto
  stroke
} def

% x y RectSym -
% Builds (does not paint) the delx by dely rectangle with lower-left (x, y).
/RectSym {
  newpath moveto
  delx 0 rlineto
  0 dely rlineto
  delx neg 0 rlineto
  closepath
} def

% x y CrossSym -
% Strokes both diagonals of the delx by dely cell with lower-left (x, y).
/CrossSym {
  newpath moveto
  delx dely rlineto
  delx neg 0 rmoveto
  delx dely neg rlineto
  stroke
} def

% x y ddx ddy TriSym -
% Builds (does not paint) an isosceles triangle with base ddx starting at
% (x, y) and apex ddy above the middle of the base.
/TriSym {
  /ddy exch def
  /ddx exch def
  newpath moveto
  ddx 0 rlineto
  ddx 2 div neg ddy rlineto
  closepath
} def

% xcen ycen radius Csym -
% Appends a circle centered on (xcen, ycen) to the current path.
/Csym
{ 0 360 arc } def

% The procedure body for Cfillsym follows on the next source line.
/Cfillsym % stack: xcen ycen radius gray => ???
% Draws shaded circle centered
% on (xcen ycen)
% (Body of Cfillsym; stack: xcen ycen radius gray.  The circle is filled with
% the given gray level inside gsave/grestore, so the path and the current
% color are left for the caller, who may still stroke the outline.)
{ /gray exch def
  0 360 arc
  gsave gray setgray fill grestore
} def

% ---------------------------------------------------------------------------
% stepk.ps drawing script: two square matrix diagrams side by side with an
% arrow between them (the figure captioned "Stage k+1 of the block LU
% factorization algorithm" later in this file).
% Every statement is on its own line so that no "%" comment can swallow code.
% ---------------------------------------------------------------------------
/dwdict 100 dict def
dwdict begin

1.5 setlinewidth
47.2 10 translate
0.8 dup scale

/Size 160 def   % side of each square diagram
/Bwid 30 def    % width of panel B / height of panel C
/Ewid 70 def    % side of the trailing submatrix E

% ---- Left diagram: dashed outlines of B, C and E ---------------------------
[2 2] 0 setdash
Size Bwid Ewid add sub 0 moveto
0 Ewid Bwid add rlineto
Ewid Bwid add 0 rlineto
Size Ewid sub 0 moveto
0 Ewid Bwid add rlineto
Size Ewid moveto
Ewid neg 0 rlineto
stroke

% Solid diagonal towards the top-left corner, then the outer square.
[] 0 setdash
Size Ewid Bwid add sub Ewid Bwid add moveto
0 Size lineto
0 0 moveto
Size Size Box stroke

% Region labels; each x is centered on its region with stringwidth.
/Helvetica findfont 25 scalefont setfont
/SBwid {Size Bwid Ewid add sub} def   % width already factored: Size-(Bwid+Ewid)
SBwid (L) stringwidth pop sub 2 div Size 2 div 5 sub moveto
(L) show
Size (U) stringwidth pop sub 2 div SBwid 2 div 5 sub Bwid Ewid add add moveto
(U) show
Bwid (B) stringwidth pop sub 2 div SBwid add Bwid Ewid add 2 div 5 sub moveto
(B) show
Ewid (C) stringwidth pop sub 2 div SBwid Bwid add add Bwid 2 div 5 sub Ewid add 3 sub moveto
(C) show
Ewid (E) stringwidth pop sub 2 div SBwid Bwid add add Ewid 2 div 5 sub moveto
(E) show

% ---- Arrow from the left diagram to the right one --------------------------
% tail (Size+15, Size/2), tip (Size+60, Size/2), shaft 20, head 40 wide, 15 long
Size 15 add Size 2 div Size 60 add Size 2 div 20 40 15 arrow stroke

% ---- Right diagram, shifted right by Size+75 -------------------------------
Size 75 add 0 translate

[2 2] 0 setdash
Size Bwid Ewid add sub 0 moveto
0 Ewid Bwid add rlineto
Ewid Bwid add 0 rlineto
Size Ewid sub 0 moveto
0 Ewid Bwid add rlineto
Size Ewid moveto
Ewid Bwid add neg 0 rlineto
Size Bwid Ewid add sub Bwid Ewid add moveto
Bwid Bwid neg rlineto
stroke

[] 0 setdash
Size Ewid Bwid add sub Ewid Bwid add moveto
0 Size lineto
0 0 moveto
Size Size Box stroke

/Helvetica findfont 25 scalefont setfont
/SBwid {Size Bwid Ewid add sub} def
SBwid (L) stringwidth pop sub 2 div Size 2 div 5 sub moveto
(L) show
Size (U) stringwidth pop sub 2 div SBwid 2 div 5 sub Bwid Ewid add add moveto
(U) show
Ewid (E') stringwidth pop sub 2 div SBwid Bwid add add Ewid 2 div 5 sub moveto
(E') show

% "L" with a smaller subscript "1"; the pair is centered using the width of (L1).
Bwid (L1) stringwidth pop sub 2 div SBwid add Ewid 2 div 5 sub moveto
(L) show
/Helvetica findfont 15 scalefont setfont
1 -5 rmoveto
(1) show
/Helvetica findfont 25 scalefont setfont

% First operand of the next label's position expression; the expression
% continues on the following source line.
Ewid
(U1) stringwidth pop sub 2 div SBwid Bwid add add Bwid 2 div 5 sub Ewid add 3 sub moveto (U) show /Helvetica findfont 15 scalefont setfont 1 -5 rmoveto (1) show /Helvetica findfont 15 scalefont setfont SBwid 3 add Ewid 5 add moveto (L) show 0 -3 rmoveto /Helvetica findfont 12 scalefont setfont (0) show /Helvetica findfont 15 scalefont setfont SBwid Bwid add 18 sub Bwid Ewid add 12 sub moveto (U) show -1 -3 rmoveto /Helvetica findfont 12 scalefont setfont (0) show end %%EndDocument @endspecial 108 x Fm(Figure)15 b(6:)21 b(Stage)16 b Ff(k)c Fm(+)d(1)16 b(of)g(the)f(blo)q(c)o(k)g(LU)h(factorization)f(algorithm)g(sho)o (wing)h(ho)o(w)g(the)f(panels)h Ff(B)i Fm(and)e Ff(C)t Fm(,)-57 999 y(and)i(the)g(trailing)f(submatrix)f Ff(E)21 b Fm(are)c(up)q(dated.)27 b(The)17 b(trap)q(ezoidal)h(submatrices)e Ff(L)i Fm(and)g Ff(U)23 b Fm(ha)o(v)o(e)17 b(already)-57 1065 y(b)q(een)c(factored)g(in)f(previous)h (steps.)20 b Ff(L)13 b Fm(has)h Ff(k)r(r)g Fm(columns,)e(and)h Ff(U)18 b Fm(has)c Ff(k)r(r)g Fm(ro)o(ws.)21 b(In)12 b(the)h(step)g(sho)o(wn) g(another)-57 1131 y Ff(r)18 b Fm(columns)d(of)h Ff(L)h Fm(and)f Ff(r)i Fm(ro)o(ws)f(of)f Ff(U)22 b Fm(are)16 b(ev)m(aluated.)3 1264 y(2.)24 b(solv)o(e)15 b(the)h(triangular)h(system)e Ff(L)691 1271 y Fe(0)711 1264 y Ff(U)744 1271 y Fe(1)777 1264 y Fm(=)f Ff(C)20 b Fm(to)d(get)f(the)g(next)g(ro)o(w)g(of)h(blo)q(c)o(ks)f(of)g Ff(U)5 b Fm(.)3 1372 y(3.)24 b(do)17 b(a)f(rank-)p Ff(r)i Fm(up)q(date)f(on)g (the)f(trailing)g(submatrix)f Ff(E)s Fm(,)h(replacing)f(it)h(with)g Ff(E)1520 1354 y Fc(0)1546 1372 y Fm(=)d Ff(E)h Fl(\000)d Ff(L)1730 1379 y Fe(1)1750 1372 y Ff(U)1783 1379 y Fe(1)1803 1372 y Fm(.)-57 1501 y(The)17 b(LAP)l(A)o(CK)g(implem)o(e)o(n)o(tation)e(of)i(this)g(form)f (of)h(LU)g(factorization)g(uses)g(the)g(Lev)o(el)f(3)h(BLAS)g(routines)-57 1567 y(xTRSM)j(and)g(xGEMM)g(to)g(p)q(erform)f(the)h(triangular)g(solv)o(e)f (and)i(rank-)p Ff(r)h Fm(up)q(date.)33 b(W)l(e)20 b(can)g(regard)g(the)-57 1633 
y(algorithm)g(as)i(acting)g(on)g(matrices)d(that)j(ha)o(v)o(e)f(b)q(een) g(partitioned)g(in)o(to)g(blo)q(c)o(ks)g(of)h Ff(r)16 b Fl(\002)e Ff(r)23 b Fm(elemen)o(ts,)c(as)-57 1700 y(sho)o(wn)e(in)f(Figure)g(7.)-57 1897 y Fh(5)83 b(Data)28 b(Distribution)-57 2034 y Fm(The)21 b(fundamen)o(tal)f(data)j(ob)s(ject)d(in)h(the)g(LU)h(factorization)f (algorithm)f(presen)o(ted)h(in)g(Section)g(4.2)g(is)h(a)-57 2100 y(blo)q(c)o(k-partitioned)d(matrix.)28 b(In)19 b(this)h(section,)f(w)o (e)g(describ)q(e)f(the)h(blo)q(c)o(k-cyclic)e(metho)q(d)i(for)g(distributing) -57 2166 y(suc)o(h)f(a)h(matrix)d(o)o(v)o(er)h(a)i(t)o(w)o(o-dimensional)e (mesh)g(of)h(pro)q(cesses,)h(or)g(template.)25 b(In)18 b(general,)g(eac)o(h)f (pro)q(cess)-57 2232 y(has)j(an)g(indep)q(enden)o(t)f(thread)g(of)h(con)o (trol,)f(and)h(with)g(eac)o(h)f(pro)q(cess)h(is)f(asso)q(ciated)h(some)f(lo)q (cal)g(memory)-57 2299 y(directly)12 b(accessible)g(only)i(b)o(y)f(that)h (pro)q(cess.)21 b(The)13 b(assignmen)o(t)g(of)h(these)f(pro)q(cesses)h(to)g (ph)o(ysical)f(pro)q(cessors)-57 2365 y(is)j(a)h(mac)o(hine-dep)q(enden)o(t)c (optimization)i(issue,)h(and)g(will)g(b)q(e)g(considered)g(later)f(in)h (Section)g(7.)-57 2452 y(An)21 b(imp)q(ortan)o(t)g(prop)q(ert)o(y)g(of)h(the) f(class)g(of)h(data)h(distribution)e(w)o(e)g(shall)g(use)g(is)h(that)f(indep) q(enden)o(t)g(de-)-57 2519 y(comp)q(ositions)c(are)h(applied)f(o)o(v)o(er)g (ro)o(ws)i(and)f(columns.)24 b(W)l(e)18 b(shall,)f(therefore,)h(b)q(egin)f(b) o(y)h(considering)f(the)-57 2585 y(distribution)d(of)g(a)h(v)o(ector)e(of)h Ff(M)20 b Fm(data)15 b(ob)s(jects)f(o)o(v)o(er)f Ff(P)21 b Fm(pro)q(cesses.)g(This)14 b(can)h(b)q(e)f(describ)q(ed)g(b)o(y)f(a)i (mapping)-57 2651 y(of)j(the)f(global)h(index,)f Ff(m)p Fm(,)g(of)h(a)g(data) g(ob)s(ject)g(to)f(an)i(index)d(pair)i(\()p Ff(p;)8 b(i)p Fm(\),)17 b(where)h Ff(p)g Fm(sp)q(eci\014es)f(the)g(pro)q(cess)i(to)939 2825 y(18)p eop %%Page: 19 21 20 bop -57 975 a @beginspecial @setspecial %%BeginDocument: block_part.ps /Mydict 
% Completes the "/Mydict" definition begun at the end of the previous source
% line, then makes Mydict the current dictionary for the whole figure.
200 dict def
Mydict begin

% ---------------------------------------------------------------------------
% block_part.ps -- block-partitioned matrix figure.
% Every statement is on its own line so that the trailing "%" comments in the
% drawing script end at a line break and cannot swallow the following code.
% ---------------------------------------------------------------------------

% length height Box -
% Appends a closed rectangle, starting at the current point, to the path.
/Box {
  /height exch def
  /length exch def
  length 0 rlineto
  0 height rlineto
  length neg 0 rlineto
  closepath
} def

% delx dely nx ny Grid -
% Strokes a grid of nx by ny cells (cell size delx by dely) whose lower-left
% corner is the current point: ny+1 horizontal and nx+1 vertical lines.
/Grid {
  /ny exch def
  /nx exch def
  /dely exch def
  /delx exch def
  /leny { ny dely mul } def
  /lenx { nx delx mul } def
  currentpoint /ypos exch def /xpos exch def
  /y ypos def
  /x xpos def
  0 1 ny { pop
    x y moveto lenx 0 rlineto stroke
    /y y dely add def
  } for
  /y ypos def
  /x xpos def
  0 1 nx { pop
    x y moveto 0 leny rlineto stroke
    /x x delx add def
  } for
} def

% delx dely nx ny Label Flag Label_Grid -
% Writes Label followed by a smaller "row,col" index pair in every cell of an
% nx by ny grid whose lower-left corner is the current point.  Row 0 is the
% top row.  When Flag equals (T) the pair is written as "col,row" instead.
% Reads Label_PS, Proc_PS, Label_Xshift and Label_Yshift from the current
% dictionary.  The index strings are 2 characters long, so cvs raises
% rangecheck for indices of three or more digits.
/Label_Grid {
  /Flag exch def
  /Label exch def
  /ny exch def
  /nx exch def
  /dely exch def
  /delx exch def
  /leny { ny dely mul } def
  /lenx { nx delx mul } def
  currentpoint /ypos exch def /xpos exch def
  0 1 ny 1 sub {
    /rowno exch def
    % Baseline of this row, counted down from the top of the grid.
    /Y ypos ny rowno sub 1 sub dely mul add Label_Yshift add def
    0 1 nx 1 sub {
      /colno exch def
      /Helvetica findfont Label_PS scalefont setfont
      % Center the label horizontally in the cell, then apply the x shift.
      /X xpos colno delx mul add delx Label stringwidth pop sub 2 div add Label_Xshift add def
      X Y moveto
      Label show
      % Place the index pair just right of the label and slightly lowered.
      X Y moveto
      Label stringwidth pop 1 add Proc_PS .35 mul neg rmoveto
      /Helvetica findfont Proc_PS scalefont setfont
      Flag (T) eq
        { colno 2 string cvs show (,) show rowno 2 string cvs show }
        { rowno 2 string cvs show (,) show colno 2 string cvs show }
      ifelse
    } for
  } for
} def

% ---- Figure parameters -----------------------------------------------------
/M 30 def            % matrix rows
/N 30 def            % matrix columns
/r 5 def             % block size in the row direction
/s 5 def             % block size in the column direction
/P 3 def             % not referenced by the drawing script visible here
/Q 6 def             % not referenced by the drawing script visible here
/Del 6 def           % drawn size of one matrix element
/Proc_PS 10 def      % point size of the index pair
/Label_PS 14 def     % point size of the label
/Label_Xshift -7 def
/Label_Yshift 10 def
% Centers the N*Del wide grid in a width of 5.7*72 units
% (5.7 inches, assuming default 1/72 inch user-space units).
/X_Trans 5.7 72 mul N Del mul sub 2 div def
/Y_Trans 10 def
/Sfac 1 def

% ---- Drawing script --------------------------------------------------------
1.5 setlinewidth
X_Trans Y_Trans translate
Sfac dup scale
2 setlinecap
0 setlinejoin

% Clip to the matrix rectangle grown by 0.75 on every side.
newpath
-0.75 -0.75 moveto
Del N mul 1.5 add 0 rlineto
0 Del M mul 1.5 add rlineto
Del N mul 1.5 add neg 0 rlineto
closepath clip
newpath

0.5 setlinewidth

% Block grid.
0 0 moveto
Del s mul        % Size of block in column direction
Del r mul        % Size of block in row direction
N s div ceiling  % Number of column blocks
M r div ceiling  % Number of row blocks
Grid

% Block labels; the remaining operands and the Label_Grid call itself are on
% the next source line.
0 0 moveto
Del s mul        % Size of block in column direction
Del r mul        % Size of block in row direction
N s idiv % Number of column
blocks M r idiv % Number of row blocks (A) % Label () Label_Grid end %%EndDocument @endspecial 19 1083 a Fm(Figure)15 b(7:)22 b(Blo)q(c)o(k-partitioned)15 b(matrix)g Ff(A)p Fm(.)20 b(Eac)o(h)d(blo)q(c)o(k)e Ff(A)1137 1090 y Fd(i;j)1193 1083 y Fm(consists)i(of)f Ff(r)d Fl(\002)e Ff(r)17 b Fm(matrix)e(elemen)o(ts.)-57 1219 y(whic)o(h)g(the)h(data)g(ob)s (ject)g(is)f(assigned,)h(and)h Ff(i)e Fm(sp)q(eci\014es)h(the)f(lo)q(cation)h (in)g(the)f(lo)q(cal)h(memory)d(of)j Ff(p)h Fm(at)f(whic)o(h)-57 1286 y(it)g(is)g(stored.)21 b(W)l(e)16 b(shall)g(assume)g(0)e Fl(\024)g Ff(m)f(<)h(M)22 b Fm(and)17 b(0)d Fl(\024)f Ff(p)i(<)e(P)7 b Fm(.)-57 1371 y(Tw)o(o)20 b(common)d(decomp)q(ositions)i(are)g(the)g Fg(blo)n(ck)26 b Fm(and)20 b(the)f Fg(cyclic)24 b Fm(decomp)q(ositions)18 b([55)q(,)g(32)q(].)30 b(The)20 b(blo)q(c)o(k)-57 1437 y(decomp)q(osition,)h (that)h(is)f(often)g(used)h(when)f(the)g(computational)f(load)i(is)f (distributed)g(homogeneously)-57 1503 y(o)o(v)o(er)d(a)h(regular)f(data)i (structure)e(suc)o(h)g(as)h(a)g(Cartesian)g(grid,)g(assigns)h(con)o(tiguous)f (en)o(tries)e(in)h(the)h(global)-57 1569 y(v)o(ector)c(to)i(the)f(pro)q (cesses)h(in)f(blo)q(c)o(ks.)675 1668 y Ff(m)d Fl(7!)h Fm(\()8 b Fl(b)p Ff(m=L)q Fl(c)g Ff(;)g(m)13 b Fm(mo)q(d)h Ff(L)8 b Fm(\))g Ff(;)678 b Fm(\(9\))-57 1766 y(where)17 b Ff(L)f Fm(=)f Fl(d)p Ff(M)r(=P)7 b Fl(e)p Fm(.)26 b(The)18 b(cyclic)d(decomp)q(osition)h (\(also)i(kno)o(wn)f(as)h(the)f(wrapp)q(ed)i(or)e(scattered)g(decom-)-57 1833 y(p)q(osition\))f(is)g(commonly)c(used)k(to)g(impro)o(v)o(e)d(load)j (balance)f(when)h(the)f(computational)g(load)h(is)g(distributed)-57 1899 y(inhomogeneously)k(o)o(v)o(er)f(a)i(regular)g(data)g(structure.)34 b(The)21 b(cyclic)d(decomp)q(osition)i(assigns)i(consecutiv)o(e)-57 1965 y(en)o(tries)15 b(in)h(the)g(global)h(v)o(ector)e(to)i(successiv)o(e)d (di\013eren)o(t)i(pro)q(cesses,)687 2064 y Ff(m)e Fl(7!)g Fm(\()8 b Ff(m)13 b Fm(mo)q(d)h Ff(P)q(;)8 b Fl(b)p Ff(m=P)f Fl(c)h Fm(\))667 b(\(10\))-57 2162 
y(Examples)15 b(of)h(the)g(blo)q(c)o(k)g(and)h (cyclic)d(decomp)q(ositions)i(are)g(sho)o(wn)h(in)f(Figure)g(8.)-57 2247 y(The)c(blo)q(c)o(k)f(cyclic)e(decomp)q(osition)i(is)h(a)g (generalization)f(of)h(the)f(blo)q(c)o(k)g(and)i(cyclic)c(decomp)q(ositions)i (in)h(whic)o(h)-57 2313 y(blo)q(c)o(ks)g(of)h(consecutiv)o(e)e(data)i(ob)s (jects)f(are)h(distributed)e(cyclically)f(o)o(v)o(er)h(the)h(pro)q(cesses.)21 b(In)12 b(the)g(blo)q(c)o(k)g(cyclic)-57 2380 y(decomp)q(osition)j(the)g (mapping)g(of)h(the)f(global)h(index,)f Ff(m)p Fm(,)g(can)h(b)q(e)f (expressed)h(as)g Ff(m)d Fl(7!)h Fm(\()p Ff(p;)8 b(b;)g(i)p Fm(\),)15 b(where)g Ff(p)h Fm(is)-57 2446 y(the)d(pro)q(cess)h(n)o(um)o(b)q (er,)e Ff(b)h Fm(is)h(the)f(blo)q(c)o(k)g(n)o(um)o(b)q(er)e(in)i(pro)q(cess)i Ff(p)p Fm(,)f(and)g Ff(i)f Fm(is)g(the)g(index)g(within)g(blo)q(c)o(k)g Ff(b)g Fm(to)h(whic)o(h)-57 2512 y Ff(m)g Fm(is)g(mapp)q(ed.)20 b(Th)o(us,)15 b(if)f(the)g(n)o(um)o(b)q(er)f(of)i(data)g(ob)s(jects)f(in)g(a) h(blo)q(c)o(k)f(is)h Ff(r)q Fm(,)f(the)h(blo)q(c)o(k)f(cyclic)e(decomp)q (osition)-57 2578 y(ma)o(y)j(b)q(e)h(written,)552 2667 y Ff(m)e Fl(7!)672 2594 y Fb( $)737 2633 y Ff(m)f Fm(mo)q(d)h Ff(T)p 737 2655 200 2 v 825 2701 a(r)941 2594 y Fb(\045)976 2667 y Ff(;)1006 2606 y Fb(\026)1035 2633 y Ff(m)p 1035 2655 43 2 v 1039 2701 a(T)1083 2606 y Fb(\027)1115 2667 y Ff(;)i(m)e Fm(mo)q(d)f Ff(r)1332 2594 y Fb(!)1896 2667 y Fm(\(11\))939 2825 y(19)p eop %%Page: 20 22 21 bop -50 77 825 2 v -50 143 2 67 v -7 123 a Fm(m)p 75 143 V 82 w(0)42 b(1)g(2)f(3)h(4)f(5)h(6)f(7)h(8)g(9)p 773 143 V -50 145 825 2 v -50 211 2 67 v -1 191 a(p)p 75 211 V 91 w(0)g(0)g(0)f(0)h(1)f (1)h(1)f(1)h(2)g(2)p 773 211 V -50 212 825 2 v -50 279 2 67 v 6 259 a(i)p 75 279 V 97 w(0)g(1)g(2)f(3)h(0)f(1)h(2)f(3)h(0)g(1)p 773 279 V -50 280 825 2 v 271 359 a(\(a\))17 b(Blo)q(c)o(k)p 1165 77 V 1165 143 2 67 v 1208 123 a(m)p 1290 143 V 82 w(0)42 b(1)g(2)f(3)h(4)f(5)h(6)f(7)h(8)g(9)p 1988 143 V 1165 145 825 2 v 1165 211 2 67 v 1214 191 a(p)p 1290 211 
V 91 w(0)g(1)g(2)f(0)h(1)f(2)h(0) f(1)h(2)g(0)p 1988 211 V 1165 212 825 2 v 1165 279 2 67 v 1221 259 a(i)p 1290 279 V 97 w(0)g(0)g(0)f(1)h(1)f(1)h(2)f(2)h(2)g(3)p 1988 279 V 1165 280 825 2 v 1479 359 a(\(b\))16 b(Cyclic)-57 467 y(Figure)21 b(8:)31 b(Examples)20 b(of)h(blo)q(c)o(k)g(and)h(cyclic)d (decomp)q(ositions)h(of)i Ff(M)27 b Fm(=)22 b(10)g(data)g(ob)s(jects)f(o)o(v) o(er)f Ff(P)30 b Fm(=)22 b(3)-57 533 y(pro)q(cesses.)p 111 602 1710 2 v 111 669 2 67 v 134 649 a(m)p 196 669 V 67 w(0)47 b(1)f(2)g(3)g(4)h(5)f(6)g(7)g(8)g(9)22 b(10)g(11)h(12)f(13)g(14)g(15)h(16)f (17)g(18)g(19)g(20)h(21)f(22)p 1834 669 V 111 670 1710 2 v 111 736 2 67 v 141 717 a(p)p 196 736 V 75 w(0)47 b(0)f(1)g(1)g(2)h(2)f(0)g(0) g(1)g(1)h(2)f(2)g(0)g(0)h(1)f(1)g(2)g(2)g(0)h(0)f(1)g(1)g(2)p 1834 736 V 111 738 1710 2 v 111 804 2 67 v 141 784 a(b)p 196 804 V 75 w(0)h(0)f(0)g(0)g(0)h(0)f(1)g(1)g(1)g(1)h(1)f(1)g(2)g(2)h(2)f(2)g(2) g(2)g(3)h(3)f(3)g(3)g(3)p 1834 804 V 111 806 1710 2 v 111 872 2 67 v 147 852 a(i)p 196 872 V 82 w(0)h(1)f(0)g(1)g(0)h(1)f(0)g(1)g(0)g(1)h (0)f(1)g(0)g(1)h(0)f(1)g(0)g(1)g(0)h(1)f(0)g(1)g(0)p 1834 872 V 111 874 1710 2 v 792 940 a(\(a\))16 b Ff(m)e Fl(7!)g Fm(\()p Ff(p;)8 b(b;)g(i)p Fm(\))p 111 983 V 111 1049 2 67 v 141 1030 a(p)p 196 1049 V 75 w(0)47 b(0)f(0)g(0)g(0)h(0)f(0)g(0)g(1)g(1)h(1)f(1)g(1)g (1)h(1)f(1)g(2)g(2)g(2)h(2)f(2)g(2)g(2)p 1834 1049 V 111 1051 1710 2 v 111 1117 2 67 v 141 1097 a(b)p 196 1117 V 75 w(0)h(0)f(1)g(1)g(2)h (2)f(3)g(3)g(0)g(0)h(1)f(1)g(2)g(2)h(3)f(3)g(0)g(0)g(1)h(1)f(2)g(2)g(3)p 1834 1117 V 111 1119 1710 2 v 111 1185 2 67 v 147 1165 a(i)p 196 1185 V 82 w(0)h(1)f(0)g(1)g(0)h(1)f(0)g(1)g(0)g(1)h(0)f(1)g(0)g(1)h(0)f (1)g(0)g(1)g(0)h(1)f(0)g(1)g(0)p 1834 1185 V 111 1187 1710 2 v 111 1253 2 67 v 134 1233 a(m)p 196 1253 V 67 w(0)h(1)f(6)g(7)22 b(12)g(13)g(18)h(19)46 b(2)g(3)h(8)f(9)22 b(14)g(15)g(20)g(21)47 b(4)f(5)22 b(10)g(11)g(16)h(17)f(22)p 1834 1253 V 111 1255 1710 2 v 791 1321 a(\(b\))16 b(\()p Ff(p;)8 b(b;)g(i)p Fm(\))13 b Fl(7!)h Ff(m)-57 1429 
y Fm(Figure)20 b(9:)29 b(An)19 b(example)f(of)j(the)e (blo)q(c)o(k)h(cyclic)e(decomp)q(osition)h(of)h Ff(M)26 b Fm(=)20 b(23)h(data)g(ob)s(jects)f(o)o(v)o(er)f Ff(P)27 b Fm(=)21 b(3)-57 1495 y(pro)q(cesses)d(for)g(a)g(blo)q(c)o(k)f(size)g(of)h Ff(r)g Fm(=)e(2.)26 b(\(a\))18 b(sho)o(ws)h(the)e(mapping)g(from)g(global)h(index,)e Ff(m)p Fm(,)i(to)g(the)f(triplet)-57 1561 y(\()p Ff(p;)8 b(b;)g(i)p Fm(\),)15 b(and)i(\(b\))f(sho)o(ws)i(the)e(in)o(v)o(erse)e(mapping.)-57 1706 y(where)19 b Ff(T)26 b Fm(=)20 b Ff(r)q(P)7 b Fm(.)32 b(It)19 b(should)h(b)q(e)g(noted)g(that)g(this)f(rev)o(erts)g(to)h(the)f (cyclic)f(decomp)q(osition)h(when)h Ff(r)g Fm(=)g(1,)-57 1773 y(with)d(lo)q(cal)g(index)g Ff(i)e Fm(=)g(0)j(for)f(all)g(blo)q(c)o(ks.)24 b(A)17 b(blo)q(c)o(k)f(decomp)q(osition)h(is)g(reco)o(v)o(ered)e(when)i Ff(r)g Fm(=)f Ff(L)p Fm(,)h(in)g(whic)o(h)-57 1839 y(case)e(there)f(is)g(a)h (single)f(blo)q(c)o(k)g(in)g(eac)o(h)g(pro)q(cess)h(with)g(blo)q(c)o(k)f(n)o (um)o(b)q(er)f Ff(b)g Fm(=)h(0.)21 b(The)15 b(in)o(v)o(erse)e(mapping)g(of)i (the)-57 1905 y(triplet)g(\()p Ff(p;)8 b(b;)g(i)p Fm(\))16 b(to)g(a)h(global)g(index)e(is)h(giv)o(en)f(b)o(y)l(,)630 2021 y(\()p Ff(p;)8 b(b;)g(i)p Fm(\))13 b Fl(7!)h Ff(B)s(r)d Fm(+)h Ff(i)h Fm(=)h Ff(pr)f Fm(+)e Ff(bT)17 b Fm(+)11 b Ff(i)599 b Fm(\(12\))-57 2137 y(where)15 b Ff(B)h Fm(=)e Ff(p)8 b Fm(+)g Ff(bP)23 b Fm(is)14 b(the)h(global)g(blo)q(c)o(k)g(n)o(um)o(b)q(er.)k(The)14 b(blo)q(c)o(k)h(cyclic)e(decomp)q(osition)h(is)g(one)h(of)h(the)e(data)-57 2203 y(distributions)g(supp)q(orted)i(b)o(y)e(High)g(P)o(erformance)f(F)l (ortran)i(\(HPF\))f([42)q(],)g(and)h(has)g(b)q(een)g(previously)e(used,)-57 2269 y(in)j(one)h(form)e(or)i(another,)g(b)o(y)f(sev)o(eral)f(researc)o(hers) h(\(see)g([1,)g(4,)g(5)q(,)g(9,)g(23)q(,)g(27)q(,)g(50)q(,)g(52)q(,)g(54])g (for)h(examples)e(of)-57 2336 y(its)h(use\).)21 b(The)16 b(blo)q(c)o(k)g (cyclic)e(decomp)q(osition)i(is)g(illustrated)f(with)h(an)h(example)d(in)i (Figure)g(9.)-57 2423 
y(The)i(form)e(of)i(the)g(blo)q(c)o(k)f(cyclic)f (decomp)q(osition)h(giv)o(en)f(b)o(y)i(Eq.)f(11)h(ensures)g(that)g(the)g(blo) q(c)o(k)f(with)h(global)-57 2489 y(index)g(0)g(is)h(placed)e(in)i(pro)q(cess) g(0,)f(the)h(next)e(blo)q(c)o(k)h(is)h(placed)e(in)h(pro)q(cess)i(1,)e(and)h (so)g(on.)29 b(Ho)o(w)o(ev)o(er,)16 b(it)i(is)-57 2556 y(sometimes)12 b(necessary)j(to)g(o\013set)h(the)f(pro)q(cesses)g(relativ)o(e)f(to)h(the)g (global)g(blo)q(c)o(k)g(index)f(so)i(that,)f(in)g(general,)-57 2622 y(the)21 b(\014rst)h(blo)q(c)o(k)e(is)h(placed)g(in)g(pro)q(cess)h Ff(p)743 2629 y Fe(0)763 2622 y Fm(,)g(the)f(next)g(in)g(pro)q(cess)g Ff(p)1263 2629 y Fe(0)1298 2622 y Fm(+)15 b(1,)22 b(and)g(so)g(on.)36 b(W)l(e,)22 b(therefore,)-57 2688 y(generalize)e(the)h(blo)q(c)o(k)g(cyclic)e (decomp)q(osition)i(b)o(y)g(replacing)g Ff(m)g Fm(on)h(the)f(righ)o(thand)h (side)f(of)g(Eq.)g(11)i(b)o(y)939 2825 y(20)p eop %%Page: 21 23 22 bop -57 125 a Ff(m)-14 107 y Fc(0)11 125 y Fm(=)14 b Ff(m)d Fm(+)g Ff(r)q(p)213 132 y Fe(0)249 125 y Fm(to)17 b(giv)o(e,)296 257 y Ff(m)41 b Fl(7!)472 184 y Fb( $)536 223 y Ff(m)579 205 y Fc(0)604 223 y Fm(mo)q(d)14 b Ff(T)p 536 245 211 2 v 630 291 a(r)752 184 y Fb(\045)787 257 y Ff(;)817 184 y Fb($)848 223 y Ff(m)891 205 y Fc(0)p 848 245 55 2 v 858 291 a Ff(T)908 184 y Fb(\045)942 257 y Ff(;)j(m)1016 236 y Fc(0)1041 257 y Fm(mo)q(d)c Ff(r)1171 184 y Fb(!)386 398 y Fm(=)472 325 y Fb( )o($)569 364 y Ff(m)g Fm(mo)q(d)h Ff(T)p 569 387 200 2 v 657 432 a(r)773 325 y Fb(\045)811 398 y Fm(+)d Ff(p)884 405 y Fe(0)904 325 y Fb(!)951 398 y Fm(mo)q(d)i Ff(P)q(;)1121 338 y Fb(\026)1150 364 y Ff(m)e Fm(+)g Ff(r)q(p)1300 371 y Fe(0)p 1150 387 171 2 v 1217 432 a Ff(T)1325 338 y Fb(\027)1358 398 y Ff(;)16 b(m)d Fm(mo)q(d)h Ff(r)1575 325 y Fb(!)1616 398 y Ff(:)266 b Fm(\(13\))-57 530 y(Equation)17 b(12)g(ma)o(y)d(also)j(b)q(e)g(generalized)e(to,)558 640 y(\()p Ff(p;)8 b(b;)g(i)p Fm(\))13 b Fl(7!)h Ff(B)s(r)e Fm(+)f Ff(i)i Fm(=)h(\()p Ff(p)e Fl(\000)f Ff(p)1113 647 y Fe(0)1133 
640 y Fm(\))p Ff(r)h Fm(+)f Ff(bT)17 b Fm(+)11 b Ff(i)528 b Fm(\(14\))-57 750 y(where)20 b(no)o(w)h(the)f(global)h(blo)q(c)o (k)f(n)o(um)o(b)q(er)f(is)h(giv)o(en)g(b)o(y)g Ff(B)j Fm(=)e(\()p Ff(p)15 b Fl(\000)e Ff(p)1255 757 y Fe(0)1276 750 y Fm(\))g(+)h Ff(bP)7 b Fm(.)34 b(It)20 b(should)h(b)q(e)g(noted)g(that)-57 816 y(in)c(pro)q(cesses)i(with)e Ff(p)g(<)f(p)445 823 y Fe(0)465 816 y Fm(,)i(blo)q(c)o(k)f(0)h(is)f(not)h(within)g(the)f(range)h(of)g(the)g (blo)q(c)o(k)f(cyclic)e(mapping)i(and)h(it)g(is,)-57 883 y(therefore,)d(an)i (error)f(to)h(reference)d(it)i(in)g(an)o(y)g(w)o(a)o(y)l(.)-57 969 y(In)f(decomp)q(osing)g(an)h Ff(M)f Fl(\002)9 b Ff(N)21 b Fm(matrix)14 b(w)o(e)h(apply)g(indep)q(enden)o(t)f(blo)q(c)o(k)h(cyclic)f (decomp)q(ositions)g(in)i(the)f(ro)o(w)-57 1036 y(and)e(column)e(directions.) 19 b(Th)o(us,)13 b(supp)q(ose)h(the)e(matrix)f(ro)o(ws)i(are)g(distributed)e (with)i(blo)q(c)o(k)f(size)f Ff(r)j Fm(and)f(o\013set)-57 1102 y Ff(p)-33 1109 y Fe(0)3 1102 y Fm(o)o(v)o(er)i Ff(P)23 b Fm(pro)q(cesses)17 b(b)o(y)e(the)h(blo)q(c)o(k)f(cyclic)f(mapping)h Ff(\026)1014 1109 y Fd(r)o(;p)1057 1114 y Fa(0)1074 1109 y Fd(;P)1113 1102 y Fm(,)h(and)g(the)g(matrix)e(columns)h(are)h(distributed)-57 1168 y(with)j(blo)q(c)o(k)f(size)g Ff(s)h Fm(and)g(o\013set)h Ff(q)576 1175 y Fe(0)614 1168 y Fm(o)o(v)o(er)e Ff(Q)g Fm(pro)q(cesses)i(b)o (y)e(the)g(blo)q(c)o(k)h(cyclic)d(mapping)j Ff(\027)1644 1175 y Fd(s;q)1686 1180 y Fa(0)1703 1175 y Fd(;Q)1742 1168 y Fm(.)29 b(Then)19 b(the)-57 1234 y(matrix)c(elemen)n(t)f(indexed)h(globally)h(b)o(y)g (\()p Ff(m;)8 b(n)p Fm(\))15 b(is)h(mapp)q(ed)g(as)h(follo)o(ws,)777 1344 y Ff(m)890 1315 y Fd(\026)861 1344 y Fl(7\000)-9 b(!)42 b Fm(\()p Ff(p;)8 b(b;)g(i)p Fm(\))790 1423 y Ff(n)891 1396 y Fd(\027)861 1423 y Fl(7\000)-9 b(!)42 b Fm(\()p Ff(q)r(;)8 b(d;)g(j)s Fm(\))p Ff(:)745 b Fm(\(15\))-57 1533 y(The)17 b(decomp)q(osition) f(of)h(the)f(matrix)f(can)i(b)q(e)g(regarded)g(as)h(the)e(tensor)h(pro)q 
(duct)h(of)f(the)f(ro)o(w)h(and)h(column)-57 1599 y(decomp)q(ositions,)d(and) i(w)o(e)f(can)g(write,)638 1709 y(\()p Ff(m;)8 b(n)p Fm(\))13 b Fl(7!)h Fm(\()8 b(\()p Ff(p;)g(q)r Fm(\))p Ff(;)16 b Fm(\()p Ff(b;)8 b(d)p Fm(\))p Ff(;)16 b Fm(\()p Ff(i;)8 b(j)s Fm(\))g(\))p Ff(:)607 b Fm(\(16\))-57 1819 y(The)18 b(blo)q(c)o(k)f(cyclic)f(matrix)g (decomp)q(osition)g(giv)o(en)h(b)o(y)g(Eqs.)h(15)g(and)h(16)f(distributes)f (blo)q(c)o(ks)h(of)g(size)f Ff(r)c Fl(\002)f Ff(s)-57 1885 y Fm(to)17 b(a)g(mesh)f(of)h Ff(P)i Fl(\002)11 b Ff(Q)17 b Fm(pro)q(cesses.)23 b(W)l(e)17 b(shall)f(refer)g(to)i(this)e(mesh)g(as)h(the) g Fg(pr)n(o)n(c)n(ess)g(template)t Fm(,)g(and)h(refer)e(to)-57 1951 y(pro)q(cesses)f(b)o(y)e(their)h(p)q(osition)h(in)e(the)h(template.)19 b(Equation)14 b(16)h(sa)o(ys)g(that)f(global)h(index)e(\()p Ff(m;)8 b(n)p Fm(\))14 b(is)g(mapp)q(ed)-57 2017 y(to)k(pro)q(cess)h(\()p Ff(p;)8 b(q)r Fm(\),)17 b(where)g(it)h(is)f(stored)h(in)g(the)f(blo)q(c)o(k)g (at)h(lo)q(cation)h(\()p Ff(b;)8 b(d)p Fm(\))17 b(in)h(a)g(t)o(w)o (o-dimensional)e(arra)o(y)i(of)-57 2084 y(blo)q(c)o(ks.)i(Within)13 b(this)g(blo)q(c)o(k)f(it)h(is)g(stored)h(at)f(lo)q(cation)h(\()p Ff(i;)8 b(j)s Fm(\).)20 b(The)13 b(decomp)q(osition)f(is)h(completely)e(sp)q (eci\014ed)-57 2150 y(b)o(y)18 b(the)g(parameters)f Ff(r)q Fm(,)i Ff(s)p Fm(,)f Ff(p)488 2157 y Fe(0)508 2150 y Fm(,)h Ff(q)563 2157 y Fe(0)582 2150 y Fm(,)f Ff(P)7 b Fm(,)19 b(and)g Ff(Q)p Fm(.)27 b(In)18 b(Figure)f(10)i(an)g(example)d(is)i(giv)o(en)g(of)g (the)g(blo)q(c)o(k)g(cyclic)-57 2216 y(decomp)q(osition)d(of)g(a)h(36)10 b Fl(\002)g Fm(80)16 b(matrix)d(for)j(blo)q(c)o(k)f(size)g(4)10 b Fl(\002)f Fm(5,)15 b(a)h(pro)q(cess)g(template)e(3)9 b Fl(\002)h Fm(4,)15 b(and)h(a)g(template)-57 2282 y(o\013set)h(\()p Ff(p)115 2289 y Fe(0)135 2282 y Ff(;)8 b(q)179 2289 y Fe(0)198 2282 y Fm(\))14 b(=)g(\(0)p Ff(;)8 b Fm(0\).)22 b(Figure)15 b(11)j(sho)o(ws)f(the) f(same)f(example)f(but)i(for)h(a)f(template)f(o\013set)i(of)f(\(1)p Ff(;)8 b Fm(2\).)-57 
2369 y(The)16 b(blo)q(c)o(k)g(cyclic)e(decomp)q(osition) h(can)h(repro)q(duce)g(most)g(of)g(the)g(data)h(distributions)f(commonly)d (used)j(in)-57 2435 y(linear)h(algebra)h(computations)f(on)h(parallel)f (computers.)24 b(F)l(or)17 b(example,)f(if)h Ff(Q)e Fm(=)h(1)i(and)h Ff(r)e Fm(=)f Fl(d)p Ff(M)r(=P)7 b Fl(e)19 b Fm(the)-57 2501 y(blo)q(c)o(k)g(ro)o(w)h(decomp)q(osition)f(is)h(obtained.)32 b(Similarly)l(,)18 b Ff(P)27 b Fm(=)20 b(1)g(and)h Ff(s)f Fm(=)f Fl(d)p Ff(N)q(=Q)p Fl(e)i Fm(giv)o(es)e(a)h(blo)q(c)o(k)g(column)-57 2568 y(decomp)q(osition.)g(These)15 b(decomp)q(ositions,)f(together)h(with)g (ro)o(w)g(and)h(column)d(cyclic)g(decomp)q(ositions,)h(are)-57 2634 y(sho)o(wn)22 b(in)f(Figure)g(12.)38 b(Other)21 b(commonly)e(used)j(blo) q(c)o(k)f(cyclic)e(matrix)h(decomp)q(ositions)h(are)g(sho)o(wn)h(in)-57 2700 y(Figure)16 b(13.)939 2825 y(21)p eop %%Page: 22 24 23 bop -57 125 a Fh(6)83 b(P)n(arallel)27 b(Implemen)n(tation)-57 262 y Fm(In)14 b(this)h(section)f(w)o(e)g(describ)q(e)g(the)g(parallel)g (implem)o(e)o(n)o(tation)e(of)j(LU)f(factorization,)h(with)f(partial)h(piv)o (oting)-57 328 y(o)o(v)o(er)20 b(ro)o(ws,)i(for)f(a)h(blo)q(c)o (k-partitioned)e(matrix.)33 b(The)21 b(matrix,)f Ff(A)p Fm(,)h(to)h(b)q(e)f (factored)g(is)f(assumed)h(to)g(ha)o(v)o(e)-57 394 y(a)h(blo)q(c)o(k)e (cyclic)f(decomp)q(osition,)j(and)g(at)f(the)g(end)g(of)h(the)f(computation)f (is)h(o)o(v)o(erwritten)f(b)o(y)h(the)g(lo)o(w)o(er)-57 460 y(and)d(upp)q(er)g(triangular)g(factors,)g Ff(L)g Fm(and)g Ff(U)5 b Fm(.)26 b(This)18 b(implici)o(tly)c(determines)i(the)h(decomp)q (osition)g(of)h Ff(L)f Fm(and)-57 527 y Ff(U)5 b Fm(.)28 b(Quite)18 b(a)h(high-lev)o(el)e(description)h(is)g(giv)o(en)f(here)h(since)g(the)g (details)g(of)h(the)f(parallel)g(implem)o(en)o(tation)-57 593 y(in)o(v)o(olv)o(e)c(optimization)g(issues)j(that)f(will)g(b)q(e)g(addressed) h(in)f(Section)f(7.)-57 681 y(The)f(sequen)o(tial)f(LU)i(factorization)f 
(algorithm)f(describ)q(ed)h(in)g(Section)g(4.2)h(uses)f(square)h(blo)q(c)o (ks.)20 b(Although)-57 747 y(in)c(the)g(parallel)f(algorithm)g(w)o(e)h(could) g(c)o(ho)q(ose)h(to)f(decomp)q(ose)g(the)g(matrix)e(using)j(nonsquare)g(blo)q (c)o(ks,)e(this)-57 813 y(w)o(ould)20 b(result)g(in)g(a)g(more)f(complicated) f(co)q(de,)j(and)f(additional)g(sources)h(of)f(concurren)o(t)g(o)o(v)o (erhead.)32 b(F)l(or)-57 879 y(LU)16 b(factorization)g(w)o(e,)f(therefore,)g (restrict)g(the)h(decomp)q(osition)f(to)i(use)f(only)g(square)g(blo)q(c)o (ks,)f(so)i(that)f(the)-57 945 y(blo)q(c)o(ks)h(used)h(to)g(decomp)q(ose)e (the)h(matrix)f(are)i(the)f(same)f(as)i(those)g(used)g(to)g(partition)f(the)g (computation.)-57 1012 y(If)e(the)g(blo)q(c)o(k)g(size)g(is)g Ff(r)c Fl(\002)f Ff(r)q Fm(,)16 b(then)f(an)h Ff(M)f Fl(\002)9 b Ff(N)21 b Fm(matrix)14 b(consists)i(of)g Ff(M)1263 1019 y Fd(b)1290 1012 y Fl(\002)9 b Ff(N)1377 1019 y Fd(b)1410 1012 y Fm(blo)q(c)o(ks,)15 b(where)g Ff(M)1758 1019 y Fd(b)1790 1012 y Fm(=)e Fl(d)p Ff(M)r(=r)q Fl(e)-57 1078 y Fm(and)k Ff(N)77 1085 y Fd(b)108 1078 y Fm(=)d Fl(d)p Ff(N)q(=r)q Fl(e)p Fm(.)-57 1165 y(As)i(discussed)h(in)f(Section)g(4.2,)g(LU)h(factorization)g(pro)q (ceeds)f(in)g(a)h(series)f(of)h(sequen)o(tial)e(steps)i(indexed)f(b)o(y)-57 1232 y Ff(k)g Fm(=)e(0)p Ff(;)8 b Fm(min)e(\()p Ff(M)237 1239 y Fd(b)255 1232 y Ff(;)i(N)316 1239 y Fd(b)333 1232 y Fm(\))j Fl(\000)g Fm(1,)16 b(in)g(eac)o(h)g(of)g(whic)o(h)g(the)g(follo)o(wing)g (three)f(tasks)i(are)f(p)q(erformed,)3 1361 y(1.)24 b(factor)g(the)g Ff(k)r Fm(th)h(column)d(of)j(blo)q(c)o(ks,)g(p)q(erforming)f(piv)o(oting)f (if)h(necessary)l(.)45 b(This)24 b(ev)m(aluates)g(the)65 1427 y(matrices)14 b Ff(L)293 1434 y Fe(0)313 1427 y Fm(,)i Ff(L)376 1434 y Fe(1)396 1427 y Fm(,)g(and)h Ff(U)554 1434 y Fe(0)590 1427 y Fm(in)f(Figure)f(6.)3 1535 y(2.)24 b(ev)m(aluate)16 b(the)g Ff(k)r Fm(th)g(blo)q(c)o(k)g(ro)o(w)h(of)f Ff(U)22 b Fm(b)o(y)15 b(solving)i(the)f(lo)o(w)o(er)f(triangular)i(system)d Ff(L)1627 1542 y 
Fe(0)1647 1535 y Ff(U)1680 1542 y Fe(1)1714 1535 y Fm(=)g Ff(C)t Fm(.)3 1642 y(3.)24 b(do)17 b(a)f(rank-)p Ff(r)i Fm(up)q(date)f(on)g(the)f(trailing)g(submatrix)f Ff(E)s Fm(,)h(replacing)f(it)h(with)g Ff(E)1520 1624 y Fc(0)1546 1642 y Fm(=)d Ff(E)h Fl(\000)d Ff(L)1730 1649 y Fe(1)1750 1642 y Ff(U)1783 1649 y Fe(1)1803 1642 y Fm(.)-57 1772 y(W)l(e)20 b(no)o(w)g(consider)g(the)g(parallel)f(implem)o(en)o(tati)o(on)f(of)i(eac)o (h)g(of)g(these)g(tasks.)33 b(The)20 b(computation)g(in)f(the)-57 1838 y(factorization)j(step)g(in)o(v)o(olv)o(es)e(a)i(single)g(column)e(of)j (blo)q(c)o(ks,)g(and)f(these)g(lie)f(in)g(a)i(single)e(column)g(of)h(the)-57 1904 y(pro)q(cess)h(template.)36 b(In)21 b(the)h Ff(k)r Fm(th)g (factorization)f(step,)i(eac)o(h)f(of)g(the)f Ff(r)j Fm(columns)c(in)i(blo)q (c)o(k)f(column)f Ff(k)k Fm(is)-57 1970 y(pro)q(cessed)17 b(in)g(turn.)23 b(Consider)17 b(the)f Ff(i)p Fm(th)h(column)e(in)i(blo)q(c)o(k)f(column)f Ff(k)r Fm(.)23 b(The)17 b(piv)o(ot)f(is)h(selected)e(b)o(y)i(\014nding)-57 2036 y(the)k(elemen)o(t)d(with)j(largest)g(absolute)g(v)m(alue)g(in)g(this)g (column)e(b)q(et)o(w)o(een)i(ro)o(w)g Ff(k)r(r)16 b Fm(+)e Ff(i)21 b Fm(and)g(the)g(last)g(ro)o(w,)-57 2103 y(inclusiv)o(e.)d(The)d (elemen)o(ts)d(in)o(v)o(olv)o(ed)g(in)i(the)h(piv)o(ot)f(searc)o(h)g(at)h (this)f(stage)h(are)g(sho)o(wn)g(shaded)g(in)g(Figure)f(14.)-57 2169 y(Ha)o(ving)f(selected)f(the)h(piv)o(ot,)f(the)i(v)m(alue)f(of)g(the)g (piv)o(ot)g(and)h(its)f(ro)o(w)g(are)h(broadcast)g(to)g(all)f(other)g(pro)q (cessors.)-57 2235 y(Next,)f(piv)o(oting)f(is)i(p)q(erformed)e(b)o(y)h(exc)o (hanging)g(the)g(en)o(tire)f(ro)o(w)h Ff(k)r(r)t Fm(+)s Ff(i)h Fm(with)f(the)g(ro)o(w)h(con)o(taining)f(the)g(piv)o(ot.)-57 2301 y(W)l(e)j(exc)o(hange)g(en)o(tire)f(ro)o(ws,)i(rather)g(than)g(just)g (the)f(part)h(to)g(the)f(righ)o(t)h(of)f(the)h(columns)e(already)h(factored,) -57 2368 y(in)i(order)h(to)h(simplify)c(the)i(application)h(of)g(the)f(piv)o 
(ots)h(to)g(the)g(righ)o(thand)g(side)f(in)h(an)o(y)f(subsequen)o(t)h(solv)o (e)-57 2434 y(phase.)35 b(Finally)l(,)20 b(eac)o(h)g(v)m(alue)g(in)g(the)g (column)g(b)q(elo)o(w)g(the)g(piv)o(ot)g(is)h(divided)e(b)o(y)h(the)g(piv)o (ot.)34 b(If)20 b(a)h(cyclic)-57 2500 y(column)14 b(decomp)q(osition)g(is)h (used,)g(lik)o(e)f(that)i(sho)o(wn)g(in)f(Figure)f(12\(d\),)i(only)f(one)h (pro)q(cessor)g(is)f(in)o(v)o(olv)o(ed)e(in)-57 2566 y(the)h(factorization)g (of)g(the)g(blo)q(c)o(k)g(column,)e(and)j(no)f(comm)o(unication)d(is)j (necessary)g(b)q(et)o(w)o(een)f(the)h(pro)q(cesses.)-57 2632 y(Ho)o(w)o(ev)o(er,)j(in)h(general)g Ff(P)26 b Fm(pro)q(cesses)19 b(are)g(in)o(v)o(olv)o(ed,)d(and)k(comm)o(uni)o(cation)c(is)i(necessary)h(in) f(selecting)f(the)-57 2699 y(piv)o(ot,)e(and)i(exc)o(hanging)f(the)g(piv)o (ot)g(ro)o(ws.)939 2825 y(22)p eop %%Page: 23 25 24 bop -57 987 a @beginspecial @setspecial %%BeginDocument: fig3.3a.ps /Mydict 100 dict def Mydict begin /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 40 0 translate 0.65 0.65 scale /Helvetica 
findfont 11 scalefont setfont 0.85 setgray /r 4 def /s 5 def /tc 16 def /tr 12 def /P 3 def /Q 4 def /Del 6 def 0 0 moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 2 mul 0 moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul P r Del mul mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 3 mul P r Del mul mul moveto Q s Del mul mul P r Del mul mul Box fill 0 P r Del mul mul 2 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 2 mul P r Del mul mul 2 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul P r Del mul mul 3 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 3 mul P r Del mul mul 3 mul moveto Q s Del mul mul P r Del mul mul Box fill 0 setgray 0 0 moveto s Del mul r Del mul tc tr Grid 2 setlinewidth 0 0 moveto s Del tc mul mul r Del tr mul mul 1 1 Grid 0 0 moveto [ (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) 
(2,3) (2,0) (2,1) (2,2) (2,3) ] s Del mul r Del mul tc tr GridDecomp 0 1 tr P idiv 1 sub { /iy exch def 0 1 tc Q idiv 1 sub { /ix exch def 0 setgray s Del mul ix Q mul mul r Del mul iy P mul P 1 sub add mul moveto s Del mul r Del mul Box fill 1 setgray s Del mul ix Q mul 0.5 add mul (0,0) stringwidth pop 2 div sub r Del mul P 0.5 sub iy P mul add mul yshift sub moveto (0,0) show } for } for 0 setgray /Helvetica findfont 15 scalefont setfont 2 setlinewidth 0 r Del tr mul mul moveto 0 25 rlineto 0 -25 rmoveto -25 0 rlineto stroke (p,q) dup stringwidth pop 4 add neg r Del mul tr mul 7 add moveto show (D) dup stringwidth pop s Del mul tc mul exch sub 2 div r Del mul tr mul 15 add moveto show r Del tr mul mul 2 div 3 add 7 sub -25 exch moveto (B) show 0 1 tc 1 sub { dup 0.5 add s Del mul mul exch 2 string cvs dup stringwidth pop 3 -1 roll exch 0.5 mul sub r Del tr mul mul 4 add moveto show} for 0 1 tr 1 sub { dup 0.5 add r Del mul mul r Del tr mul mul sub neg exch 2 string cvs dup stringwidth pop 4 add neg 3 -1 roll 5 sub moveto show} for end %%EndDocument @endspecial 253 1066 a Fm(\(a\))17 b(Assignmen)o(t)d(of)j(global)f(blo)q(c)o (k)g(indices,)f(\()p Ff(B)s(;)8 b(D)q Fm(\),)16 b(to)g(pro)q(cesses,)h(\()p Ff(p;)8 b(q)r Fm(\).)-57 2062 y @beginspecial @setspecial %%BeginDocument: fig3.3b.ps /Mydict 100 dict def Mydict begin /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def 
/y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 40 0 translate 0.65 0.65 scale /Helvetica findfont 11 scalefont setfont 0.85 setgray /r 4 def /s 5 def /tc 16 def /tr 12 def /P 3 def /Q 4 def /Del 6 def 0 0 moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul 2 mul 0 moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul tc Q idiv r Del mul mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul 3 mul tc Q idiv r Del mul mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill 0 tc Q idiv r Del mul mul 2 mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul 2 mul tc Q idiv r Del mul mul 2 mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill 0 setgray 0 0 moveto s Del mul r Del mul tc tr Grid 0 0 moveto [ (0,0) (0,4) (0,8) (0,12) (0,1) (0,5) (0,9) (0,13) (0,2) (0,6) (0,10) (0,14) (0,3) (0,7) (0,11) (0,15) (3,0) (3,4) (3,8) (3,12) (3,1) (3,5) (3,9) (3,13) (3,2) (3,6) (3,10) (3,14) (3,3) (3,7) (3,11) (3,15) (6,0) (6,4) (6,8) (6,12) (6,1) (6,5) (6,9) (6,13) (6,2) (6,6) (6,10) (6,14) (6,3) (6,7) (6,11) (6,15) (9,0) (9,4) (9,8) (9,12) (9,1) (9,5) (9,9) (9,13) (9,2) (9,6) (9,10) (9,14) (9,3) (9,7) (9,11) (9,15) (1,0) (1,4) (1,8) (1,12) (1,1) (1,5) (1,9) (1,13) (1,2) (1,6) (1,10) (1,14) (1,3) (1,7) (1,11) (1,15) (4,0) (4,4) (4,8) (4,12) (4,1) (4,5) (4,9) (4,13) (4,2) (4,6) (4,10) (4,14) (4,3) (4,7) (4,11) (4,15) (7,0) (7,4) (7,8) (7,12) (7,1) (7,5) (7,9) (7,13) (7,2) (7,6) (7,10) (7,14) (7,3) (7,7) (7,11) (7,15) (10,0) (10,4) (10,8) (10,12) (10,1) (10,5) (10,9) (10,13) (10,2) (10,6) (10,10) (10,14) (10,3) (10,7) (10,11) 
(10,15) (2,0) (2,4) (2,8) (2,12) (2,1) (2,5) (2,9) (2,13) (2,2) (2,6) (2,10) (2,14) (2,3) (2,7) (2,11) (2,15) (5,0) (5,4) (5,8) (5,12) (5,1) (5,5) (5,9) (5,13) (5,2) (5,6) (5,10) (5,14) (5,3) (5,7) (5,11) (5,15) (8,0) (8,4) (8,8) (8,12) (8,1) (8,5) (8,9) (8,13) (8,2) (8,6) (8,10) (8,14) (8,3) (8,7) (8,11) (8,15) (11,0) (11,4) (11,8) (11,12) (11,1) (11,5) (11,9) (11,13) (11,2) (11,6) (11,10) (11,14) (11,3) (11,7) (11,11) (11,15) ] s Del mul r Del mul tc tr GridDecomp 0 r Del mul tr P idiv mul P 1 sub mul moveto s Del tc Q idiv mul mul r Del tr P idiv mul mul Box fill 1 setgray 0 r Del mul tr P idiv mul P 1 sub mul moveto [ (0,0) (0,4) (0,8) (0,12) (3,0) (3,4) (3,8) (3,12) (6,0) (6,4) (6,8) (6,12) (9,0) (9,4) (9,8) (9,12) ] s Del mul r Del mul tc Q idiv tr P idiv GridDecomp 1 1 tc Q idiv 1 sub { s Del mul mul r Del mul tr P idiv mul P 1 sub mul moveto 0 r Del mul tr P idiv mul rlineto } for stroke 1 1 tr P idiv 1 sub { r Del mul mul r Del mul tr P idiv mul P 1 sub mul add 0 exch moveto s Del mul tc Q idiv mul 0 rlineto } for stroke 0 setgray 0 r Del mul tr P idiv mul P 1 sub mul moveto tc Q idiv s Del mul mul tr P idiv r Del mul mul Box stroke 2 setlinewidth /Helvetica findfont 15 scalefont setfont 0 r Del tr mul mul moveto 0 25 rlineto 0 -25 rmoveto -25 0 rlineto stroke (B,D) dup stringwidth pop 3 add neg r Del mul tr mul 6 add moveto show (q) dup stringwidth pop s Del mul tc mul exch sub 2 div r Del mul tr mul 15 add moveto show r Del tr mul mul 2 div 3 add 7 sub -25 exch moveto (p) show 0 1 Q 1 sub { dup 0.5 add tr P idiv s Del mul mul mul exch 2 string cvs dup stringwidth pop 3 -1 roll exch 0.5 mul sub r Del tr mul mul 4 add moveto show} for 0 1 P 1 sub { dup 0.5 add tc Q idiv r Del mul mul mul r Del tr mul mul sub neg exch 2 string cvs dup stringwidth pop 4 add neg 3 -1 roll 5 sub moveto show} for 0 0 moveto s Del tc mul mul r Del tr mul mul 1 1 Grid end %%EndDocument @endspecial 442 2141 a(\(b\))16 b(Global)g(blo)q(c)o(ks,)g(\()p Ff(B)s(;)8 b(D)q Fm(\),)16 
b(in)g(eac)o(h)g(pro)q(cess,)g(\()p Ff(p;)8 b(q)r Fm(\).)-57 2248 y(Figure)14 b(10:)21 b(Blo)q(c)o(k)14 b(cyclic)e(decomp)q(osition)i(of)h(a)g(36)8 b Fl(\002)g Fm(80)15 b(matrix)e(with)i(a)g(blo)q(c)o(k)f(size)g(of)g(4)8 b Fl(\002)g Fm(5,)15 b(on)o(to)g(a)g(3)8 b Fl(\002)g Fm(4)-57 2315 y(pro)q(cess)15 b(template.)j(Eac)o(h)c(small)e(rectangle)h(represen)o(ts)g(one)h(matrix)f (blo)q(c)o(k)g({)h(individual)f(matrix)f(elemen)o(ts)-57 2381 y(are)20 b(not)g(sho)o(wn.)32 b(In)20 b(\(a\),)g(shading)h(is)e(used)h(to)g (emphasize)e(the)h(pro)q(cess)i(template)c(that)k(is)e(p)q(erio)q(dically)-57 2447 y(stamp)q(ed)e(o)o(v)o(er)f(the)g(matrix,)f(and)j(eac)o(h)f(blo)q(c)o(k) f(is)h(lab)q(eled)f(with)h(the)g(pro)q(cess)h(to)f(whic)o(h)g(it)f(is)h (assigned.)24 b(In)-57 2513 y(\(b\),)18 b(eac)o(h)f(shaded)h(region)g(sho)o (ws)g(the)g(blo)q(c)o(ks)f(in)g(one)h(pro)q(cess,)g(and)h(is)e(lab)q(eled)g (with)h(the)f(corresp)q(onding)-57 2579 y(global)e(blo)q(c)o(k)g(indices.)k (In)c(b)q(oth)h(\014gures,)f(the)g(blac)o(k)f(rectangles)g(indicate)g(the)h (blo)q(c)o(ks)f(assigned)i(to)f(pro)q(cess)-57 2646 y(\(0)p Ff(;)8 b Fm(0\).)939 2825 y(23)p eop %%Page: 24 26 25 bop -57 1044 a @beginspecial @setspecial %%BeginDocument: fig3.4a.ps /Mydict 100 dict def Mydict begin /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def 
/iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 40 0 translate 0.65 0.65 scale /Helvetica findfont 11 scalefont setfont 0.85 setgray /r 4 def /s 5 def /tc 16 def /tr 12 def /P 3 def /Q 4 def /Del 6 def gsave newpath 0 0 moveto 480 0 rlineto 0 288 rlineto -480 0 rlineto closepath clip s Del 2 mul mul neg r Del mul translate newpath Q s Del mul mul P r Del mul mul neg moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 3 mul P r Del mul mul neg moveto Q s Del mul mul P r Del mul mul Box fill 0 0 moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 2 mul 0 moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 4 mul 0 moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul P r Del mul mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 3 mul P r Del mul mul moveto Q s Del mul mul P r Del mul mul Box fill 0 P r Del mul mul 2 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 2 mul P r Del mul mul 2 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 4 mul P r Del mul mul 2 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul P r Del mul mul 3 mul moveto Q s Del mul mul P r Del mul mul Box fill Q s Del mul mul 3 mul P r Del mul mul 3 mul moveto Q s Del mul mul P r Del mul mul Box fill 0 setgray 0 P r Del mul mul neg moveto Q s Del mul mul P r Del mul mul 5 5 Grid grestore 0 setgray 0 0 moveto s Del mul r Del mul tc tr Grid 2 setlinewidth 0 0 moveto s Del tc mul mul r Del tr mul mul 1 1 Grid 0 0 moveto [ (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) 
(0,1) (0,2) (0,3) (0,0) (0,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) ] s Del mul r Del mul tc tr GridDecomp 0 1 3 { /iy exch def 0 1 3 { /ix exch def 0 setgray ix Q mul 2 add s Del mul mul iy P mul r Del mul mul moveto s Del mul r Del mul Box fill 1 setgray ix Q mul 2.5 add s Del mul mul (0,0) stringwidth pop 2 div sub iy P mul 0.5 add r Del mul mul yshift sub moveto (0,0) show } for } for 0 setgray /Helvetica findfont 15 scalefont setfont 2 setlinewidth 0 r Del tr mul mul moveto 0 25 rlineto 0 -25 rmoveto -25 0 rlineto stroke (p,q) dup stringwidth pop 4 add neg r Del mul tr mul 7 add moveto show (D) dup stringwidth pop s Del mul tc mul exch sub 2 div r Del mul tr mul 15 add moveto show r Del tr mul mul 2 div 3 add 7 sub -25 exch moveto (B) show 0 1 tc 1 sub { dup 0.5 add s Del mul mul exch 2 string cvs dup stringwidth pop 3 -1 roll exch 0.5 mul sub r Del tr mul mul 4 add moveto show} for 0 1 tr 1 sub { dup 0.5 add r Del mul mul r Del tr mul mul sub neg exch 2 string cvs dup stringwidth pop 4 add neg 3 -1 roll 5 sub moveto show} for end %%EndDocument @endspecial 253 1123 a Fm(\(a\))17 b(Assignmen)o(t)d(of)j(global)f(blo)q(c)o (k)g(indices,)f(\()p 
Ff(B)s(;)8 b(D)q Fm(\),)16 b(to)g(pro)q(cesses,)h(\()p Ff(p;)8 b(q)r Fm(\).)-57 2269 y @beginspecial @setspecial %%BeginDocument: fig3.4b.ps /Mydict 100 dict def Mydict begin /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 0 0 translate 0.65 0.65 scale /Helvetica findfont 11 scalefont setfont 0.85 setgray /r 4 def /s 5 def /tc 20 def /tr 15 def /P 3 def /Q 4 def /Del 6 def 0 0 moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul 2 mul 0 moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul tc Q idiv r Del mul mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul 3 mul tc Q idiv r Del mul mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill 0 tc Q idiv r Del mul mul 2 mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill tr P idiv s Del mul mul 2 mul tc Q idiv r Del mul mul 2 mul moveto tr P idiv s Del mul mul tc Q idiv r Del mul mul Box fill 0 setgray 0 0 moveto s Del mul r Del mul tc tr Grid 0 0 moveto 
[ (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) % end of first row
(\320) (2,2) (2,6) (2,10) (2,14) (\320) (2,3) (2,7) (2,11) (2,15) (2,0) (2,4) (2,8) (2,12) (\320) (2,1) (2,5) (2,9) (2,13) (\320) % end of second row
(\320) (5,2) (5,6) (5,10) (5,14) (\320) (5,3) (5,7) (5,11) (5,15) (5,0) (5,4) (5,8) (5,12) (\320) (5,1) (5,5) (5,9) (5,13) (\320) % end of third row
(\320) (8,2) (8,6) (8,10) (8,14) (\320) (8,3) (8,7) (8,11) (8,15) (8,0) (8,4) (8,8) (8,12) (\320) (8,1) (8,5) (8,9) (8,13) (\320) % end of fourth row
(\320) (11,2) (11,6) (11,10) (11,14) (\320) (11,3) (11,7) (11,11) (11,15) (11,0) (11,4) (11,8) (11,12) (\320) (11,1) (11,5) (11,9) (11,13) (\320) % end of fifth row
(\320) (0,2) (0,6) (0,10) (0,14) (\320) (0,3) (0,7) (0,11) (0,15) (0,0) (0,4) (0,8) (0,12) (\320) (0,1) (0,5) (0,9) (0,13) (\320) % end of sixth row
(\320) (3,2) (3,6) (3,10) (3,14) (\320) (3,3) (3,7) (3,11) (3,15) (3,0) (3,4) (3,8) (3,12) (\320) (3,1) (3,5) (3,9) (3,13) (\320) % end of seventh row
(\320) (6,2) (6,6) (6,10) (6,14) (\320) (6,3) (6,7) (6,11) (6,15) (6,0) (6,4) (6,8) (6,12) (\320) (6,1) (6,5) (6,9) (6,13) (\320) % end of eighth row
(\320) (9,2) (9,6) (9,10) (9,14) (\320) (9,3) (9,7) (9,11) (9,15) (9,0) (9,4) (9,8) (9,12) (\320) (9,1) (9,5) (9,9) (9,13) (\320) % end of ninth row
(\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) % end of tenth row
(\320) (1,2) (1,6) (1,10) (1,14) (\320) (1,3) (1,7) (1,11) (1,15) (1,0) (1,4) (1,8) (1,12) (\320) (1,1) (1,5) (1,9) (1,13) (\320) % end of eleventh row
(\320) (4,2) (4,6) (4,10) (4,14) (\320) (4,3) (4,7) (4,11) (4,15) (4,0) (4,4) (4,8) (4,12) (\320) (4,1) (4,5) (4,9) (4,13) (\320) % end of twelfth row
(\320) (7,2) (7,6) (7,10) (7,14) (\320) (7,3) (7,7) (7,11) (7,15) (7,0) (7,4) (7,8) (7,12) (\320) (7,1) (7,5) (7,9) (7,13) (\320) % end of thirteenth row
(\320)
(10,2) (10,6) (10,10) (10,14) (\320) (10,3) (10,7) (10,11) (10,15) (10,0) (10,4) (10,8) (10,12) (\320) (10,1) (10,5) (10,9) (10,13) (\320) % end of fourteenth row (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) (\320) % end of fifteenth row ] s Del mul r Del mul tc tr GridDecomp s Del mul r Del mul 10 mul moveto s Del 4 mul mul r Del 4 mul mul Box fill 1 setgray s Del mul r Del mul 10 mul moveto [ (2,2) (2,6) (2,10) (2,14) (5,2) (5,6) (5,10) (5,14) (8,2) (8,6) (8,10) (8,14) (11,2) (11,6) (11,10) (11,14) ] s Del mul r Del mul 4 4 GridDecomp 1 1 3 { 1 add s Del mul mul r Del mul 10 mul moveto 0 r Del mul 4 mul rlineto } for stroke 1 1 3 { 10 add r Del mul mul s Del mul exch moveto s Del mul 4 mul 0 rlineto } for stroke 0 setgray s Del mul r Del mul 10 mul moveto 4 s Del mul mul 4 r Del mul mul Box stroke /Helvetica findfont 15 scalefont setfont 0 r Del tr mul mul moveto 0 25 rlineto 0 -25 rmoveto -25 0 rlineto stroke (B,D) dup stringwidth pop 3 add neg r Del mul tr mul 6 add moveto show (q) dup stringwidth pop s Del mul tc mul exch sub 2 div r Del mul tr mul 15 add moveto show r Del tr mul mul 2 div 3 add 7 sub -25 exch moveto (p) show 0 1 Q 1 sub { dup 0.5 add tr P idiv s Del mul mul mul exch 2 string cvs dup stringwidth pop 3 -1 roll exch 0.5 mul sub r Del tr mul mul 4 add moveto show} for 0 1 P 1 sub { dup 0.5 add tc Q idiv r Del mul mul mul r Del tr mul mul sub neg exch 2 string cvs dup stringwidth pop 4 add neg 3 -1 roll 5 sub moveto show} for 2 setlinewidth 0 0 moveto s Del tc mul mul r Del tr mul mul 1 1 Grid end %%EndDocument @endspecial 442 2348 a(\(b\))16 b(Global)g(blo)q(c)o(ks,)g(\()p Ff(B)s(;)8 b(D)q Fm(\),)16 b(in)g(eac)o(h)g(pro)q(cess,)g(\()p Ff(p;)8 b(q)r Fm(\).)-57 2456 y(Figure)17 b(11:)25 b(The)18 b(same)f(matrix)f(decomp)q(osition)h(as)h(sho)o(wn)h(in)e(Figure)h(10,)g(but) g(for)g(a)g(template)e(o\013set)j(of)-57 2522 y(\()p Ff(p)-14 2529 y Fe(0)6 2522 y Ff(;)8 
b(q)50 2529 y Fe(0)69 2522 y Fm(\))20 b(=)h(\(1)p Ff(;)8 b Fm(2\).)33 b(Dashed)20 b(en)o(tries)f(in)h(\(b\))g (indicate)f(that)h(the)g(blo)q(c)o(k)f(do)q(es)i(not)f(con)o(tain)g(an)o(y)g (data.)33 b(In)-57 2588 y(b)q(oth)17 b(\014gures,)f(the)g(blac)o(k)g (rectangles)g(indicate)f(the)h(blo)q(c)o(ks)g(assigned)h(to)f(pro)q(cess)h (\(0)p Ff(;)8 b Fm(0\).)939 2825 y(24)p eop %%Page: 25 27 26 bop -57 1051 a @beginspecial @setspecial %%BeginDocument: fig3.5a.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 7 18 mul moveto 10 18 mul 3 18 mul Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (2,0) (2,0) 
(2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 33 1089 a Fm(\(a\))16 b Ff(r)f Fm(=)f(3,)i Ff(s)e Fm(=)g(10,)j Ff(P)k Fm(=)13 b(4,)k Ff(Q)c Fm(=)h(1)1128 1051 y @beginspecial @setspecial %%BeginDocument: fig3.5b.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 0 moveto 0 1 18 mul moveto 10 18 mul 1 18 mul Box fill 0 5 18 mul moveto 10 18 mul 1 18 mul Box fill 0 9 18 mul moveto 10 18 mul 1 18 mul Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) 
(0,0) (0,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (2,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (3,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (0,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) (1,0) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 1216 1089 a(\(b\))j Ff(r)e Fm(=)f(1,)i Ff(s)e Fm(=)f(10,)k Ff(P)k Fm(=)14 b(4,)i Ff(Q)d Fm(=)h(1)-57 2028 y @beginspecial @setspecial %%BeginDocument: fig3.5c.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 0 18 mul moveto 3 18 mul 10 18 mul Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) 
(0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 34 2065 a(\(c\))i Ff(r)f Fm(=)f(10,)i Ff(s)e Fm(=)g(3,)i Ff(P)21 b Fm(=)14 b(1,)i Ff(Q)e Fm(=)g(4)1128 2028 y @beginspecial @setspecial %%BeginDocument: fig3.5d.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 18 mul 0 moveto 1 18 mul 10 18 mul Box fill 4 18 mul 0 moveto 1 18 mul 10 18 mul Box fill 8 18 mul 0 moveto 1 18 mul 10 18 mul Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) 
(0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 1216 2065 a(\(d\))j Ff(r)e Fm(=)f(10,)i Ff(s)e Fm(=)g(1,)i Ff(P)21 b Fm(=)14 b(1,)i Ff(Q)d Fm(=)h(4)-57 2173 y(Figure)22 b(12:)33 b(These)22 b(4)g(\014gures)h(sho)o(w)f(di\013eren)o(t)f (w)o(a)o(ys)h(of)h(decomp)q(osing)e(a)i(10)15 b Fl(\002)g Fm(10)23 b(matrix.)37 b(Eac)o(h)22 b(cell)-57 2239 y(represen)o(ts)14 b(a)i(matrix)d(elemen)o(t,)f(and)j(is)g(lab)q(eled)g(b)o(y)f(the)h(p)q (osition,)g(\()p Ff(p;)8 b(q)r Fm(\),)15 b(in)f(the)h(template)e(of)j(the)e (pro)q(cess)-57 2306 y(to)j(whic)o(h)f(it)f(is)i(assigned.)22 b(T)l(o)17 b(emphasize)e(the)h(pattern)g(of)h(decomp)q(osition,)e(the)h (matrix)f(en)o(tries)g(assigned)-57 2372 y(to)h(the)g(pro)q(cess)h(in)f(the)f (\014rst)i(ro)o(w)f(and)h(column)d(of)j(the)e(template)f(are)i(sho)o(wn)h (shaded,)f(and)h(eac)o(h)f(separate)-57 2438 y(shaded)g(region)f(represen)o (ts)f(a)h(matrix)e(blo)q(c)o(k.)21 b(Figures)14 b(\(a\))i(and)f(\(b\))g(sho)o (w)h(blo)q(c)o(k)e(and)i(cyclic)c(ro)o(w-orien)o(ted)-57 2504 y(decomp)q(ositions,)24 b(resp)q(ectiv)o(ely)l(,)e(for)i(4)g(no)q(des.)43 b(In)23 b(\014gures)h(\(c\))f(and)h(\(d\))g(the)f(corresp)q(onding)h(column-) -57 2570 y(orien)o(ted)d(decomp)q(ositions)h(are)g(sho)o(wn.)39 b(Belo)o(w)21 b(eac)o(h)h(\014gure)g(w)o(e)g(giv)o(e)f(the)h(v)m(alues)g(of)g Ff(r)q Fm(,)h Ff(s)p Fm(,)g Ff(P)7 b Fm(,)24 b(and)e Ff(Q)-57 2637 y Fm(corresp)q(onding)17 b(to)g(the)f(decomp)q(osition.)k(In)c(all)g (cases)g Ff(p)1015 2644 y Fe(0)1049 2637 y Fm(=)e Ff(q)1123 2644 y Fe(0)1156 2637 y Fm(=)g(0.)939 2825 y(25)p eop %%Page: 26 28 27 bop -57 1217 a @beginspecial @setspecial %%BeginDocument: 
fig3.6a.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 7 18 mul moveto 3 18 mul dup Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (1,0) (1,0) (1,0) (1,1) (1,1) (1,1) (1,2) (1,2) (1,2) (1,3) (1,0) (1,0) (1,0) (1,1) (1,1) (1,1) (1,2) (1,2) (1,2) (1,3) (1,0) (1,0) (1,0) (1,1) (1,1) (1,1) (1,2) (1,2) (1,2) (1,3) (2,0) (2,0) (2,0) (2,1) (2,1) (2,1) (2,2) (2,2) (2,2) (2,3) (2,0) (2,0) (2,0) (2,1) (2,1) (2,1) (2,2) (2,2) (2,2) (2,3) (2,0) (2,0) (2,0) (2,1) (2,1) (2,1) (2,2) (2,2) (2,2) (2,3) (3,0) (3,0) (3,0) (3,1) (3,1) (3,1) (3,2) (3,2) (3,2) (3,3) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 45 1254 a Fm(\(a\))17 b Ff(r)e Fm(=)e(3,)k Ff(s)c Fm(=)h(3,)i Ff(P)21 b Fm(=)14 b(4,)i Ff(Q)e Fm(=)g(4)1128 1217 y @beginspecial @setspecial %%BeginDocument: fig3.6b.ps /Box { 
/height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 7 18 mul moveto 1 18 mul 3 18 mul Box fill 4 18 mul 7 18 mul moveto 1 18 mul 3 18 mul Box fill 8 18 mul 7 18 mul moveto 1 18 mul 3 18 mul Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (3,0) (3,1) (3,2) (3,3) (3,0) (3,1) (3,2) (3,3) (3,0) (3,1) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 1229 1254 a(\(b\))i Ff(r)f Fm(=)f(3,)i Ff(s)e Fm(=)g(1,)i Ff(P)21 b Fm(=)14 b(4,)i Ff(Q)d 
Fm(=)h(4)-57 2134 y @beginspecial @setspecial %%BeginDocument: fig3.6c.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 1 18 mul moveto 3 18 mul 1 18 mul Box fill 0 5 18 mul moveto 3 18 mul 1 18 mul Box fill 0 9 18 mul moveto 3 18 mul 1 18 mul Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (1,0) (1,0) (1,0) (1,1) (1,1) (1,1) (1,2) (1,2) (1,2) (1,3) (2,0) (2,0) (2,0) (2,1) (2,1) (2,1) (2,2) (2,2) (2,2) (2,3) (3,0) (3,0) (3,0) (3,1) (3,1) (3,1) (3,2) (3,2) (3,2) (3,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (1,0) (1,0) (1,0) (1,1) (1,1) (1,1) (1,2) (1,2) (1,2) (1,3) (2,0) (2,0) (2,0) (2,1) (2,1) (2,1) (2,2) (2,2) (2,2) (2,3) (3,0) (3,0) (3,0) (3,1) (3,1) (3,1) (3,2) (3,2) (3,2) (3,3) (0,0) (0,0) (0,0) (0,1) (0,1) (0,1) (0,2) (0,2) (0,2) (0,3) (1,0) (1,0) (1,0) (1,1) (1,1) (1,1) (1,2) (1,2) (1,2) (1,3) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 46 2171 a(\(c\))i 
Ff(r)f Fm(=)f(1,)i Ff(s)e Fm(=)g(3,)i Ff(P)21 b Fm(=)14 b(4,)i Ff(Q)e Fm(=)f(4)1128 2134 y @beginspecial @setspecial %%BeginDocument: fig3.6d.ps /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /yshift 3.5 def /GridDecomp { /ny exch def /nx exch def /dely exch def /delx exch def /Darray exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def /ix -1 def /iy ny 1 sub def Darray{ /Text exch def /ix ix 1 add def ix nx ge { /ix 0 def /iy iy 1 sub def} if /x xpos delx ix 0.5 add mul add Text stringwidth pop 2 div sub def /y ypos dely iy 0.5 add mul add yshift sub def x y moveto Text show } forall } def 2 setlinecap 12.6 10 translate /Helvetica findfont 10 scalefont setfont 0.8 setgray 0 1 18 mul moveto 18 18 Box fill 0 5 18 mul moveto 18 18 Box fill 0 9 18 mul moveto 18 18 Box fill 4 18 mul 1 18 mul moveto 18 18 Box fill 4 18 mul 5 18 mul moveto 18 18 Box fill 4 18 mul 9 18 mul moveto 18 18 Box fill 8 18 mul 1 18 mul moveto 18 18 Box fill 8 18 mul 5 18 mul moveto 18 18 Box fill 8 18 mul 9 18 mul moveto 18 18 Box fill 0 setgray 0 0 moveto 18 18 10 10 Grid 0 0 moveto [ (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) (3,0) (3,1) (3,2) (3,3) (3,0) (3,1) (3,2) (3,3) (3,0) (3,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (2,0) (2,1) (2,2) (2,3) (2,0) (2,1) 
(2,2) (2,3) (2,0) (2,1) (3,0) (3,1) (3,2) (3,3) (3,0) (3,1) (3,2) (3,3) (3,0) (3,1) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (0,2) (0,3) (0,0) (0,1) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) (1,2) (1,3) (1,0) (1,1) ] 18 18 10 10 GridDecomp %%EndDocument @endspecial 1229 2171 a(\(d\))j Ff(r)f Fm(=)f(1,)i Ff(s)e Fm(=)g(1,)i Ff(P)21 b Fm(=)14 b(4,)i Ff(Q)d Fm(=)h(4)-57 2279 y(Figure)f(13:)21 b(These)14 b(4)g(\014gures)g(sho)o(w)g(di\013eren)o(t)f(w)o (a)o(ys)g(of)h(decomp)q(osing)g(a)g(10)6 b Fl(\002)g Fm(10)14 b(matrix)e(o)o(v)o(er)h(16)h(pro)q(cesses)-57 2345 y(arranged)f(as)h(a)e(4)s Fl(\002)s Fm(4)i(template.)k(Belo)o(w)11 b(eac)o(h)h(\014gure)h(w)o(e)f(giv)o (e)g(the)g(v)m(alues)g(of)h Ff(r)q Fm(,)g Ff(s)p Fm(,)g Ff(P)7 b Fm(,)13 b(and)g Ff(Q)f Fm(corresp)q(onding)-57 2411 y(to)17 b(the)f(decomp)q(osition.)k(In)c(all)g(cases)g Ff(p)702 2418 y Fe(0)736 2411 y Fm(=)e Ff(q)810 2418 y Fe(0)843 2411 y Fm(=)g(0.)939 2825 y(26)p eop %%Page: 27 29 28 bop -57 975 a @beginspecial @setspecial %%BeginDocument: exchange.ps /arrowdict 13 dict def % Local storage for the procedure % ``arrow.'' /arrow % The procedure ``arrow'' adds an { arrowdict begin % arrow shape to the current path. /headlength exch def % It takes seven arguments: the x /halfheadthickness exch 2 div def % and y coordinates of the tail /halfthickness exch 2 div def % (imagine that a line has been /tipy exch def /tipx exch def % drawn down the center of the /taily exch def /tailx exch def % arrow from the tip to the tail, % then x and y lie on this line), % the x and y coordinates of the % tip of the arrow, the thickness % of the arrow in the tail % portion, the thickness of the % arrow at the widest part of the % arrowhead and the length of the % arrowhead. /dx tipx tailx sub def % Compute the differences in x and /dy tipy taily sub def % y for the tip and tail. 
These /arrowlength dx dx mul dy dy mul add % will be used to compute the sqrt def % length of the arrow and to /angle dy dx atan def % compute the angle of direction % that the arrow is facing with % respect to the current user % coordinate system origin. /base arrowlength headlength sub def % Compute where the base of the % arrowhead will be. /savematrix matrix currentmatrix def % Save the current user coordinate % system. We are using the same % strategy to localize the effect % of transformations as was used % in the program to draw an % ellipse. tailx taily translate % Translate to the starting point % of the tail. angle rotate % Rotate the x-axis to correspond % with the center line of the % arrow. 0 halfthickness neg moveto % Add the arrow shape to the % current path. base halfthickness neg lineto base halfheadthickness neg lineto arrowlength 0 lineto base halfheadthickness lineto base halfthickness lineto 0 halfthickness lineto closepath savematrix setmatrix % Restore the current user % coordinate system. 
end } def /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /Gridbox { /ny exch def /nx exch def /dely exch def /delx exch def /ypos exch def /xpos exch def /leny { ny dely mul} def /lenx { nx delx mul} def xpos ypos moveto [2 2] 0 setdash delx dely nx ny Grid newpath xpos ypos moveto [] 0 setdash lenx leny Box stroke } def /Circle { 0 360 arc } def /Ndots { /crad exch def /csep exch def /ndots exch def currentpoint /ymid exch def /xmid exch def 1 1 ndots { newpath xmid ymid crad Circle fill /xmid xmid csep add def} for } def /Cgrid { /crad exch def /ny exch def /nx exch def /dely exch def /delx exch def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny 1 sub{ pop 0 1 nx 1 sub{ pop newpath x y crad Circle fill /x x delx add def} for /x xpos def /y y dely add def} for } def /PaintCircle { /lh exch def /crad exch def /ymid exch def /xmid exch def newpath xmid ymid crad Circle gsave 1 setgray fill grestore stroke xmid ymid moveto dup stringwidth pop 2 div neg lh neg rmoveto show } def /GridSym { /isymbol exch def /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def /dx3 delx 3 div def /dy3 dely 3 div def 1 1 nx { pop /y ypos def 1 1 ny { pop isymbol 1 eq { newpath x delx 2 div add y dely 2 div add delx 3 div Csym gsave 1.0 setgray fill grestore stroke } if isymbol 2 eq { x y PlusSym } if isymbol 3 eq { x y CrossSym } if isymbol 4 eq { x dx3 2 div add y dy3 2 div add dx3 2 mul dy3 2 mul TriSym gsave 1.0 setgray fill grestore 
stroke } if isymbol 5 eq { newpath x delx 2 div add y dely 2 div add delx 3 div Csym fill } if isymbol 6 eq { x dx3 2 div add y dy3 2 div add dx3 2 mul dy3 2 mul TriSym fill } if isymbol 7 eq { 2 copy PlusSym CrossSym } if isymbol 8 eq { x y RectSym gsave 1.0 setgray fill grestore stroke } if isymbol 9 eq { x y RectSym fill } if isymbol 10 eq { newpath x delx 2 div add y dely 2 div add delx 5 div gray Cfillsym stroke } if /y y dely add def} for /x x delx add def} for } def /PlusSym { newpath moveto delx 2 div 0 rmoveto 0 dely rlineto delx 2 div neg dely 2 div neg rmoveto delx 0 rlineto stroke } def /RectSym { newpath moveto delx 0 rlineto 0 dely rlineto delx neg 0 rlineto closepath } def /CrossSym { newpath moveto delx dely rlineto delx neg 0 rmoveto delx dely neg rlineto stroke } def /TriSym { /ddy exch def /ddx exch def newpath moveto ddx 0 rlineto ddx 2 div neg ddy rlineto closepath } def /Csym % stack: xcen ycen radius => ??? Draws circle centered on (xcen ycen) { 0 360 arc } def /Cfillsym % stack: xcen ycen radius gray => ??? 
Draws shaded circle centered % on (xcen ycen) { /gray exch def 0 360 arc gsave gray setgray fill grestore } def /dwdict 100 dict def dwdict begin 1.5 setlinewidth 110 40 translate 0.8 dup scale /Size 210 def /Bwid 35 def /Ewid 105 def Size Bwid Ewid add sub 0 moveto 0 Ewid Bwid add rlineto Ewid Bwid add 0 rlineto Size Ewid sub 0 moveto 0 Ewid Bwid add rlineto Size Ewid moveto Ewid neg 0 rlineto stroke Size Ewid Bwid add sub Ewid Bwid add moveto 0 Size lineto 0 0 moveto Size Size Box stroke clear /Helvetica findfont 25 scalefont setfont /SBwid {Size Bwid Ewid add sub} def SBwid (L) stringwidth pop sub 2 div Size 2 div 20 sub moveto (L) show Size (U) stringwidth pop sub 2 div SBwid 2 div 5 sub Bwid Ewid add add moveto (U) show Size Ewid sub Bwid sub 14 add 0 moveto 7 Ewid Bwid add 14 sub Box gsave 0.8 setgray fill grestore 1 setlinewidth 0 Ewid Bwid add 14 sub moveto Size 0 rlineto 0 Ewid Bwid add 21 sub moveto Size 0 rlineto stroke 0 Ewid Bwid add 70 sub moveto Size 0 rlineto 0 Ewid Bwid add 70 sub 7 sub moveto Size 0 rlineto stroke Size Ewid sub Bwid sub 14 add 0 moveto 0 Ewid Bwid add 14 sub rlineto Size Ewid sub Bwid sub 21 add 0 moveto 0 Ewid Bwid add 14 sub rlineto stroke Size Ewid sub Bwid sub 14 add 3.5 add -18 Size Ewid sub Bwid sub 14 add 3.5 add -3 2 7 6 arrow fill clear /Helvetica findfont 15 scalefont setfont (column) stringwidth pop Size Ewid sub Bwid sub 14 add 3.5 add exch 2 div sub -30 moveto (column) show (kr+i) stringwidth pop Size Ewid sub Bwid sub 14 add 3.5 add exch 2 div sub -42 moveto (kr+i) show -18 Ewid Bwid add 14 sub 3.5 sub -3 Ewid Bwid add 14 sub 3.5 sub 2 7 6 arrow fill (kr+i) stringwidth pop -22 exch 2 div sub dup dup (kr+i) stringwidth pop 2 div sub Ewid Bwid add 14 sub 12 sub moveto (kr+i) show (row) stringwidth pop 2 div sub Ewid Bwid add 14 sub 0 sub moveto (row) show -18 Ewid Bwid add 70 sub 3.5 sub -3 Ewid Bwid add 70 sub 3.5 sub 2 7 6 arrow fill (pivot) stringwidth pop -22 exch 2 div sub dup dup (pivot) stringwidth pop 2 div sub 
Ewid Bwid add 70 sub 0 sub moveto (pivot) show (row) stringwidth pop 2 div sub Ewid Bwid add 70 sub 12 sub moveto (row) show Size 18 add Ewid Bwid add 14 sub 3.5 sub Size 3 add Ewid Bwid add 14 sub 3.5 sub 2 7 6 arrow fill Size 18 add Ewid Bwid add 70 sub 3.5 sub Size 3 add Ewid Bwid add 70 sub 3.5 sub 2 7 6 arrow fill Size 18 add Ewid Bwid add 70 sub 3.5 sub 1 sub moveto 2 58 Box fill (exchange) stringwidth pop Size 23 add exch 2 div add dup dup (exchange) stringwidth pop 2 div sub Ewid Bwid add 42 sub 0 sub moveto (exchange) show (rows) stringwidth pop 2 div sub Ewid Bwid add 54 sub 0 sub moveto (rows) show Size Ewid sub Bwid sub Ewid Bwid add moveto 14 -14 rlineto stroke clear end %%EndDocument @endspecial 108 x Fm(Figure)23 b(14:)36 b(This)24 b(\014gure)f(sho)o(ws)i (piv)o(oting)e(for)g(step)h Ff(i)f Fm(of)g(the)h Ff(k)r Fm(th)f(stage)h(of)g (LU)f(factorization.)43 b(The)-57 1149 y(elemen)o(t)15 b(with)i(largest)h (absolute)g(v)m(alue)f(in)g(the)g(gra)o(y)h(shaded)g(part)g(of)g(column)e Ff(k)r(r)e Fm(+)d Ff(i)18 b Fm(is)f(found,)h(and)g(the)-57 1215 y(ro)o(w)f(con)o(taining)f(it)g(is)h(exc)o(hanged)f(with)g(ro)o(w)h Ff(k)r(r)12 b Fm(+)g Ff(i)p Fm(.)21 b(If)16 b(the)h(ro)o(ws)g(exc)o(hanged)f (lie)f(in)h(di\013eren)o(t)g(pro)q(cesses,)-57 1281 y(comm)o(unic)o(ation)e (ma)o(y)h(b)q(e)h(necessary)l(.)-57 1424 y(The)23 b(solution)h(of)g(the)f(lo) o(w)o(er)g(triangular)h(system)e Ff(L)968 1431 y Fe(0)987 1424 y Ff(U)1020 1431 y Fe(1)1066 1424 y Fm(=)k Ff(C)h Fm(to)d(ev)m(aluate)f(the)h Ff(k)r Fm(th)f(blo)q(c)o(k)g(ro)o(w)h(of)g Ff(U)-57 1490 y Fm(in)o(v)o(olv)o(es)15 b(a)j(single)e(ro)o(w)i(of)f(blo)q(c)o(ks,)g(and)h (these)e(lie)g(in)h(a)h(single)e(ro)o(w)i(of)f(the)g(pro)q(cess)h(template.)k (If)17 b(a)g(cyclic)-57 1556 y(ro)o(w)e(decomp)q(osition)f(is)h(used,)g(lik)o (e)e(that)i(sho)o(wn)h(in)e(Figure)h(12\(b\),)g(only)g(one)g(pro)q(cessor)h (is)f(in)o(v)o(olv)o(ed)e(in)h(the)-57 1623 y(triangular)i(solv)o(e,)e(and)i 
(no)g(comm)o(unicati)o(on)d(is)j(necessary)f(b)q(et)o(w)o(een)f(the)h(pro)q (cesses.)22 b(Ho)o(w)o(ev)o(er,)13 b(in)i(general)-57 1689 y Ff(Q)23 b Fm(pro)q(cesses)g(are)g(in)o(v)o(olv)o(ed,)g(and)g(comm)o (unication)d(is)j(necessary)g(to)g(broadcast)i(the)d(lo)o(w)o(er)h (triangular)-57 1755 y(matrix,)g Ff(L)157 1762 y Fe(0)177 1755 y Fm(,)i(to)f(all)f(pro)q(cesses)h(in)f(the)h(ro)o(w.)43 b(Once)23 b(this)g(has)h(b)q(een)g(done,)h(eac)o(h)e(pro)q(cess)h(in)f(the)h(ro)o(w)-57 1821 y(indep)q(enden)o(tly)15 b(p)q(erforms)g(a)i(lo)o(w)o(er)e(triangular)i (solv)o(e)e(for)i(the)f(blo)q(c)o(ks)g(of)g Ff(C)k Fm(that)d(it)f(holds.)-57 1909 y(The)d(comm)o(unic)o(ation)e(necessary)h(to)h(up)q(date)h(the)e (trailing)h(submatrix)e(at)j(step)e Ff(k)j Fm(tak)o(es)e(place)f(in)h(t)o(w)o (o)f(steps.)-57 1975 y(First,)i(eac)o(h)h(pro)q(cess)h(holding)f(part)g(of)h Ff(L)716 1982 y Fe(1)751 1975 y Fm(broadcasts)g(these)f(blo)q(c)o(ks)g(to)g (the)g(other)g(pro)q(cesses)h(in)e(the)h(same)-57 2041 y(ro)o(w)22 b(of)f(the)g(template.)35 b(This)21 b(ma)o(y)f(b)q(e)h(done)h(in)f (conjunction)g(with)g(the)g(broadcast)i(of)e Ff(L)1706 2048 y Fe(0)1726 2041 y Fm(,)h(men)o(tioned)-57 2107 y(in)c(the)g(preceding)f (paragraph,)j(so)f(that)f(all)g(of)g(the)g(factored)g(panel)g(is)g(broadcast) h(together.)27 b(Next,)18 b(eac)o(h)-57 2174 y(pro)q(cess)f(holding)f(part)g (of)h Ff(U)480 2181 y Fe(1)516 2174 y Fm(broadcasts)g(these)f(blo)q(c)o(ks)g (to)g(the)g(other)g(pro)q(cesses)h(in)e(the)h(same)f(column)g(of)-57 2240 y(the)k(template.)29 b(Eac)o(h)19 b(pro)q(cess)h(can)g(then)f(complete)e (the)i(up)q(date)h(of)g(the)f(blo)q(c)o(ks)g(that)h(it)f(holds)g(with)h(no) -57 2306 y(further)c(comm)o(unic)o(ation.)-57 2394 y(A)f(pseudo)q(co)q(de)h (outline)f(of)g(the)g(parallel)g(LU)g(factorization)g(algorithm)f(is)h(giv)o (en)g(in)g(Figure)f(15.)22 b(There)15 b(are)-57 2460 y(t)o(w)o(o)i(p)q(oin)o (ts)g(w)o(orth)g(noting)h(in)e(Figure)h(15.)24 b(First,)16 
b(the)h(triangular)g(solv)o(e)f(and)i(up)q(date)f(phases)h(op)q(erate)g(on) -57 2526 y(matrix)10 b(blo)q(c)o(ks)i(and)h(ma)o(y)l(,)e(therefore,)h(b)q(e)g (done)h(with)f(parallel)f(v)o(ersions)h(of)h(the)e(Lev)o(el)g(3)i(BLAS)f (\(sp)q(eci\014cally)l(,)-57 2592 y(xTRSM)g(and)i(xGEMM,)e(resp)q(ectiv)o (ely\).)17 b(The)c(factorization)g(of)g(the)f(column)g(of)h(blo)q(c)o(ks,)f (ho)o(w)o(ev)o(er,)g(in)o(v)o(olv)o(es)-57 2659 y(a)24 b(lo)q(op)g(o)o(v)o (er)e(matrix)g(columns.)41 b(Hence,)23 b(is)h(it)e(not)i(a)g(blo)q(c)o (k-orien)o(ted)e(computation,)i(and)g(cannot)g(b)q(e)939 2825 y(27)p eop %%Page: 28 30 29 bop -57 125 a Fm(p)q(erformed)16 b(using)h(the)g(Lev)o(el)f(3)h(BLAS.)g (The)g(second)g(p)q(oin)o(t)g(to)h(note)f(is)g(that)g(most)g(of)g(the)g (parallelism)d(in)-57 191 y(the)g(co)q(de)h(comes)e(from)g(up)q(dating)i(the) g(trailing)e(submatrix)g(since)h(this)g(is)h(the)f(only)g(phase)h(in)f(whic)o (h)f(all)h(the)-57 257 y(pro)q(cesses)j(are)f(busy)l(.)-57 345 y(Figure)d(15)i(also)f(sho)o(ws)g(quite)f(clearly)f(where)i(comm)o(uni)o (cation)d(is)j(required;)f(namely)l(,)e(in)j(\014nding)g(the)f(piv)o(ot,)-57 411 y(exc)o(hanging)g(piv)o(ot)h(ro)o(ws,)g(and)g(p)q(erforming)f(v)m(arious) h(t)o(yp)q(es)g(of)g(broadcast.)22 b(The)13 b(exact)h(w)o(a)o(y)f(in)g(whic)o (h)g(these)-57 477 y(comm)o(unic)o(ations)f(are)i(done)g(and)h(in)o(terlea)o (v)o(ed)c(with)j(computation)g(generally)f(has)h(an)h(imp)q(ortan)o(t)e (e\013ect)h(on)-57 544 y(p)q(erformance,)h(and)h(will)g(b)q(e)g(discussed)g (in)g(more)f(detail)h(in)f(Section)h(7.)-57 631 y(Figure)e(15)h(refers)f(to)g (broadcasting)i(data)f(to)f(all)g(pro)q(cesses)h(in)f(the)g(same)f(ro)o(w)i (or)f(column)f(of)h(the)g(template.)-57 697 y(This)k(is)g(a)h(common)d(op)q (eration)j(in)f(parallel)f(linear)h(algebra)g(algorithms,)g(so)g(the)g(idea)g (will)f(b)q(e)i(describ)q(ed)-57 764 y(here)c(in)h(a)g(little)e(more)g (detail.)21 b(Consider,)15 b(for)h(example,)e(the)h(task)h(of)g(broadcasting) 
h(the)f(lo)o(w)o(er)f(triangular)-57 830 y(blo)q(c)o(k,)f Ff(L)117 837 y Fe(0)137 830 y Fm(,)h(to)g(all)f(pro)q(cesses)i(in)e(the)h(same)f(ro)o (w)h(of)g(the)g(template,)e(as)i(required)f(b)q(efore)h(solving)g Ff(L)1793 837 y Fe(0)1813 830 y Ff(U)1846 837 y Fe(1)1879 830 y Fm(=)f Ff(C)t Fm(.)-57 896 y(If)e Ff(L)21 903 y Fe(0)54 896 y Fm(is)g(in)g(pro)q(cess)i(\()p Ff(p;)8 b(q)r Fm(\),)k(then)h(it)f(will)g(b) q(e)g(broadcast)i(to)f(all)f(pro)q(cesses)i(in)e(ro)o(w)h Ff(p)g Fm(of)g(the)f(pro)q(cess)h(template.)-57 962 y(As)i(a)h(second)g(example,)d (consider)i(the)h(broadcast)h(of)f Ff(L)998 969 y Fe(1)1033 962 y Fm(to)g(all)f(pro)q(cesses)h(in)f(the)h(same)e(template)g(ro)o(w,)h(as) -57 1028 y(required)g(b)q(efore)i(up)q(dating)h(the)e(trailing)h(submatrix.)k (This)c(t)o(yp)q(e)f(of)h(\\ro)o(w)o(cast")g(is)g(sho)o(wn)g(sc)o (hematically)-57 1095 y(in)j(Figure)g(16\(a\).)36 b(If)20 b Ff(L)408 1102 y Fe(1)449 1095 y Fm(is)g(in)g(column)f Ff(q)j Fm(of)f(the)g(template,)e(then)h(eac)o(h)h(pro)q(cess)g(\()p Ff(p;)8 b(q)r Fm(\))20 b(broadcasts)i(its)-57 1161 y(blo)q(c)o(ks)16 b(of)h Ff(L)180 1168 y Fe(1)217 1161 y Fm(to)g(the)f(other)h(pro)q(cesses)g (in)f(ro)o(w)h Ff(p)g Fm(of)g(the)f(template.)k(Lo)q(osely)d(sp)q(eaking,)g (w)o(e)f(can)h(sa)o(y)g(that)-57 1227 y Ff(L)-24 1234 y Fe(0)10 1227 y Fm(and)d Ff(L)135 1234 y Fe(1)168 1227 y Fm(are)g(broadcast)h(along)f (the)g(ro)o(ws)g(of)g(the)f(template.)18 b(This)c(t)o(yp)q(e)f(of)h(data)h (mo)o(v)o(em)o(en)o(t)10 b(is)k(the)f(same)-57 1293 y(as)k(that)g(p)q (erformed)e(b)o(y)h(the)h(F)l(ortran)g(90)g(routine)f(SPREAD)h([7].)k(The)c (broadcast)g(of)g Ff(U)1624 1300 y Fe(1)1661 1293 y Fm(to)f(all)g(pro)q (cesses)-57 1359 y(in)g(the)g(same)f(template)g(column)g(is)h(v)o(ery)f (similar.)k(This)e(t)o(yp)q(e)f(of)g(comm)o(unication)e(is)i(sometime)o(s)e (referred)-57 1426 y(to)j(as)f(a)h(\\colcast",)g(and)f(is)g(sho)o(wn)h(in)f (Figure)g(16\(b\).)-57 1623 y Fh(7)83 b(Optimization,)26 b(T)-7 b(uning,)28 b(and)f(T)-7 
b(rade-o\013s)-57 1760 y Fm(In)24 b(this)g(section,)i(w)o(e)e(shall)g(examine)e(tec)o(hniques)h(for)h (optimizing)f(the)h(basic)g(LU)h(factorization)f(co)q(de)-57 1826 y(presen)o(ted)16 b(in)h(Section)g(4.2.)24 b(Among)16 b(the)h(issues)h(to)f(b)q(e)g(considered)g(are)g(the)g(assignmen)o(t)f(of)i (pro)q(cesses)g(to)-57 1892 y(ph)o(ysical)11 b(pro)q(cessors,)i(the)e (arrangemen)o(t)g(of)h(the)f(data)h(in)f(the)h(lo)q(cal)f(memory)e(of)j(eac)o (h)f(pro)q(cess,)h(the)g(trade-o\013)-57 1958 y(b)q(et)o(w)o(een)f(load)h(im) o(balance)d(and)j(comm)o(unic)o(ation)d(latency)l(,)i(the)g(p)q(oten)o(tial)h (for)f(o)o(v)o(erlapping)g(comm)o(unication)-57 2025 y(and)21 b(calculation,)g(and)g(the)g(t)o(yp)q(e)f(of)h(algorithm)e(used)i(to)g (broadcast)h(data.)36 b(Man)o(y)20 b(of)h(these)f(issues)h(are)-57 2091 y(in)o(terdep)q(enden)o(t,)16 b(and)i(in)g(addition)g(the)f(p)q (ortabilit)o(y)g(and)i(ease)f(of)g(co)q(de)g(main)o(tenance)d(and)k(use)e(m)o (ust)g(b)q(e)-57 2157 y(considered.)33 b(F)l(or)20 b(further)g(details)g(of)h (the)f(optimization)e(of)j(parallel)f(LU)g(factorization)g(algorithms)g(for) -57 2223 y(sp)q(eci\014c)d(concurren)o(t)f(mac)o(hines,)f(together)i(with)g (timing)f(results,)g(the)h(reader)g(is)g(referred)f(to)i(the)f(w)o(ork)g(of) -57 2289 y(Ch)o(u)j(and)h(George)f([12],)h(Geist)e(and)i(Heath)f([34],)g (Geist)g(and)h(Romine)d([35],)i(V)l(an)g(de)g(V)l(elde)f([55],)h(Bren)o(t)-57 2356 y([8],)14 b(Hendric)o(kson)f(and)i(W)l(om)o(ble)d([39],)i(Lic)o(h)o (tenstein)f(and)h(Johnsson)i([47)q(],)d(and)i(Dongarra)h(and)f(co-w)o(ork)o (ers)-57 2422 y([10,)h(25)q(].)939 2825 y(28)p eop %%Page: 29 31 30 bop -57 125 a Fo(7.1)70 b(Mapping)23 b(Logical)f(Memory)h(to)g(Ph)n (ysical)f(Memory)-57 245 y Fm(In)14 b(Section)h(5,)g(a)g(logical)f(\(or)h (virtual\))f(matrix)f(decomp)q(osition)h(w)o(as)h(describ)q(ed)f(in)g(whic)o (h)g(the)h(global)g(index)-57 311 y(\()p Ff(m;)8 b(n)p Fm(\))23 b(is)h(mapp)q(ed)f(to)h(a)g(p)q(osition,)h(\()p Ff(p;)8 b(q)r Fm(\),)25 
b(in)e(a)h(logical)f(pro)q(cess)i(template,)e(a)h(p)q(osition,)h (\()p Ff(b;)8 b(d)p Fm(\),)25 b(in)f(a)-57 377 y(logical)c(arra)o(y)h(of)g (blo)q(c)o(ks)f(lo)q(cal)g(to)h(the)g(pro)q(cess,)g(and)h(a)f(p)q(osition,)g (\()p Ff(i;)8 b(j)s Fm(\),)21 b(in)f(a)h(logical)f(arra)o(y)h(of)g(matrix)-57 443 y(elemen)o(ts)15 b(lo)q(cal)i(to)h(the)f(blo)q(c)o(k.)25 b(Th)o(us,)18 b(the)f(blo)q(c)o(k)g(cyclic)e(decomp)q(osition)i(is)g(hierarc) o(hical,)f(and)i(attempts)-57 509 y(to)d(represen)o(t)f(the)h(hierarc)o (hical)e(memory)f(of)j(adv)m(anced-arc)o(hitecture)f(computers.)19 b(Although)c(the)g(parallel)-57 576 y(LU)j(factorization)h(algorithm)e(can)h (b)q(e)h(sp)q(eci\014ed)e(solely)h(in)g(terms)e(of)j(this)f(logical)g (hierarc)o(hical)e(memory)l(,)-57 642 y(its)g(p)q(erformance)f(dep)q(ends)i (on)f(ho)o(w)h(the)f(logical)g(memory)d(is)j(mapp)q(ed)g(to)g(ph)o(ysical)g (memory)l(.)-57 799 y Fi(7.1.1)55 b(Assignmen)n(t)18 b(of)h(Pro)r(cesses)e (to)i(Pro)r(cessors)-57 919 y Fm(Consider,)d(\014rst,)g(the)f(assignmen)o(t)g (of)h(pro)q(cesses,)g(\()p Ff(p;)8 b(q)r Fm(\),)16 b(to)g(ph)o(ysical)f(pro)q (cessors.)22 b(In)16 b(general,)f(more)f(than)-57 985 y(one)k(pro)q(cess)g (ma)o(y)d(b)q(e)j(assigned)g(to)g(a)f(pro)q(cessor,)i(so)f(the)f(problem)e (ma)o(y)h(b)q(e)i(o)o(v)o(erdecomp)q(osed.)23 b(T)l(o)18 b(a)o(v)o(oid)-57 1052 y(load)j(im)o(balance)e(the)i(same)f(n)o(um)o(b)q(er)f(of)j(pro)q (cesses)f(should)h(b)q(e)f(assigned)h(to)f(eac)o(h)g(pro)q(cessor)h(as)g (nearly)-57 1118 y(as)i(p)q(ossible.)41 b(If)22 b(this)h(condition)g(is)f (satis\014ed,)j(the)d(assignmen)o(t)g(of)h(pro)q(cesses)h(to)f(pro)q(cessors) h(can)f(still)-57 1184 y(a\013ect)18 b(p)q(erformance)f(b)o(y)g (in\015uencing)g(the)h(comm)o(unic)o(ation)e(o)o(v)o(erhead.)25 b(On)18 b(recen)o(t)f(distributed)g(memory)-57 1250 y(mac)o(hines,)j(suc)o(h) g(as)i(the)f(In)o(tel)e(Delta)i(and)g(CM-5,)h(the)f(time)e(to)i(send)g(a)h (single)e(message)g(b)q(et)o(w)o(een)g(t)o(w)o(o)-57 1316 y(pro)q(cessors)15 
b(is)f(largely)f(indep)q(enden)o(t)g(of)h(their)f(ph)o(ysical)g(lo)q(cation)h ([29,)g(48,)g(49)q(],)f(and)h(hence)f(the)h(assignmen)o(t)-57 1383 y(of)22 b(pro)q(cesses)h(to)f(pro)q(cessors)i(do)q(es)e(not)h(ha)o(v)o (e)e(m)o(uc)o(h)f(direct)h(e\013ect)h(on)g(p)q(erformance.)38 b(Ho)o(w)o(ev)o(er,)21 b(when)-57 1449 y(a)i(collectiv)o(e)d(comm)o(unic)o (ation)g(task,)25 b(suc)o(h)d(as)h(a)h(broadcast,)h(is)d(b)q(eing)h(done,)h (con)o(ten)o(tion)e(for)h(ph)o(ysical)-57 1515 y(resources)15 b(can)g(degrade)g(p)q(erformance.)20 b(Th)o(us,)15 b(the)g(w)o(a)o(y)f(in)h (whic)o(h)f(pro)q(cesses)i(are)f(assigned)g(to)h(pro)q(cessors)-57 1581 y(can)c(a\013ect)g(p)q(erformance)f(if)g(some)g(assignmen)o(ts)g(result) h(in)g(di\013ering)f(amoun)o(ts)h(of)g(con)o(ten)o(tion.)19 b(Logarithmic)-57 1647 y(con)o(ten)o(tion-free)i(broadcast)i(algorithms)e(ha) o(v)o(e)g(b)q(een)h(dev)o(elop)q(ed)f(for)h(pro)q(cessors)i(connected)d(as)h (a)h(t)o(w)o(o-)-57 1714 y(dimensional)15 b(mesh)g([6,)h(51)q(],)f(so)i(on)g (suc)o(h)f(mac)o(hines)f(pro)q(cess)i(\()p Ff(p;)8 b(q)r Fm(\))16 b(is)g(usually)g(mapp)q(ed)g(to)h(the)f(pro)q(cessor)-57 1780 y(at)c(p)q(osition)f(\()p Ff(p;)d(q)r Fm(\))j(in)g(the)g(mesh)f(of)h(pro)q (cessors.)21 b(Suc)o(h)11 b(an)g(assignmen)o(t)f(also)i(ensures)f(that)h(the) f(m)o(ultiple)d(one-)-57 1846 y(dimensional)16 b(broadcasts)i(of)g Ff(L)545 1853 y Fe(1)582 1846 y Fm(and)g Ff(U)711 1853 y Fe(1)748 1846 y Fm(along)g(the)f(ro)o(ws)h(and)f(columns)f(of)i(the)f(template,)e (resp)q(ectiv)o(ely)l(,)-57 1912 y(do)i(not)f(giv)o(e)g(rise)f(to)i(con)o (ten)o(tion.)-57 2070 y Fi(7.1.2)55 b(La)n(y)n(out)19 b(of)g(Lo)r(cal)f(Pro)r (cess)g(Memory)-57 2189 y Fm(The)h(la)o(y)o(out)g(of)g(matrix)e(blo)q(c)o(ks) i(in)g(the)g(lo)q(cal)g(memory)d(of)j(a)h(pro)q(cess,)g(and)f(the)g (arrangemen)o(t)f(of)i(matrix)-57 2256 y(elemen)o(ts)c(within)i(eac)o(h)g (blo)q(c)o(k,)g(can)g(also)h(a\013ect)g(p)q(erformance.)27 b(Here,)17 b(tradeo\013s)j(among)e(sev)o(eral)f(factors)-57 2322 y(need)11 
b(to)h(b)q(e)f(tak)o(en)g(in)o(to)g(accoun)o(t.)19 b(When)12 b(comm)o(uni)o(cating)d(matrix)g(blo)q(c)o(ks,)j(for)f(example)f (in)g(the)i(broadcasts)-57 2388 y(of)j Ff(L)30 2395 y Fe(1)65 2388 y Fm(and)g Ff(U)191 2395 y Fe(1)211 2388 y Fm(,)g(w)o(e)f(w)o(ould)h (lik)o(e)e(the)i(data)h(in)e(eac)o(h)h(blo)q(c)o(k)f(to)h(b)q(e)g(con)o (tiguous)h(in)e(ph)o(ysical)g(memory)e(so)k(there)-57 2454 y(is)h(no)g(need)g(to)g(pac)o(k)g(them)e(in)o(to)i(a)g(comm)o(unication)d (bu\013er)k(b)q(efore)f(sending)g(them.)22 b(On)17 b(the)g(other)g(hand,)-57 2520 y(when)h(up)q(dating)i(the)e(trailing)g(submatrix,)f Ff(E)s Fm(,)h(eac)o(h)g(pro)q(cess)h(m)o(ultiplie)o(s)d(a)j(column)e(of)h(blo)q(c)o (ks)g(b)o(y)g(a)h(ro)o(w)-57 2587 y(of)e(blo)q(c)o(ks,)g(to)g(do)h(a)f(rank-) p Ff(r)i Fm(up)q(date)f(on)f(the)g(part)h(of)f Ff(E)j Fm(that)d(it)g(con)o (tains.)24 b(If)16 b(this)h(w)o(ere)f(done)i(as)f(a)h(series)-57 2653 y(of)d(separate)g(blo)q(c)o(k-blo)q(c)o(k)f(matrix)f(m)o(ultipli)o (cations,)f(as)k(sho)o(wn)f(in)f(Figure)g(18\(a\),)i(the)e(p)q(erformance)g (w)o(ould)939 2825 y(29)p eop %%Page: 30 32 31 bop -57 125 a Fm(b)q(e)22 b(p)q(o)q(or)i(except)d(for)i(su\016cien)o(tly)d (large)i(blo)q(c)o(k)g(sizes,)g Ff(r)q Fm(,)i(since)e(the)g(v)o(ector)f (and/or)i(pip)q(eline)e(units)i(on)-57 191 y(most)16 b(pro)q(cessors)i(w)o (ould)f(not)g(b)q(e)g(fully)f(utilized,)e(as)k(ma)o(y)d(b)q(e)i(seen)f(in)h (Figure)f(17)h(for)g(the)g(i860)g(pro)q(cessor.)-57 257 y(Instead,)j(w)o(e)f (arrange)h(the)f(lo)q(ops)h(of)g(the)f(computation)f(as)i(sho)o(wn)g(in)f (Figure)g(18\(b\).)31 b(No)o(w,)19 b(if)g(the)g(data)-57 323 y(are)e(laid)f(out)h(in)f(ph)o(ysical)g(memory)e(\014rst)i(b)o(y)h(running)g (o)o(v)o(er)e(the)i Ff(i)f Fm(index)g(and)h(then)f(o)o(v)o(er)g(the)g Ff(d)h Fm(index)f(the)-57 390 y(inner)f(t)o(w)o(o)g(lo)q(ops)h(can)f(b)q(e)h (merged,)d(so)j(that)g(the)f(length)g(of)g(the)g(inner)g(lo)q(op)h(is)f(no)o (w)g Ff(r)q(d)1578 397 y Fe(max)1647 390 y Fm(.)21 b(This)15 b(generally)-57 456 
y(results)h(in)h(m)o(uc)o(h)d(b)q(etter)j(v)o(ector/pip)q (eline)e(p)q(erformance.)21 b(The)c Ff(b)g Fm(and)g Ff(j)j Fm(lo)q(ops)d(in)g(Figure)f(18\(b\))i(can)e(also)-57 522 y(b)q(e)i(merged,)e (giving)i(the)g(algorithm)f(sho)o(wn)h(in)g(Figure)f(18\(c\).)27 b(This)18 b(is)g(just)g(the)f(outer)h(pro)q(duct)h(form)e(of)-57 588 y(the)g(m)o(ultipli)o(cation)e(of)i(an)g Ff(r)q(d)514 595 y Fe(max)594 588 y Fl(\002)12 b Ff(r)18 b Fm(b)o(y)f(an)g Ff(r)c Fl(\002)e Ff(r)q(b)951 595 y Fe(max)1036 588 y Fm(matrix,)k(and)j(w)o(ould)f (usually)f(b)q(e)i(done)f(b)o(y)f(a)i(call)-57 654 y(to)d(the)f(Lev)o(el)g(3) h(BLAS)f(routine)h(xGEMM)f(of)h(whic)o(h)f(an)h(assem)o(bly)e(co)q(ded)i (sequen)o(tial)e(v)o(ersion)h(is)h(a)o(v)m(ailable)-57 721 y(on)f(most)g(mac)o(hines.)k(Note)c(that)g(in)g(Figure)f(18\(c\))h(the)g (order)g(of)g(the)g(inner)f(t)o(w)o(o)h(lo)q(ops)h(is)f(appropriate)g(for)h (a)-57 787 y(F)l(ortran)h(implem)o(en)n(tation)d({)j(for)f(the)g(C)h (language)g(this)f(order)h(should)f(b)q(e)h(rev)o(ersed,)e(and)h(the)g(data)i (should)-57 853 y(b)q(e)f(stored)h(in)f(eac)o(h)g(pro)q(cess)h(b)o(y)e(ro)o (ws)i(instead)f(of)h(b)o(y)f(columns.)-57 941 y(W)l(e)11 b(ha)o(v)o(e)g (found)h(in)f(our)h(w)o(ork)g(on)g(the)f(In)o(tel)f(iPSC/860)j(h)o(yp)q (ercub)q(e)e(and)h(the)g(Delta)f(system)f(that)i(it)f(is)g(b)q(etter)-57 1007 y(to)j(optimize)c(for)k(the)e(sequen)o(tial)g(matrix)g(m)o(ultipli)o (cation)f(with)i(an)g(\()p Ff(i;)8 b(d;)g(j;)g(b)p Fm(\))13 b(ordering)g(of)h(mem)o(ory)d(in)i(eac)o(h)-57 1073 y(pro)q(cess,)k(rather)g (than)h(adopting)f(an)h(\()p Ff(i;)8 b(j;)g(d;)g(b)p Fm(\))16 b(ordering)h(to)g(a)o(v)o(oid)g(bu\013er)g(copies)f(when)h(comm)o(unicating) -57 1139 y(blo)q(c)o(ks.)j(Ho)o(w)o(ev)o(er,)13 b(there)h(is)g(another)h (reason)g(for)g(doing)g(this.)20 b(On)14 b(most)g(distributed)g(memory)d (computers)-57 1206 y(the)16 b(message)f(startup)i(cost)g(is)f(su\016cien)o (tly)e(large)i(that)h(it)f(is)g(preferable)f(wherev)o(er)g(p)q(ossible)h(to)h (send)f(data)-57 1272 
y(as)k(one)g(large)g(message)f(rather)h(than)g(as)g (sev)o(eral)f(smaller)e(messages.)31 b(Th)o(us,)21 b(when)f(comm)o(uni)o (cating)d Ff(L)1963 1279 y Fe(1)-57 1338 y Fm(and)g Ff(U)71 1345 y Fe(1)107 1338 y Fm(the)g(blo)q(c)o(ks)f(to)h(b)q(e)f(broadcast)i(w)o (ould)e(b)q(e)h(amalgamated)e(in)o(to)h(a)h(single)f(message,)f(whic)o(h)h (requires)-57 1404 y(a)k(bu\013er)g(cop)o(y)l(.)32 b(The)20 b(emerging)e(Message)i(P)o(assing)h(In)o(terface)d(\(MPI\))i(standard)h([21]) f(pro)o(vides)f(supp)q(ort)-57 1470 y(for)e(noncon)o(tiguous)i(messages,)d (so)i(in)e(the)h(future)g(the)g(need)f(to)i(a)o(v)o(oid)e(bu\013er)i(copies)e (will)g(not)i(b)q(e)f(of)g(suc)o(h)-57 1537 y(concern)f(to)g(the)g (application)g(dev)o(elop)q(er.)-57 1710 y Fo(7.2)70 b(T)-6 b(radeo\013s)24 b(b)r(et)n(w)n(een)d(Load)j(Balance)e(and)i(Comm)n(unication) c(Latency)-57 1830 y Fm(W)l(e)15 b(ha)o(v)o(e)f(discussed)i(the)f(mapping)f (of)i(the)f(logical)f(hierarc)o(hical)g(memory)e(to)k(ph)o(ysical)e(memory)l (.)k(In)d(addi-)-57 1896 y(tion,)g(w)o(e)g(ha)o(v)o(e)f(p)q(oin)o(ted)h(out)h (the)f(imp)q(ortance)f(of)i(main)o(taining)e(long)h(inner)g(lo)q(ops)h(to)g (get)f(go)q(o)q(d)i(sequen)o(tial)-57 1962 y(p)q(erformance)g(for)i(eac)o(h)f (pro)q(cess,)h(and)g(the)f(desirabilit)o(y)f(of)i(sending)f(a)h(few)g(large)f (messages)g(rather)g(than)-57 2028 y(man)o(y)i(smaller)g(ones.)39 b(W)l(e)22 b(next)f(consider)h(load)g(balance)g(issues.)38 b(Assuming)21 b(that)h(equal)f(n)o(um)o(b)q(ers)g(of)-57 2095 y(pro)q(cesses)d(ha)o(v)o(e)e(b)q(een)h(assigned)h(to)g(eac)o(h)e(pro)q (cessor,)i(load)g(im)o(balance)d(arises)i(in)g(t)o(w)o(o)g(phases)h(of)f(the) g(par-)-57 2161 y(allel)i(LU)i(factorization)g(algorithm;)g(namely)l(,)f(in)g (factoring)h(eac)o(h)f(column)g(blo)q(c)o(k,)h(whic)o(h)f(in)o(v)o(olv)o(es)e (only)-57 2227 y Ff(P)24 b Fm(pro)q(cesses,)18 b(and)g(in)e(solving)i(the)e (lo)o(w)o(er)h(triangular)g(system)f(to)h(ev)m(aluate)g(eac)o(h)g(ro)o(w)g (blo)q(c)o(k)g(of)g Ff(U)5 b Fm(,)18 b(whic)o(h)-57 2293 
y(in)o(v)o(olv)o(es) f(only)i Ff(Q)f Fm(pro)q(cesses.)30 b(If)18 b(the)h(time)e(for)i(data)h(mo)o (v)o(em)o(en)o(t)15 b(is)k(negligible,)f(the)h(asp)q(ect)g(ratio)g(of)g(the) -57 2360 y(template)14 b(that)j(minim)o(ize)o(s)d(load)i(im)o(balance)e(in)i (step)h Ff(k)h Fm(of)e(the)g(algorithm)g(is,)464 2458 y Ff(P)p 464 2480 39 2 v 464 2526 a(Q)549 2491 y Fm(=)633 2458 y(Sequen)o(tial)f(time) f(to)j(factor)f(column)f(blo)q(c)o(k)p 633 2480 830 2 v 672 2526 a(Sequen)o(tial)f(time)h(for)h(triangular)h(solv)o(e)549 2625 y(=)633 2591 y Ff(M)680 2598 y Fd(b)709 2591 y Fl(\000)10 b Ff(k)k Fl(\000)c Fm(1)p Ff(=)p Fm(3)j(+)e(O\(1)p Ff(=r)1108 2573 y Fe(2)1128 2591 y Fm(\))p 633 2613 515 2 v 662 2659 a Ff(N)701 2666 y Fd(b)729 2659 y Fl(\000)g Ff(k)i Fl(\000)e Fm(1)g(+)g(O\(1)p Ff(=r)1079 2645 y Fe(2)1100 2659 y Fm(\))1896 2625 y(\(17\))939 2825 y(30)p eop %%Page: 31 33 32 bop -57 125 a Fm(where)13 b Ff(M)128 132 y Fd(b)150 125 y Fl(\002)5 b Ff(N)233 132 y Fd(b)263 125 y Fm(is)12 b(the)h(matrix)f(size)g (in)h(blo)q(c)o(ks,)g(and)g Ff(r)i Fm(the)e(blo)q(c)o(k)f(size.)20 b(Th)o(us,)13 b(the)g(optimal)f(asp)q(ect)h(ratio)h(of)-57 191 y(the)g(template)d(should)k(b)q(e)f(the)f(same)g(as)i(the)e(asp)q(ect)i (ratio)f(of)g(the)g(matrix,)e(i.e.,)g Ff(M)1487 198 y Fd(b)1504 191 y Ff(=)m(N)1564 198 y Fd(b)1596 191 y Fm(in)i(blo)q(c)o(ks,)f(or)h Ff(M)r(=)m(N)-57 257 y Fm(in)i(elemen)o(ts.)i(If)d(the)h(e\013ect)g(of)g (comm)o(unic)o(ation)e(time)g(is)h(included)g(then)h(w)o(e)g(m)o(ust)e(tak)o (e)i(in)o(to)f(accoun)o(t)h(the)-57 323 y(relativ)o(e)c(times)f(tak)o(en)i (to)h(lo)q(cate)f(and)h(broadcast)h(the)e(piv)o(ot)g(information,)f(and)i (the)g(time)d(to)j(broadcast)g(the)-57 390 y(lo)o(w)o(er)e(triangular)i (matrix,)e Ff(L)495 397 y Fe(0)515 390 y Fm(,)h(along)h(a)g(ro)o(w)g(of)f (the)g(template.)18 b(F)l(or)c(b)q(oth)g(tasks)g(the)f(comm)o(unic)o(ation)e (time)-57 456 y(increases)18 b(with)g(the)h(n)o(um)o(b)q(er)d(of)j(pro)q 
(cesses)g(in)o(v)o(olv)o(ed,)d(and)j(since)f(the)g(comm)o(unication)e(time)g (asso)q(ciated)-57 522 y(with)g(the)g(piv)o(oting)f(is)h(greater)g(than)h (that)f(asso)q(ciated)h(with)f(the)g(triangular)g(solv)o(e,)f(w)o(e)h(w)o (ould)g(exp)q(ect)f(the)-57 588 y(optim)o(um)h(asp)q(ect)j(ratio)g(of)g(the)g (template)e(to)i(b)q(e)g(less)f(than)i Ff(M)r(=)m(N)5 b Fm(.)30 b(In)19 b(fact,)g(for)g(our)g(runs)h(on)f(the)f(In)o(tel)-57 654 y(Delta)i(system)e(w)o(e)h(found)h(an)h(asp)q(ect)f(ratio,)g Ff(P)t(=Q)p Fm(,)h(of)f(b)q(et)o(w)o(een)f(1/4)h(and)h(1/8)g(to)f(b)q(e)f (optimal)g(for)h(most)-57 721 y(problems)12 b(with)i(square)f(matrices,)f (and)i(that)g(p)q(erformance)f(dep)q(ends)h(rather)f(w)o(eakly)g(on)h(the)f (asp)q(ect)h(ratio,)-57 787 y(particularly)g(for)h(large)g(grain)h(sizes.)k (Some)14 b(t)o(ypical)g(results)h(are)g(sho)o(wn)g(in)g(Figure)g(19)h(for)f (256)h(pro)q(cessors,)-57 853 y(whic)o(h)h(sho)o(w)h(a)h(v)m(ariation)f(of)g (less)f(than)i(20\045)f(in)f(p)q(erformance)g(as)h Ff(P)t(=Q)g Fm(v)m(aries)g(b)q(et)o(w)o(een)f(1/16)i(and)g(1)f(for)-57 919 y(the)e(largest)g(problem.)-57 1007 y(The)f(blo)q(c)o(k)f(size,)g Ff(r)q Fm(,)h(also)g(a\013ects)g(load)g(balance.)21 b(Here)14 b(the)g(tradeo\013)i(is)f(b)q(et)o(w)o(een)e(the)i(load)g(im)o(balance)e (that)-57 1073 y(arises)g(as)h(ro)o(ws)g(and)g(columns)e(of)i(the)f(matrix)e (are)j(eliminated)c(as)k(the)f(algorithm)f(progresses,)j(and)f(comm)o(u-)-57 1139 y(nication)j(startup)h(costs.)26 b(The)17 b(blo)q(c)o(k)g(cyclic)f (decomp)q(osition)g(seeks)h(to)h(main)o(tain)e(go)q(o)q(d)j(load)f(balance)g (b)o(y)-57 1206 y(cyclically)c(assigning)j(blo)q(c)o(ks)f(to)h(pro)q(cesses,) g(and)g(the)f(load)h(balance)f(is)g(b)q(est)h(if)f(the)g(blo)q(c)o(ks)g(are)h (small.)j(On)-57 1272 y(the)d(other)f(hand,)i(cum)o(ulativ)o(e)13 b(comm)o(unicati)o(on)i(startup)i(costs)h(are)e(less)h(if)f(the)h(blo)q(c)o (k)f(size)g(is)g(large)h(since,)-57 1338 y(in)e(this)f(case,)h(few)o(er)f 
(messages)h(m)o(ust)e(b)q(e)i(sen)o(t)g(\(although)h(the)f(total)g(v)o(olume) d(of)k(data)g(sen)o(t)e(is)h(indep)q(enden)o(t)-57 1404 y(of)k(the)g(blo)q(c) o(k)g(size\).)29 b(Th)o(us,)20 b(there)e(is)h(a)g(blo)q(c)o(k)g(size)f(that)i (optimally)d(balances)i(the)g(load)h(im)o(balance)c(and)-57 1470 y(comm)o(unic)o(ation)e(startup)j(costs.)-57 1644 y Fo(7.3)70 b(Optimali)o(t)n(y)20 b(and)k(Pip)r(elini)o(ng)c(T)-6 b(radeo\013s)-57 1764 y Fm(The)19 b(comm)o(unic)o(ation)d(algorithms)i(used)h(also)h (in\015uence)e(p)q(erformance.)27 b(In)19 b(the)f(LU)h(factorization)g(algo-) -57 1830 y(rithm,)g(all)g(the)h(comm)o(unication)d(can)j(b)q(e)h(done)f(b)o (y)g(mo)o(ving)e(data)j(along)g(ro)o(ws)g(and/or)g(columns)e(of)h(the)-57 1896 y(pro)q(cess)e(template.)k(This)c(t)o(yp)q(e)f(of)g(comm)o(unicati)o(on) e(can)j(b)q(e)f(done)h(b)o(y)f(passing)h(from)e(one)i(pro)q(cess)g(to)f(the) -57 1962 y(next)g(along)h(the)f(ro)o(w)g(or)h(column.)23 b(W)l(e)17 b(shall)g(call)f(this)h(a)h(\\ring")g(algorithm,)e(although)i(the)f(ring)g (ma)o(y)l(,)f(or)-57 2028 y(ma)o(y)c(not,)j(b)q(e)f(closed.)20 b(An)14 b(alternativ)o(e)e(is)i(to)g(use)g(a)g(spanning)h(tree)e(algorithm,)g (of)i(whic)o(h)e(there)g(are)h(sev)o(eral)-57 2095 y(v)m(arieties.)32 b(The)20 b(complexit)o(y)d(of)j(the)g(ring)g(algorithm)f(is)h(linear)g(in)f (the)h(n)o(um)o(b)q(er)f(of)h(pro)q(cesses)h(in)o(v)o(olv)o(ed,)-57 2161 y(whereas)c(that)f(of)h(spanning)g(tree)f(algorithms)f(is)h(logarithmic) f(\(for)h(example,)e(see)i([6]\).)k(Th)o(us,)d(considered)-57 2227 y(in)d(isolation,)g(the)h(spanning)g(tree)f(algorithms)f(are)i (preferable)e(to)i(a)g(ring)f(algorithm.)20 b(Ho)o(w)o(ev)o(er,)12 b(in)i(a)h(span-)-57 2293 y(ning)20 b(tree)g(algorithm,)f(a)i(pro)q(cess)f (ma)o(y)f(tak)o(e)g(part)i(in)e(sev)o(eral)g(of)i(the)f(logarithmic)e(steps,) j(and)f(in)g(some)-57 2360 y(implem)o(en)o(tati)o(ons)e(these)h(algorithms)f (act)i(as)g(a)g(barrier.)30 b(In)19 b(a)h(ring)g(algorithm,)e(eac)o(h)h(pro)q (cess)h(needs)g(to)-57 2426 
y(comm)o(unic)o(ate)14 b(only)j(once,)g(and)g (can)g(then)g(con)o(tin)o(ue)f(to)h(compute,)e(in)i(e\013ect)f(o)o(v)o (erlapping)g(the)h(comm)o(uni-)-57 2492 y(cation)h(with)f(computation.)24 b(An)17 b(algorithm)f(that)i(in)o(terlea)o(v)o(es)d(comm)o(unic)o(ation)g (and)j(calculation)f(in)g(this)-57 2558 y(w)o(a)o(y)e(is)f(often)i(referred)d (to)j(as)f(a)h(pip)q(elined)e(algorithm.)19 b(In)c(a)g(pip)q(elined)f(LU)h (factorization)g(algorithm)f(with)-57 2624 y(no)21 b(piv)o(oting,)g(comm)o (unic)o(ation)d(and)k(calculation)e(w)o(ould)g(\015o)o(w)h(in)g(w)o(a)o(v)o (es)f(across)h(the)g(matrix.)32 b(Piv)o(oting)-57 2691 y(tends)16 b(to)h(inhibit)e(this)h(adv)m(an)o(tage)i(of)e(pip)q(elining.)939 2825 y(31)p eop %%Page: 32 34 33 bop -57 125 a Fm(In)12 b(the)h(pseudo)q(co)q(de)g(in)g(Figure)f(15,)h(w)o (e)g(do)g(not)g(sp)q(ecify)f(ho)o(w)h(the)f(piv)o(ot)g(information)f(should)j (b)q(e)e(broadcast.)-57 191 y(In)i(an)h(optimized)d(implem)o(e)o(n)o(tation,) g(w)o(e)h(need)h(to)h(\014nish)f(with)g(the)g(piv)o(ot)g(phase,)g(and)h(the)f (triangular)h(solv)o(e)-57 257 y(phase,)22 b(as)f(so)q(on)h(as)f(p)q(ossible) g(in)f(order)h(to)g(b)q(egin)f(the)h(up)q(date)g(phase)g(whic)o(h)f(is)g(ric) o(hest)g(in)g(parallelism.)-57 323 y(Th)o(us,)15 b(it)g(is)f(not)i(a)f(go)q (o)q(d)i(idea)e(to)g(broadcast)h(the)f(piv)o(ot)f(information)g(from)g(a)i (single)e(source)h(pro)q(cess)h(using)-57 390 y(a)k(spanning)h(tree)f (algorithm,)f(since)g(this)h(ma)o(y)f(o)q(ccup)o(y)g(some)g(of)i(the)e(pro)q (cesses)i(in)o(v)o(olv)o(ed)d(in)h(the)h(panel)-57 456 y(factorization)c(for) h(to)q(o)h(long.)k(It)16 b(is)g(imp)q(ortan)o(t)g(to)h(get)f(the)g(piv)o(ot)g (information)g(to)h(the)f(other)g(pro)q(cesses)h(in)-57 522 y(this)e(template)e(column)g(as)i(so)q(on)i(as)e(p)q(ossible,)g(so)h(the)e (piv)o(ot)h(information)e(is)i(\014rst)g(sen)o(t)g(to)g(these)f(pro)q(cesses) -57 588 y(whic)o(h)i(subsequen)o(tly)g(broadcast)i(it)f(along)h(the)e 
(template)f(ro)o(ws)j(to)f(the)g(other)g(pro)q(cesses)g(not)h(in)o(v)o(olv)o (ed)c(in)-57 654 y(the)20 b(panel)g(factorization.)33 b(In)19 b(addition,)i(the)f(exc)o(hange)g(of)g(the)g(parts)h(of)f(the)g(piv)o(ot)f (ro)o(ws)i(lying)f(within)-57 721 y(the)e(panel)h(is)f(done)h(separately)g (from)e(that)i(of)g(the)g(parts)g(outside)g(the)f(piv)o(ot)g(panel.)28 b(Another)19 b(factor)g(to)-57 787 y(consider)h(here)f(is)h(when)f(the)h(piv) o(ot)f(information)g(should)i(b)q(e)f(broadcast)h(along)f(the)g(template)e (columns.)-57 853 y(In)f(Figure)h(15,)g(the)g(information)e(is)i(broadcast,)h (and)f(ro)o(ws)g(exc)o(hanged,)g(imm)o(ediatel)o(y)d(after)j(the)f(piv)o(ot)g (is)-57 919 y(found.)23 b(An)16 b(alternativ)o(e)f(is)h(to)h(store)g(up)g (the)f(sequence)g(of)h Ff(r)g Fm(piv)o(ots)g(for)f(a)h(panel)g(and)g(to)g (broadcast)h(them)-57 986 y(along)e(the)f(template)f(ro)o(ws)h(when)h(panel)f (factorization)g(is)g(complete.)k(This)c(defers)g(the)g(exc)o(hange)g(of)h (piv)o(ot)-57 1052 y(ro)o(ws)21 b(for)g(the)f(parts)h(outside)f(the)h(panel)f (un)o(til)f(the)i(panel)f(factorization)g(has)h(b)q(een)g(done,)g(as)g(sho)o (wn)g(in)-57 1118 y(the)d(pseudo)q(co)q(de)h(fragmen)o(t)e(in)h(Figure)f(20.) 
28 b(An)18 b(adv)m(an)o(tage)h(of)f(this)g(second)h(approac)o(h)f(is)g(that)h (only)f(one)-57 1184 y(message)e(is)h(used)g(to)g(send)g(the)f(piv)o(ot)g (information)g(for)h(the)g(panel)f(along)i(the)e(template)f(ro)o(ws,)i (instead)g(of)-57 1250 y Ff(r)h Fm(messages.)-57 1338 y(In)g(our)i(implem)o (e)o(n)o(tation)c(of)j(LU)g(factorization)g(on)g(the)f(In)o(tel)g(Delta)g (system,)g(w)o(e)g(used)h(a)g(spanning)h(tree)-57 1404 y(algorithm)g(to)i(lo) q(cate)g(the)f(piv)o(ot)g(and)h(to)g(broadcast)h(it)e(within)g(the)g(column)f (of)i(the)f(pro)q(cess)h(template)-57 1470 y(p)q(erforming)14 b(the)i(panel)f(factorization.)21 b(This)15 b(ensures)h(that)f(piv)o(oting,)g (whic)o(h)g(in)o(v)o(olv)o(es)e(only)i Ff(P)23 b Fm(pro)q(cesses,)-57 1537 y(is)18 b(completed)e(as)j(quic)o(kly)d(as)i(p)q(ossible.)28 b(A)17 b(ring)i(broadcast)g(is)f(used)g(to)h(pip)q(eline)e(the)g(piv)o(ot)h (information)-57 1603 y(and)j(the)f(factored)g(panel)g(along)h(the)g (template)d(ro)o(ws.)34 b(Finally)l(,)19 b(after)h(the)g(triangular)h(solv)o (e)e(phase)i(has)-57 1669 y(completed,)12 b(a)k(spanning)f(tree)g(broadcast)h (is)e(used)h(to)g(send)g(the)g(newly-formed)e(blo)q(c)o(k)h(ro)o(w)h(of)g Ff(U)21 b Fm(along)15 b(the)-57 1735 y(template)i(columns.)26 b(Results)18 b(for)h(square)g(matrices)e(from)g(runs)i(on)g(the)f(In)o(tel)f (Delta)i(system)e(are)h(sho)o(wn)-57 1802 y(in)f(Figure)g(21.)26 b(F)l(or)17 b(eac)o(h)g(curv)o(e)g(the)g(results)g(for)h(the)f(b)q(est)h(pro) q(cess)g(template)e(con\014guration)i(are)g(sho)o(wn.)-57 1868 y(Recalling)f(that)i(for)g(a)g(scalable)f(algorithm)f(the)h(p)q(erformance)f (should)i(dep)q(end)g(linearly)e(on)i(the)f(n)o(um)o(b)q(er)-57 1934 y(of)e(pro)q(cessors)h(for)f(\014xed)g(gran)o(ularit)o(y)f(\(see)g(Eq.) 
21 b(2\),)16 b(it)f(is)h(apparen)o(t)g(that)h(scalabilit)o(y)d(ma)o(y)g(b)q (e)i(assessed)h(b)o(y)-57 2000 y(the)g(exten)o(t)e(to)j(whic)o(h)e(isogran)o (ularit)o(y)g(curv)o(es)g(di\013er)h(from)f(linearit)o(y)l(.)21 b(An)c(isogran)o(ularit)o(y)f(curv)o(e)g(is)h(a)g(plot)-57 2066 y(of)d(p)q(erformance)f(against)i(n)o(um)o(b)q(er)e(of)h(pro)q(cessors)h (for)g(a)f(\014xed)g(gran)o(ularit)o(y)l(.)20 b(The)14 b(results)g(in)f (Figure)h(21)h(can)-57 2133 y(b)q(e)i(used)g(to)g(generate)g(the)f(isogran)o (ularit)o(y)h(curv)o(es)f(sho)o(wn)h(in)g(Figure)f(22)i(whic)o(h)e(sho)o(w)h (that)g(on)h(the)e(Delta)-57 2199 y(system)j(the)h(LU)g(factorization)g (routine)g(starts)i(to)e(lose)g(scalabilit)o(y)f(when)h(the)g(gran)o(ularit)o (y)g(falls)g(b)q(elo)o(w)-57 2265 y(ab)q(out)g(0)p Ff(:)p Fm(2)13 b Fl(\002)f Fm(10)259 2247 y Fe(6)279 2265 y Fm(.)28 b(This)19 b(corresp)q(onds)g(to)g(a)g(matrix)e(size)g(of)i(ab)q(out)h Ff(M)j Fm(=)17 b(10000)j(on)f(512)h(pro)q(cessors,)g(or)-57 2331 y(ab)q(out)14 b(13\045)g(of)f(the)g(memory)d(a)o(v)m(ailable)i(to)h (applications)g(on)h(the)e(Delta,)h(indicating)g(that)g(LU)g(factorization) -57 2397 y(scales)j(rather)g(w)o(ell)f(on)i(the)f(In)o(tel)f(Delta)h(system.) 
939 2825 y(32)p eop %%Page: 33 35 34 bop -57 125 a Fh(8)83 b(Conclusions)25 b(and)j(F)-7 b(uture)27 b(Researc)n(h)e(Directions)-57 261 y Fm(P)o(ortabilit)o(y)19 b(of)i(programs)g(has)g(alw)o(a)o(ys)f(b)q(een)h(an)g(imp)q(ortan)o(t)f (consideration.)34 b(P)o(ortabilit)o(y)19 b(w)o(as)i(easy)g(to)-57 328 y(ac)o(hiev)o(e)15 b(when)j(there)f(w)o(as)g(a)h(single)f(arc)o (hitectural)f(paradigm)h(\(the)g(serial)f(v)o(on)h(Neumann)f(mac)o(hine\))f (and)-57 394 y(a)20 b(single)g(programming)e(language)j(for)f(scien)o (ti\014c)e(programming)h(\(F)l(ortran\))h(em)o(b)q(o)q(dying)f(that)h(common) -57 460 y(mo)q(del)c(of)i(computation.)24 b(Arc)o(hitectural)15 b(and)j(linguistic)e(div)o(ersit)o(y)g(ha)o(v)o(e)g(made)g(p)q(ortabilit)o(y) h(m)o(uc)o(h)e(more)-57 526 y(di\016cult,)e(but)h(no)g(less)g(imp)q(ortan)o (t,)f(to)h(attain.)21 b(Users)14 b(simply)e(do)i(not)h(wish)f(to)g(in)o(v)o (est)f(signi\014can)o(t)h(amoun)o(ts)-57 592 y(of)k(time)e(to)i(create)f (large-scale)g(application)h(co)q(des)g(for)g(eac)o(h)f(new)h(mac)o(hine.)24 b(Our)17 b(answ)o(er)h(is)g(to)g(dev)o(elop)-57 659 y(p)q(ortable)f(soft)o(w) o(are)f(libraries)f(that)i(hide)f(mac)o(hine-sp)q(eci\014c)d(details.)-57 831 y Fo(8.1)70 b(P)n(ortabilit)n(y)-6 b(,)21 b(Scalabilit)n(y)-6 b(,)21 b(and)j(Standards)-57 950 y Fm(In)e(order)h(to)g(b)q(e)g(truly)f(p)q (ortable,)i(parallel)e(soft)o(w)o(are)h(libraries)e(m)o(ust)g(b)q(e)i Fg(standar)n(dize)n(d)p Fm(.)39 b(In)23 b(a)g(parallel)-57 1016 y(computing)d(en)o(vironmen)o(t)e(in)i(whic)o(h)g(the)g(higher-lev)o(el) f(routines)i(and/or)h(abstractions)f(are)g(built)f(up)q(on)-57 1083 y(lo)o(w)o(er-lev)o(el)c(computation)i(and)i(message-passing)f (routines,)g(the)g(b)q(ene\014ts)g(of)h(standardization)f(are)g(par-)-57 1149 y(ticularly)13 b(apparen)o(t.)21 b(F)l(urthermore,)13 b(the)i(de\014nition)f(of)h(computational)f(and)h(message-passing)h (standards)-57 1215 y(pro)o(vides)g(v)o(endors)g(with)g(a)h(clearly)d 
(de\014ned)i(base)h(set)f(of)h(routines)f(that)h(they)e(can)i(implem)o(en)n (t)d(e\016cien)o(tly)l(.)-57 1302 y(>F)l(rom)g(the)h(user's)f(p)q(oin)o(t)i (of)f(view,)f(p)q(ortabilit)o(y)g(means)g(that,)i(as)f(new)g(mac)o(hines)e (are)j(dev)o(elop)q(ed,)d(they)i(are)-57 1368 y(simply)f(added)j(to)f(the)g (net)o(w)o(ork,)f(supplying)i(cycles)d(where)i(they)g(are)g(most)g (appropriate.)-57 1456 y(>F)l(rom)h(the)h(mathematical)d(soft)o(w)o(are)j (dev)o(elop)q(er's)e(p)q(oin)o(t)i(of)h(view,)e(p)q(ortabilit)o(y)g(ma)o(y)g (require)f(signi\014can)o(t)-57 1522 y(e\013ort.)23 b(Econom)o(y)16 b(in)g(dev)o(elopmen)o(t)e(and)j(main)o(tenance)e(of)i(mathematical)c(soft)o (w)o(are)k(demands)f(that)h(suc)o(h)-57 1588 y(dev)o(elopmen)o(t)12 b(e\013ort)k(b)q(e)g(lev)o(eraged)e(o)o(v)o(er)g(as)i(man)o(y)e(di\013eren)o (t)h(computer)f(systems)g(as)i(p)q(ossible.)21 b(Giv)o(en)14 b(the)-57 1654 y(great)20 b(div)o(ersit)o(y)d(of)j(parallel)f(arc)o (hitectures,)g(this)g(t)o(yp)q(e)g(of)h(p)q(ortabilit)o(y)f(is)g(attainable)h (to)g(only)f(a)h(limited)-57 1720 y(degree,)15 b(but)i(mac)o(hine)d(dep)q (endences)h(can)i(at)g(least)f(b)q(e)g(isolated.)-57 1808 y(LAP)l(A)o(CK)21 b(is)h(an)g(example)e(of)i(a)g(mathematical)d(soft)o(w)o(are)i(pac)o(k)m(age) h(whose)h(highest-lev)o(el)d(comp)q(onen)o(ts)-57 1874 y(are)c(p)q(ortable,)h (while)e(mac)o(hine)f(dep)q(endences)i(are)h(hidden)f(in)g(lo)o(w)o(er-lev)o (el)d(mo)q(dules.)21 b(Suc)o(h)16 b(a)h(hierarc)o(hical)-57 1940 y(approac)o(h)23 b(is)f(probably)g(the)g(closest)g(one)g(can)h(come)d (to)j(soft)o(w)o(are)f(p)q(ortabilit)o(y)f(across)i(div)o(erse)e(parallel)-57 2006 y(arc)o(hitectures.)f(And)15 b(the)h(BLAS)f(that)i(are)f(used)g(so)g (hea)o(vily)e(in)i(LAP)l(A)o(CK)f(pro)o(vide)g(a)i(p)q(ortable,)f(e\016cien)o (t,)-57 2072 y(and)h(\015exible)d(standard)k(for)f(applications)f (programmers.)-57 2160 y(Lik)o(e)i(p)q(ortabilit)o(y)l(,)h Fg(sc)n(alability)h Fm(demands)f(that)h(a)f(program)h(b)q(e)f(reasonably)h 
(e\013ectiv)o(e)d(o)o(v)o(er)i(a)h(wide)e(range)-57 2226 y(of)g(n)o(um)o(b)q (er)e(of)i(pro)q(cessors.)28 b(The)18 b(scalabilit)o(y)e(of)i(parallel)f (algorithms,)g(and)h(soft)o(w)o(are)g(libraries)f(based)i(on)-57 2292 y(them,)11 b(o)o(v)o(er)h(a)h(wide)f(range)h(of)g(arc)o(hitectural)f (designs)h(and)g(n)o(um)o(b)q(ers)e(of)i(pro)q(cessors)h(will)d(lik)o(ely)f (require)i(that)-57 2358 y(the)18 b(fundamen)o(tal)f(gran)o(ularit)o(y)h(of)g (computation)g(b)q(e)g(adjustable)h(to)g(suit)f(the)g(particular)g (circumstances)-57 2424 y(in)c(whic)o(h)h(the)f(soft)o(w)o(are)h(ma)o(y)e (happ)q(en)j(to)f(execute.)k(Our)c(approac)o(h)h(to)f(this)g(problem)e(is)i (blo)q(c)o(k)f(algorithms)-57 2491 y(with)20 b(adjustable)h(blo)q(c)o(k)f (size.)32 b(In)20 b(man)o(y)f(cases,)i(ho)o(w)o(ev)o(er,)e(p)q(oly)o (algorithms)1418 2473 y Fe(2)1457 2491 y Fm(ma)o(y)g(b)q(e)i(required)e(to)h (deal)-57 2557 y(with)14 b(the)h(full)e(range)i(of)g(arc)o(hitectures)e(and)j (pro)q(cessor)f(m)o(ultiplic)o(it)o(y)c(lik)o(ely)h(to)j(b)q(e)f(a)o(v)m (ailable)g(in)g(the)h(future.)p -57 2600 816 2 v -1 2630 a Fj(2)18 2645 y Fn(In)i(a)f(p)q(oly)o(algorithm)d(the)18 b(actual)e(algorithm) e(used)k(dep)q(ends)g(on)f(the)g(computing)e(en)o(vironmen)o(t)h(and)h(the)g (input)g(data.)-57 2700 y(The)d(optimal)e(algorithm)f(in)i(a)h(particular)f (instance)i(is)f(automatically)d(selected)k(at)f(run)o(time.)939 2825 y Fm(33)p eop %%Page: 34 36 35 bop -57 125 a Fm(Scalable)23 b(parallel)g(arc)o(hitectures)f(of)i(the)f (future)g(are)h(lik)o(ely)d(to)i(b)q(e)h(based)g(on)g(a)g(distributed)f (memory)-57 191 y(arc)o(hitectural)18 b(paradigm.)30 b(In)19 b(the)g(longer)h(term,)e(progress)i(in)f(hardw)o(are)h(dev)o(elopmen)o(t,)c (op)q(erating)21 b(sys-)-57 257 y(tems,)f(languages,)i(compilers,)e(and)h (comm)o(unic)o(ations)e(ma)o(y)g(mak)o(e)f(it)j(p)q(ossible)f(for)h(users)g (to)g(view)f(suc)o(h)-57 323 y(distributed)15 b(arc)o(hitectures)g(\(without) 
h(signi\014can)o(t)g(loss)g(of)g(e\016ciency\))e(as)j(ha)o(ving)e(a)i(shared) f(memory)d(with)-57 390 y(a)21 b(global)g(address)g(space.)33 b(F)l(or)21 b(the)f(near)h(term,)e(ho)o(w)o(ev)o(er,)g(the)i(distributed)e (nature)i(of)g(the)f(underlying)-57 456 y(hardw)o(are)i(will)e(con)o(tin)o (ue)h(to)h(b)q(e)f(visible)f(at)i(the)f(programming)f(lev)o(el;)i(therefore,) f(e\016cien)o(t)f(pro)q(cedures)-57 522 y(for)g(explicit)e(comm)o(unication)f (will)j(con)o(tin)o(ue)f(to)h(b)q(e)g(necessary)l(.)33 b(Giv)o(en)19 b(this)h(fact,)h(standards)h(for)e(basic)-57 588 y(message)f(passing)h (\(send/receiv)o(e\),)e(as)i(w)o(ell)e(as)i(higher-lev)o(el)d(comm)o(unicati) o(on)g(constructs)j(\(global)g(sum-)-57 654 y(mation,)14 b(broadcast,)i (etc.\),)e(b)q(ecome)f(essen)o(tial)i(to)g(the)g(dev)o(elopmen)o(t)d(of)j (scalable)g(libraries)f(that)i(ha)o(v)o(e)e(an)o(y)-57 721 y(degree)f(of)h(p)q(ortabilit)o(y)l(.)20 b(In)14 b(addition)g(to)g (standardizing)g(general)g(comm)o(uni)o(cation)d(primitiv)o(es,)g(it)i(ma)o (y)f(also)-57 787 y(b)q(e)20 b(adv)m(an)o(tageous)h(to)e(establish)h (standards)g(for)g(problem-sp)q(eci\014c)e(constructs)h(in)g(commonly)e(o)q (ccurring)-57 853 y(areas)g(suc)o(h)f(as)h(linear)e(algebra.)-57 938 y(The)23 b(BLA)o(CS)f(\(Basic)g(Linear)h(Algebra)f(Comm)o(unication)e (Subprograms\))j([16)q(,)f(26)q(])g(is)g(a)i(pac)o(k)m(age)f(that)-57 1005 y(pro)o(vides)d(the)h(same)e(ease)i(of)g(use)g(and)g(p)q(ortabilit)o(y)f (for)h(MIMD)f(message-passing)h(linear)f(algebra)h(com-)-57 1071 y(m)o(unication)d(that)i(the)g(BLAS)f([17)q(,)g(18)q(,)g(45)q(])h(pro)o (vide)f(for)h(linear)f(algebra)h(computation.)32 b(Therefore,)19 b(w)o(e)-57 1137 y(recommend)14 b(that)j(future)g(soft)o(w)o(are)g(for)g (dense)g(linear)f(algebra)h(on)h(MIMD)e(platforms)g(consist)h(of)g(calls)g (to)-57 1203 y(the)j(BLAS)h(for)g(computation)e(and)j(calls)e(to)h(the)f(BLA) o(CS)g(for)h(comm)o(unic)o(ation.)32 b(Since)20 b(b)q(oth)h(pac)o(k)m(ages) -57 1269 
y(will)d(ha)o(v)o(e)f(b)q(een)i(optimized)d(for)j(a)g(particular)g (platform,)e(go)q(o)q(d)k(p)q(erformance)c(should)j(b)q(e)e(ac)o(hiev)o(ed)f (with)-57 1336 y(relativ)o(ely)d(little)g(e\013ort.)22 b(Also,)15 b(since)h(b)q(oth)h(pac)o(k)m(ages)f(will)f(b)q(e)i(a)o(v)m(ailable)e(on)i(a) f(wide)g(v)m(ariet)o(y)f(of)h(mac)o(hines,)-57 1402 y(co)q(de)g(mo)q (di\014cations)g(required)f(to)i(c)o(hange)f(platforms)f(should)i(b)q(e)g (minim)o(al.)-57 1571 y Fo(8.2)70 b(Alternativ)n(e)20 b(Approac)n(hes)-57 1688 y Fm(T)l(raditionally)l(,)13 b(large,)h(general-purp)q(ose)g (mathematical)d(soft)o(w)o(are)j(libraries)e(ha)o(v)o(e)h(required)g(users)h (to)g(write)-57 1754 y(their)21 b(o)o(wn)i(programs)f(that)h(call)e(library)h (routines)g(to)h(solv)o(e)e(sp)q(eci\014c)h(subproblems)f(that)i(arise)f (during)-57 1820 y(a)d(computation.)27 b(Adapted)19 b(to)f(a)h(shared-memory) e(parallel)g(en)o(vironmen)o(t,)f(this)i(con)o(v)o(en)o(tional)f(in)o (terface)-57 1887 y(still)f(o\013ers)i(some)e(p)q(oten)o(tial)h(for)g(hiding) g(underlying)f(complexit)o(y)l(.)k(F)l(or)e(example,)c(the)j(LAP)l(A)o(CK)g (pro)s(ject)-57 1953 y(incorp)q(orates)g(parallelism)d(in)i(the)g(Lev)o(el)f (3)h(BLAS,)g(where)g(it)g(is)g(not)g(directly)f(visible)g(to)h(the)g(user.) 
-57 2038 y(But)h(when)h(going)g(from)f(shared-memory)e(systems)i(to)h(the)f (more)f(readily)h(scalable)g(distributed)g(memory)-57 2104 y(systems,)c(the)i(complexit)o(y)c(of)16 b(the)e(distributed)g(data)i (structures)f(required)e(is)i(more)e(di\016cult)h(to)h(hide)f(from)-57 2171 y(the)j(user.)25 b(Not)17 b(only)g(m)o(ust)f(the)i(problem)d(decomp)q (osition)i(and)h(data)g(la)o(y)o(out)f(b)q(e)h(sp)q(eci\014ed,)e(but)i (di\013eren)o(t)-57 2237 y(phases)24 b(of)f(the)g(user's)f(problem)g(ma)o(y)f (require)h(transformations)h(b)q(et)o(w)o(een)f(di\013eren)o(t)g(distributed) g(data)-57 2303 y(structures.)-57 2388 y(These)15 b(de\014ciencies)d(in)i (the)h(con)o(v)o(en)o(tional)e(user)h(in)o(terface)g(ha)o(v)o(e)f(prompted)h (extensiv)o(e)f(discussion)h(of)h(alter-)-57 2454 y(nativ)o(e)g(approac)o (hes)i(for)g(scalable)f(parallel)f(soft)o(w)o(are)h(libraries)g(of)g(the)g (future.)21 b(P)o(ossibilities)15 b(include:)3 2568 y(1.)24 b(T)l(raditional)14 b(function)g(library)g(\(i.e.,)e(minim)n(um)e(p)q (ossible)15 b(c)o(hange)f(to)h(the)f(status)h(quo)f(in)g(going)h(from)65 2634 y(serial)f(to)g(parallel)g(en)o(vironmen)o(t\).)j(This)e(will)e(allo)o (w)h(one)g(to)h(protect)f(the)g(programming)e(in)o(v)o(estmen)o(t)65 2700 y(that)17 b(has)g(b)q(een)f(made.)939 2825 y(34)p eop %%Page: 35 37 36 bop 3 125 a Fm(2.)24 b(Reactiv)o(e)12 b(serv)o(ers)h(on)i(the)f(net)o(w)o (ork.)19 b(A)14 b(user)g(w)o(ould)g(b)q(e)g(able)g(to)g(send)h(a)f (computational)f(problem)g(to)65 191 y(a)j(serv)o(er)f(that)i(w)o(as)f(sp)q (ecialized)f(in)g(dealing)h(with)g(the)g(problem.)j(This)e(\014ts)f(w)o(ell)f (with)g(the)h(concepts)65 257 y(of)j(a)g(net)o(w)o(ork)o(ed,)e(heterogeneous) i(computing)f(en)o(vironmen)o(t)e(with)i(v)m(arious)h(sp)q(ecialized)f(hardw) o(are)65 323 y(resources)11 b(\(or)g(ev)o(en)f(the)g(heterogeneous)i (partitioning)f(of)g(a)g(single)g(homogeneous)g(parallel)f(mac)o(hine\).)3 431 y(3.)24 b(General)h(in)o(teractiv)o(e)f(en)o(vironmen)o(ts)f(lik)o(e)h 
(Matlab)i(or)h(Mathematica,)f(p)q(erhaps)h(with)f(\\exp)q(ert")65 497 y(driv)o(ers)19 b(\(i.e.,)h(kno)o(wledge-based)h(systems\).)33 b(With)20 b(the)h(gro)o(wing)g(p)q(opularit)o(y)f(of)h(the)g(man)o(y)e(in)o (te-)65 564 y(grated)f(pac)o(k)m(ages)g(based)f(on)h(this)f(idea,)g(this)g (approac)o(h)i(w)o(ould)e(pro)o(vide)f(an)i(in)o(teractiv)o(e,)d(graphical)65 630 y(in)o(terface)22 b(for)i(sp)q(ecifying)f(and)h(solving)g(scien)o (ti\014c)e(problems.)42 b(Both)24 b(the)f(algorithms)g(and)h(data)65 696 y(structures)c(are)g(hidden)f(from)g(the)g(user,)i(b)q(ecause)f(the)f (pac)o(k)m(age)h(itself)f(is)h(resp)q(onsible)f(for)h(storing)65 762 y(and)h(retrieving)d(the)i(problem)f(data)i(in)e(an)i(e\016cien)o(t,)d (distributed)i(manner.)31 b(In)20 b(a)h(heterogeneous)65 828 y(net)o(w)o(ork)o(ed)g(en)o(vironmen)o(t,)f(suc)o(h)i(in)o(terfaces)f(could)h (pro)o(vide)f(seamless)g(access)h(to)h(computational)65 895 y(engines)g(that)h(w)o(ould)g(b)q(e)f(in)o(v)o(ok)o(ed)f(selectiv)o(ely)f (for)i(di\013eren)o(t)g(parts)h(of)g(the)f(user's)h(computation)65 961 y(according)16 b(to)h(whic)o(h)f(mac)o(hine)e(is)i(most)f(appropriate)i (for)g(a)f(particular)g(subproblem.)3 1069 y(4.)24 b(Domain-sp)q(eci\014c)17 b(problem)f(solving)i(en)o(vironmen)o(ts,)e(suc)o(h)i(as)g(those)h(for)f (structural)g(analysis.)26 b(En-)65 1135 y(vironmen)o(ts)13 b(lik)o(e)h(Matlab)i(and)g(Mathematica)e(ha)o(v)o(e)g(pro)o(v)o(en)h(to)h(b)q (e)g(esp)q(ecially)e(attractiv)o(e)g(for)i(rapid)65 1201 y(protot)o(yping)k (of)g(new)g(algorithms)f(and)h(systems)f(that)h(ma)o(y)e(subsequen)o(tly)h(b) q(e)h(implem)o(en)o(te)o(d)d(in)j(a)65 1267 y(more)15 b(customized)f(manner)h (for)i(higher)f(p)q(erformance.)3 1375 y(5.)24 b(Reusable)c(templates)f (\(i.e.,)h(users)h(adapt)h(\\source)f(co)q(de")g(to)g(their)f(particular)h (applications\).)34 b(A)65 1441 y(template)14 b(is)i(a)h(description)f(of)h (a)f(general)g(algorithm)g(rather)g(than)h(the)f(executable)f(ob)s(ject)h(co) q(de)h(or)65 1507 
y(the)j(source)h(co)q(de)g(more)e(commonly)f(found)j(in)f (a)h(con)o(v)o(en)o(tional)f(soft)o(w)o(are)g(library)l(.)34 b(Nev)o(ertheless,)65 1574 y(although)23 b(templates)d(are)j(general)e (descriptions)h(of)g(k)o(ey)f(data)i(structures,)g(they)f(o\013er)h(whatev)o (er)65 1640 y(degree)16 b(of)g(customization)f(the)h(user)g(ma)o(y)f(desire.) -57 1769 y(No)o(v)o(el)h(user)i(in)o(terfaces)e(that)i(hide)f(the)h (complexit)o(y)c(of)k(scalable)f(parallelism)f(will)g(require)g(new)i (concepts)-57 1835 y(and)e(mec)o(hanism)o(s)c(for)k(represen)o(ting)e(scien)o (ti\014c)f(computational)h(problems)f(and)j(for)f(sp)q(ecifying)f(ho)o(w)i (those)-57 1901 y(problems)c(relate)h(to)h(eac)o(h)f(other.)20 b(V)l(ery)13 b(high)g(lev)o(el)e(languages)k(and)f(systems,)f(p)q(erhaps)h (graphically)f(based,)-57 1968 y(not)21 b(only)f(w)o(ould)h(facilitate)e(the) h(use)g(of)h(mathematical)c(soft)o(w)o(are)k(from)e(the)h(user's)g(p)q(oin)o (t)h(of)g(view,)f(but)-57 2034 y(also)g(w)o(ould)f(help)g(to)h(automate)f (the)g(determination)e(of)j(e\013ectiv)o(e)d(partitioning,)j(mapping,)f(gran) o(ularit)o(y)l(,)-57 2100 y(data)i(structures,)g(etc.)32 b(Ho)o(w)o(ev)o(er,) 19 b(new)h(concepts)h(in)e(problem)g(sp)q(eci\014cation)h(and)h(represen)o (tation)f(ma)o(y)-57 2166 y(also)d(require)e(new)h(mathematical)e(researc)o (h)h(on)i(the)f(analytic,)g(algebraic,)f(and)i(top)q(ological)g(prop)q (erties)g(of)-57 2232 y(problems)e(\(e.g.,)g(existence)f(and)j(uniqueness\).) 
-57 2320 y(W)l(e)23 b(ha)o(v)o(e)g(already)g(b)q(egun)h(w)o(ork)f(on)h(dev)o (eloping)e(suc)o(h)h(templates)f(for)h(sparse)h(matrix)e(computations.)-57 2386 y(F)l(uture)16 b(w)o(ork)g(will)f(fo)q(cus)i(on)g(extending)e(the)h(use) h(of)f(templates)f(to)h(dense)g(matrix)f(computations.)-57 2474 y(W)l(e)23 b(hop)q(e)i(the)e(insigh)o(t)h(w)o(e)f(gained)h(from)e(our)j (w)o(ork)e(will)g(in\015uence)g(future)g(dev)o(elop)q(ers)g(of)h(hardw)o (are,)-57 2540 y(compilers)d(and)k(systems)d(soft)o(w)o(are)i(so)g(that)g (they)g(pro)o(vide)e(to)q(ols)j(to)f(facilitate)e(dev)o(elopmen)o(t)f(of)j (high)-57 2606 y(qualit)o(y)15 b(p)q(ortable)i(n)o(umerical)c(soft)o(w)o (are.)939 2825 y(35)p eop %%Page: 36 38 37 bop -57 125 a Fm(The)16 b(EISP)l(A)o(CK,)f(LINP)l(A)o(CK,)g(and)i(LAP)l(A) o(CK)f(linear)f(algebra)i(libraries)e(are)h(in)g(the)g(public)f(domain,)g (and)-57 191 y(are)j(a)o(v)m(ailable)f(from)g Fg(netlib)p Fm(.)29 b(F)l(or)18 b(example,)e(for)i(more)f(information)g(on)h(ho)o(w)h(to)f (obtain)h(LAP)l(A)o(CK,)e(send)-57 257 y(the)f(follo)o(wing)g(one-line)f (email)g(message)g(to)i Fk(netlib@or)o(nl.)o(gov)o Fm(:)-57 345 y Fk(send)24 b(index)g(from)g(lapack)-57 433 y Fm(Information)e(for)i (EISP)l(A)o(CK)f(and)h(LINP)l(A)o(CK)f(can)g(b)q(e)h(similarly)c(obtained.)43 b(W)l(e)24 b(exp)q(ect)e(to)i(mak)o(e)e(a)-57 499 y(preliminary)13 b(v)o(ersion)j(of)h(the)f(ScaLAP)l(A)o(CK)g(library)f(a)o(v)m(ailable)h(from) f Fg(netlib)j Fm(in)e(1993.)-57 672 y Fo(Ac)n(kno)n(wledgmen)n(ts)-57 792 y Fm(This)i(researc)o(h)g(w)o(as)h(p)q(erformed)e(in)h(part)h(using)f (the)g(In)o(tel)f(T)l(ouc)o(hstone)i(Delta)f(System)f(op)q(erated)h(b)o(y)g (the)-57 858 y(California)23 b(Institute)f(of)h(T)l(ec)o(hnology)f(on)i(b)q (ehalf)f(of)g(the)f(Concurren)o(t)h(Sup)q(ercomputing)f(Consortium.)-57 924 y(Access)15 b(to)i(this)f(facilit)o(y)e(w)o(as)j(pro)o(vided)f(through)h (the)f(Cen)o(ter)f(for)i(Researc)o(h)e(on)i(P)o(arallel)e(Computing.)-57 1121 y Fh(References)-33 1258 y Fm([1])24 
b(E.)f(Anderson,)i(A.)d(Benzoni,)h (J.)g(J.)g(Dongarra,)j(S.)d(Moulton,)i(S.)d(Ostrouc)o(ho)o(v,)j(B.)d(T)l (ouranc)o(heau,)43 1324 y(and)d(R.)e(v)m(an)h(de)f(Geijn.)25 b(LAP)l(A)o(CK)17 b(for)h(distributed)f(memory)e(arc)o(hitectures:)23 b(Progress)18 b(rep)q(ort.)26 b(In)43 1390 y Fg(Par)n(al)r(lel)19 b(Pr)n(o)n(c)n(essing)e(for)g(Scienti\014c)j(Computing,)e(Fifth)f(SIAM)g (Confer)n(enc)n(e)p Fm(.)g(SIAM,)e(1991.)-33 1498 y([2])24 b(E.)f(Anderson)g(and)h(J.)f(Dongarra.)43 b(Results)23 b(from)e(the)i (initial)f(release)g(of)i(LAP)l(A)o(CK.)40 b(T)l(ec)o(hnical)43 1564 y(Rep)q(ort)15 b(LAP)l(A)o(CK)e(w)o(orking)h(note)g(16,)h(Computer)e (Science)g(Departmen)o(t,)f(Univ)o(ersit)o(y)g(of)i(Tennessee,)43 1630 y(Kno)o(xville,)g(TN,)i(1989.)-33 1738 y([3])24 b(E.)16 b(Anderson)g(and)g(J.)g(Dongarra.)22 b(Ev)m(aluating)17 b(blo)q(c)o(k)e (algorithm)g(v)m(arian)o(ts)h(in)f(LAP)l(A)o(CK.)20 b(T)l(ec)o(hnical)43 1804 y(Rep)q(ort)15 b(LAP)l(A)o(CK)e(w)o(orking)h(note)g(19,)h(Computer)e (Science)g(Departmen)o(t,)f(Univ)o(ersit)o(y)g(of)i(Tennessee,)43 1870 y(Kno)o(xville,)g(TN,)i(1990.)-33 1978 y([4])24 b(C.)17 b(C.)f(Ashcraft.)23 b(The)17 b(distributed)f(solution)h(of)g(linear)f (systems)f(using)j(the)e(torus)i(wrap)f(data)h(map-)43 2044 y(ping.)28 b(Engineering)19 b(Computing)f(and)h(Analysis)f(T)l(ec)o(hnical)f (Rep)q(ort)i(ECA-TR-147,)h(Bo)q(eing)e(Com-)43 2110 y(puter)e(Services,)f (1990.)-33 2218 y([5])24 b(C.)19 b(C.)g(Ashcraft.)29 b(A)18 b(taxonam)o(y)g(of)i(distributed)e(dense)h(LU)g(factorization)g(metho)q(ds.) 
28 b(Engineering)43 2284 y(Computing)15 b(and)h(Analysis)f(Tec)o(hnical)f (Rep)q(ort)i(ECA-TR-161,)h(Bo)q(eing)f(Computer)e(Services,)g(1991.)-33 2392 y([6])24 b(M.)19 b(Barnett,)g(D.)g(G.)g(P)o(a)o(yne,)g(and)h(R.)e(v)m (an)i(de)f(Geijn.)30 b(Broadcasting)20 b(on)f(meshes)f(with)h(w)o(orm-hole)43 2458 y(routing.)h(T)l(ec)o(hnical)13 b(rep)q(ort,)j(Departmen)o(t)d(of)i (Computer)f(Science,)g(Univ)o(ersit)o(y)e(of)j(T)l(exas)h(at)f(Austin,)43 2525 y(April)g(1993.)23 b(Submitted)14 b(to)j(Sup)q(ercomputing)e('93.)-33 2632 y([7])24 b(W.)d(S.)f(Brainerd,)g(C.)h(H.)e(Goldb)q(ergs,)k(and)e(J.)f (C.)g(Adams.)33 b Fg(Pr)n(o)n(gr)n(ammers)19 b(Guide)j(to)f(F)l(ortr)n(an)g (90)p Fm(.)43 2698 y(McGra)o(w-Hill,)14 b(New)i(York,)g(1990.)939 2825 y(36)p eop %%Page: 37 39 38 bop -33 125 a Fm([8])24 b(R.)d(P)l(.)f(Bren)o(t.)34 b(The)21 b(LINP)l(A)o(CK)f(b)q(enc)o(hmark)f(for)i(the)g(Fujitsu)g(AP)f(1000.)37 b(In)20 b Fg(Pr)n(o)n(c)n(e)n(e)n(dings)h(of)g(the)43 191 y(F)l(ourth)13 b(Symp)n(osium)f(on)h(the)g(F)l(r)n(ontiers)f(of)h(Massively)g(Par)n(al)r (lel)h(Computation)p Fm(,)e(pages)g(128{135.)i(IEEE)43 257 y(Computer)i(So)q(ciet)o(y)f(Press,)h(1992.)-33 365 y([9])24 b(R.)13 b(P)l(.)g(Bren)o(t.)j(The)e(LINP)l(A)o(CK)e(b)q(enc)o(hmark)g(on)i (the)f(AP)h(1000:)21 b(Preliminary)11 b(rep)q(ort.)17 b(In)c Fg(Pr)n(o)n(c)n(e)n(e)n(dings)43 431 y(of)18 b(the)f(2nd)h(CAP)g(Workshop)p Fm(,)d(NO)o(V)g(1991.)-57 539 y([10])24 b(J.)17 b(Choi,)h(J.)f(J.)g (Dongarra,)i(R.)e(P)o(ozo,)h(and)g(D.)f(W.)g(W)l(alk)o(er.)24 b(Scalapac)o(k:)g(A)17 b(scalable)g(linear)g(algebra)43 605 y(library)f(for)g(distributed)f(memory)e(concurren)o(t)j(computers.)j(In)d Fg(Pr)n(o)n(c)n(e)n(e)n(dings)g(of)i(the)f(F)l(ourth)g(Symp)n(o-)43 671 y(sium)i(on)g(the)g(F)l(r)n(ontiers)f(of)h(Massively)g(Par)n(al)r(lel)h (Computation)p Fm(,)e(pages)g(120{127.)i(IEEE)e(Computer)43 737 y(So)q(ciet)o(y)e(Press,)g(1992.)-57 845 y([11])24 b(J.)c(Choi,)h(J.)e (J.)h(Dongarra,)i(and)f(D.)f(W.)f(W)l(alk)o(er.)32 b(The)20 
b(design)g(of)g(scalable)g(soft)o(w)o(are)g(libraries)f(for)43 911 y(distributed)d(memory)d(concurren)o(t)i(computers.)20 b(In)15 b(J.)h(J.)g(Dongarra)h(and)g(B.)e(T)l(ouranc)o(heau,)i(editors,)43 978 y Fg(Envir)n(onments)d(and)g(T)l(o)n(ols)f(for)f(Par)n(al)r(lel)j (Scienti\014c)g(Computing)p Fm(.)d(Elsevier)e(Science)g(Publishers,)i(1993.) -57 1085 y([12])24 b(E.)16 b(Ch)o(u)g(and)g(A.)f(George.)20 b(Gaussian)d(elimination)d(with)h(partial)h(piv)o(oting)f(and)h(load)g (balancing)g(on)h(a)43 1152 y(m)o(ultipro)q(cessor.)j Fg(Par)n(al)r(lel)f (Computing)p Fm(,)d(5:65{74,)i(1987.)-57 1259 y([13])24 b(D.)18 b(E.)g(Culler,)f(A.)g(Dusseau,)i(S.)e(C.)h(Goldstein,)g(A.)f(Krishnam)o(urth) o(y)l(,)f(S.)h(Lumetta,)h(T.)f(v)o(on)h(Eic)o(k)o(en,)43 1325 y(and)j(K.)e(Y)l(elic)o(k.)30 b(In)o(tro)q(duction)20 b(to)g(Split-C:)f(V)l (ersion)h(0.9.)32 b(T)l(ec)o(hnical)19 b(rep)q(ort,)i(Computer)e(Science)43 1392 y(Division)d({)h(EECS,)f(Univ)o(ersit)o(y)e(of)i(California,)g(Berk)o (eley)l(,)d(CA)j(94720,)i(F)l(ebruary)e(1993.)-57 1499 y([14])24 b(J.)14 b(Demmel.)h(LAP)l(A)o(CK:)f(A)g(p)q(ortable)h(linear)f(algebra)h (library)f(for)h(sup)q(ercomputers.)j(In)c Fg(Pr)n(o)n(c)n(e)n(e)n(dings)43 1566 y(of)20 b(the)h(1989)f(IEEE)g(Contr)n(ol)g(Systems)h(So)n(ciety)f (Workshop)g(on)g(Computer-A)o(ide)n(d)h(Contr)n(ol)f(System)43 1632 y(Design)p Fm(,)d(Decem)o(b)q(er)d(1989.)-57 1739 y([15])24 b(J.)d(J.)g(Dongarra.)38 b(Increasing)21 b(the)g(p)q(erformance)f(of)h (mathematical)d(soft)o(w)o(are)k(through)g(high-lev)o(el)43 1806 y(mo)q(dularit)o(y)l(.)11 b(In)g Fg(Pr)n(o)n(c.)h(Sixth)h(Int.)g(Symp.)g (Comp.)f(Metho)n(ds)g(in)h(Eng.)h(&)f(Applie)n(d)g(Scienc)n(es,)i(V)l(ersail) r(les,)43 1872 y(F)l(r)n(anc)n(e)p Fm(,)h(pages)h(239{248.)h(North-Holland,)e (1984.)-57 1980 y([16])24 b(J.)d(J.)g(Dongarra.)39 b(LAP)l(A)o(CK)21 b(W)l(orking)h(Note)f(34:)32 b(W)l(orkshop)22 b(on)g(the)g(BLA)o(CS.)36 b(Computer)20 b(Sci-)43 2046 y(ence)14 b(Dept.)g(T)l(ec)o(hnical)g(Rep)q(ort) 
h(CS-91-134,)i(Univ)o(ersit)o(y)12 b(of)j(T)l(ennessee,)f(Kno)o(xville,)f (TN,)h(Ma)o(y)g(1991.)43 2112 y(\(LAP)l(A)o(CK)i(W)l(orking)g(Note)g(#34\).) -57 2220 y([17])24 b(J.)14 b(J.)g(Dongarra,)i(J.)e(Du)h(Croz,)f(S.)g (Hammarling,)e(and)j(I.)e(Du\013.)19 b(A)14 b(set)g(of)h(lev)o(el)d(3)j (basic)f(linear)g(algebra)43 2286 y(subprograms.)22 b Fg(A)o(CM)17 b(T)l(r)n(ansactions)g(on)h(Mathematic)n(al)g(Softwar)n(e)p Fm(,)e(16\(1\):1{17,)i(1990.)-57 2394 y([18])24 b(J.)19 b(J.)g(Dongarra,)i (J.)d(Du)i(Croz,)f(S.)g(Hammarling,)e(and)i(R.)g(Hanson.)30 b(An)19 b(extended)f(set)h(of)g(F)l(ortran)43 2460 y(basic)d(linear)e (algebra)i(subroutines.)k Fg(A)o(CM)c(T)l(r)n(ansactions)h(on)g(Mathematic)n (al)g(Softwar)n(e)p Fm(,)e(14\(1\):1{17,)43 2526 y(Marc)o(h)h(1988.)-57 2634 y([19])24 b(J.)14 b(J.)g(Dongarra,)i(I.)e(S.)g(Du\013,)h(D.)f(C.)h (Sorensen,)f(and)h(H.)f(A.)f(V)l(an)i(der)f(V)l(orst.)k Fg(Solving)g(Line)n (ar)d(Systems)43 2700 y(on)j(V)l(e)n(ctor)g(and)g(Shar)n(e)n(d)e(Memory)h (Computers)p Fm(.)k(SIAM)15 b(Publications,)g(Philadelphia,)g(P)l(A,)h(1991.) 
939 2825 y(37)p eop %%Page: 38 40 39 bop -57 125 a Fm([20])24 b(J.)19 b(J.)g(Dongarra)j(and)e(E.)f(Grosse.)32 b(Distribution)19 b(of)h(mathematical)c(soft)o(w)o(are)j(via)h(electronic)d (mail.)43 191 y Fg(Communic)n(ations)h(of)f(the)h(A)o(CM)p Fm(,)d(30\(5\):403{407)q(,)k(July)c(1987.)-57 299 y([21])24 b(J.)19 b(J.)g(Dongarra,)i(R.)d(Hemp)q(el,)f(A.)h(J.)h(G.)f(Hey)l(,)h(and)g (D.)g(W.)g(W)l(alk)o(er.)28 b(A)19 b(prop)q(osal)h(for)g(a)f(user-lev)o(el)43 365 y(message)j(passing)h(in)o(terface)e(in)g(a)i(distributed)e(memory)f(en)o (vironmen)o(t.)36 b(T)l(ec)o(hnical)20 b(Rep)q(ort)j(TM-)43 431 y(12231,)18 b(Oak)e(Ridge)g(National)g(Lab)q(oratory)l(,)i(F)l(ebruary)e (1993.)-57 539 y([22])24 b(J.)11 b(J.)f(Dongarra,)k(P)o(eter)c(Ma)o(y)o(es,)g (and)i(Giusepp)q(e)f(Radicati)f(di)h(Brozolo.)h(The)f(IBM)f(RISC)h (System/6000)43 605 y(and)17 b(linear)f(algebra)g(op)q(erations.)23 b Fg(Sup)n(er)n(c)n(omputer)p Fm(,)15 b(44\(VI)q(I)q(I-4\):15{30,)i(1991.)-57 713 y([23])24 b(J.)c(J.)h(Dongarra)h(and)f(S.)g(Ostrouc)o(ho)o(v.)34 b(LAP)l(A)o(CK)20 b(blo)q(c)o(k)g(factorization)g(algorithms)g(on)h(the)f(In) o(tel)43 779 y(iPSC/860.)35 b(T)l(ec)o(hnical)19 b(Rep)q(ort)i(CS-90-115,)j (Univ)o(ersit)o(y)18 b(of)i(T)l(ennessee)g(at)h(Kno)o(xville,)e(Computer)43 845 y(Science)c(Departmen)o(t,)f(Octob)q(er)i(1990.)-57 953 y([24])24 b(J.)14 b(J.)f(Dongarra,)j(R.)e(P)o(ozo,)g(and)h(D.)e(W.)h(W)l(alk) o(er.)j(An)c(ob)s(ject)h(orien)o(ted)f(design)h(for)g(high)g(p)q(erformance) 43 1019 y(linear)k(algebra)h(on)f(distributed)g(memory)d(arc)o(hitectures.)26 b(In)18 b Fg(Pr)n(o)n(c)n(e)n(e)n(dings)h(of)g(the)h(Obje)n(ct)g(Oriente)n(d) 43 1085 y(Numerics)e(Confer)n(enc)n(e)p Fm(,)f(1993.)-57 1193 y([25])24 b(J.)c(J.)h(Dongarra,)i(R.)d(v)m(an)h(de)f(Geijn,)h(and)g(D.)g(W.)f (W)l(alk)o(er.)33 b(A)20 b(lo)q(ok)h(at)g(scalable)g(dense)f(linear)g(al-)43 1259 y(gebra)f(libraries.)25 b(In)17 b(IEEE,)h(editor,)g Fg(Pr)n(o)n(c)n(e)n 
(e)n(dings)f(of)i(the)h(Sc)n(alable)h(High-Performanc)n(e)e(Computing)43 1326 y(Confer)n(enc)n(e)p Fm(,)e(pages)g(372{379.)h(IEEE)f(Publishers,)e (1992.)-57 1433 y([26])24 b(J.)12 b(J.)h(Dongarra)h(and)f(R.)f(A.)g(v)m(an)h (de)f(Geijn.)j(Tw)o(o-dimensional)d(basic)g(linear)g(algebra)h(comm)o (unication)43 1499 y(subprograms.)20 b(T)l(ec)o(hnical)14 b(Rep)q(ort)h(LAP)l (A)o(CK)g(w)o(orking)g(note)h(37,)f(Computer)f(Science)g(Departmen)o(t,)43 1566 y(Univ)o(ersit)o(y)g(of)i(Tennessee,)g(Kno)o(xville,)e(TN,)h(Octob)q(er) h(1991.)-57 1673 y([27])24 b(J.)c(J.)f(Dongarra)j(and)f(R.)e(A.)g(v)m(an)i (de)f(Geijn.)31 b(Reduction)20 b(to)g(condensed)g(form)f(for)h(the)g(eigen)o (v)m(alue)43 1740 y(problem)15 b(on)i(distributed)e(memory)f(arc)o (hitectures.)19 b Fg(Par)n(al)r(lel)g(Computing)p Fm(,)d(18:973{982,)j(1992.) -57 1847 y([28])24 b(J.)16 b(Du)h(Croz)f(and)h(M.)e(P)o(on)o(t.)21 b(The)16 b(dev)o(elopmen)o(t)d(of)k(a)f(\015oating-p)q(oin)o(t)i(v)m (alidation)e(pac)o(k)m(age.)21 b(In)16 b(M.)f(J.)43 1914 y(Irwin)e(and)i(R.)e (Stefanelli,)f(editors,)i Fg(Pr)n(o)n(c)n(e)n(e)n(dings)g(of)h(the)h(8th)f (Symp)n(osium)f(on)i(Computer)f(A)o(rithmetic,)43 1980 y(Como,)i(Italy,)h (May)e(19-21,)i(1987)p Fm(.)d(IEEE)i(Computer)e(So)q(ciet)o(y)g(Press,)h (1987.)-57 2087 y([29])24 b(T.)14 b(H.)e(Dunigan.)18 b(Comm)o(unication)11 b(p)q(erformance)i(of)h(the)f(In)o(tel)f(Touc)o(hstone)i(Delta)g(mesh.)h(T)l (ec)o(hnical)43 2154 y(Rep)q(ort)i(TM-11983,)h(Oak)e(Ridge)g(National)g(Lab)q (oratory)l(,)i(Jan)o(uary)f(1992.)-57 2261 y([30])24 b(A.)11 b(Edelman.)i(Large)g(dense)e(n)o(umerical)e(linear)i(algebra)i(in)e(1993:)21 b(The)12 b(parallel)f(computing)f(in\015uence.)43 2328 y Fg(International)19 b(Journal)e(Sup)n(er)n(c)n(omputer)g(Applic)n(ations)p Fm(,)g(1993.)22 b(Accepted)15 b(for)i(publication.)-57 2435 y([31])24 b(E.)16 b(W.)f(F)l(elten)g(and)h(S.)g(W.)f(Otto.)21 b(Coheren)o(t)16 b(parallel)f(C.)21 b(In)15 b(G.)h(C.)f(F)l(o)o(x,)g(editor,)h 
Fg(Pr)n(o)n(c)n(e)n(e)n(dings)g(of)g(the)43 2502 y(Thir)n(d)21 b(Confer)n(enc)n(e)i(on)f(Hyp)n(er)n(cub)n(e)g(Concurr)n(ent)g(Computers)g (and)g(Applic)n(ations)p Fm(,)h(pages)f(440{450.)43 2568 y(A)o(CM)16 b(Press,)g(1988.)939 2825 y(38)p eop %%Page: 39 41 40 bop -57 125 a Fm([32])24 b(G.)18 b(C.)g(F)l(o)o(x,)g(M.)f(A.)g(Johnson,)i (G.)f(A.)g(Lyzenga,)g(S.)g(W.)f(Otto,)i(J.)e(K.)h(Salmon,)f(and)i(D.)f(W.)f (W)l(alk)o(er.)43 191 y Fg(Solving)25 b(Pr)n(oblems)e(on)g(Concurr)n(ent)g (Pr)n(o)n(c)n(essors)p Fm(,)f(v)o(olume)e(1.)39 b(Pren)o(tice)21 b(Hall,)h(Englew)o(o)q(o)q(d)h(Cli\013s,)43 257 y(N.J.,)15 b(1988.)-57 364 y([33])24 b(K.)e(Galliv)m(an,)i(R.)e(Plemmons,)g(and)h(A.)f (Sameh.)39 b(P)o(arallel)22 b(algorithms)g(for)h(dense)g(linear)f(algebra)43 430 y(computations.)f Fg(SIAM)c(R)n(eview)p Fm(,)g(32\(1\):54{135,)i(1990.) -57 536 y([34])24 b(A.)16 b(Geist)g(and)g(M.)g(Heath.)21 b(Matrix)15 b(factorization)i(on)f(a)h(h)o(yp)q(ercub)q(e)f(m)o(ultipro)q(cessor.)j(In)d (M.)g(Heath,)43 602 y(editor,)e Fg(Hyp)n(er)n(cub)n(e)i(Multipr)n(o)n(c)n (essors,)f(1986)p Fm(,)f(pages)h(161{180,)j(Philadelphia,)13 b(P)l(A,)h(1986.)i(So)q(ciet)o(y)e(for)43 669 y(Industrial)i(and)h(Applied)e (Mathematics.)-57 775 y([35])24 b(A.)12 b(Geist)h(and)g(C.)g(Romine.)h(LU)f (factorization)g(algorithms)f(on)h(distributed-memory)d(m)o(ultipro)q(cessor) 43 841 y(arc)o(hitectures.)20 b Fg(SIAM)e(J.)e(Sci.)j(Statist.)f(Comput.)p Fm(,)d(9\(4\):639{649,)k(July)d(1988.)-57 948 y([36])24 b(G.)17 b(H.)f(Golub)h(and)h(C.)f(F.)f(V)l(an)h(Loan.)24 b Fg(Matrix)18 b(Computations)p Fm(.)23 b(The)17 b(Johns)h(Hopkins)f(Press,)g(Balti-)43 1014 y(more,)e(Maryland,)g(2nd)i(edition,)e(1989.)-57 1120 y([37])24 b(A.)16 b(Gupta)i(and)f(V.)f(Kumar.)22 b(On)17 b(the)f(scalabilit)o (y)f(of)i(FFT)g(on)g(parallel)f(computers.)22 b(In)16 b Fg(Pr)n(o)n(c)n(e)n (e)n(dings)43 1187 y(of)k(the)g(F)l(r)n(ontiers)g(90)f(Confer)n(enc)n(e)i(on) f(Massively)g(Par)n(al)r(lel)h(Computation)p Fm(.)e(IEEE)g(Computer)f(So)q 
(ci-)43 1253 y(et)o(y)h(Press,)i(1990.)35 b(Also)20 b(a)o(v)m(ailable)f(as)i (tec)o(hnical)d(rep)q(ort)j(TR)f(90-20)i(from)d(the)h(Computer)g(Science)43 1319 y(Departmen)o(t,)14 b(Univ)o(ersit)o(y)g(of)j(Minnesota,)f(Minneap)q (olis,)f(MN)h(55455.)-57 1425 y([38])24 b(R.)18 b(Harrington.)29 b(Origin)18 b(and)h(dev)o(elopmen)o(t)d(of)i(the)h(metho)q(d)f(of)g(momen)o (ts)e(for)j(\014eld)f(computation.)43 1492 y Fg(IEEE)g(A)o(ntennas)h(and)e (Pr)n(op)n(agation)g(Magazine)p Fm(,)f(June)g(1990.)-57 1598 y([39])24 b(B.)18 b(Hendric)o(kson)f(and)i(D.)f(W)l(om)o(ble.)25 b(The)19 b(torus-wrap)h(mapping)d(for)i(dense)f(matrix)e(computations)43 1664 y(on)g(massiv)o(ely)c(parallel)i(computers.)k(T)l(ec)o(hnical)13 b(Rep)q(ort)j(SAND92-0792,)g(Sandia)g(National)f(Lab)q(ora-)43 1730 y(tories,)h(April)f(1992.)-57 1837 y([40])24 b(J.)f(L.)h(Hess.)42 b(P)o(anel)23 b(metho)q(ds)g(in)g(computational)g(\015uid)g(dynamics.)41 b Fg(A)o(nnual)25 b(R)n(eviews)g(of)f(Fluid)43 1903 y(Me)n(chanics)p Fm(,)17 b(22:255{274,)h(1990.)-57 2009 y([41])24 b(J.)e(L.)g(Hess)g(and)h(M.) 
f(O.)f(Smith.)38 b(Calculation)22 b(of)g(p)q(oten)o(tial)g(\015o)o(ws)h(ab)q (out)g(arbitrary)g(b)q(o)q(dies.)39 b(In)43 2076 y(D.)14 b(K)q(\177)-26 b(uc)o(hemann,)13 b(editor,)g Fg(Pr)n(o)n(gr)n(ess)h(in)h(A)n(er)n(onautic)n (al)h(Scienc)n(es,)h(V)l(olume)f(8)p Fm(.)e(P)o(ergamon)f(Press,)h(1967.)-57 2182 y([42])24 b(High)16 b(P)o(erformance)f(F)l(ortran)i(F)l(orum.)k Fg(High)d(Performanc)n(e)g(F)l(ortr)n(an)e(L)n(anguage)j(Sp)n(e)n(ci\014c)n (ation,)f(V)l(er-)43 2248 y(sion)g(1.0)p Fm(,)d(Jan)o(uary)i(1993.)-57 2355 y([43])24 b(R.)18 b(W.)f(Ho)q(c)o(kney)g(and)i(C.)f(R.)f(Jesshop)q(e.)28 b Fg(Par)n(al)r(lel)20 b(Computers)p Fm(.)26 b(Adam)17 b(Hilger)g(Ltd.,)h (Bristol,)f(UK,)43 2421 y(1981.)-57 2527 y([44])24 b(W.)16 b(Kahan.)22 b(P)o(aranoia.)g(Av)m(ailable)15 b(from)h(netlib)f([20].)-57 2634 y([45])24 b(C.)c(La)o(wson,)j(R.)c(Hanson,)j(D.)e(Kincaid,)g(and)h(F.)e (Krogh.)34 b(Basic)20 b(linear)f(algebra)i(subprograms)g(for)43 2700 y(Fortran)c(usage.)22 b Fg(A)o(CM)17 b(T)l(r)n(ans.)g(Math.)g(Softw.)p Fm(,)f(5:308{323,)j(1979.)939 2825 y(39)p eop %%Page: 40 42 41 bop -57 125 a Fm([46])24 b(C.)19 b(Leiserson.)30 b(F)l(at)19 b(trees:)26 b(Univ)o(ersal)18 b(net)o(w)o(orks)g(for)i(hardw)o(are-e\016cien) o(t)d(sup)q(ercomputing.)29 b Fg(IEEE)43 191 y(T)l(r)n(ansactions)18 b(on)g(Computers)p Fm(,)d(C-34\(10\):892{9)q(01,)k(1985.)-57 299 y([47])24 b(W.)c(Lic)o(h)o(tenstein)e(and)i(S.)f(L.)h(Johnsson.)33 b(Blo)q(c)o(k-cyclic)16 b(dense)k(linear)f(algebra.)31 b(T)l(ec)o(hnical)19 b(Rep)q(ort)43 365 y(TR-04-92,)c(Harv)m(ard)d(Univ)o(ersit)o(y)l(,)e(Cen)o (ter)h(for)i(Researc)o(h)e(in)h(Computing)f(T)l(ec)o(hnology)l(,)h(Jan)o (uary)h(1992.)-57 473 y([48])24 b(M.)13 b(Lin,)g(D.)g(Du,)h(A.)e(E.)h (Klietz,)f(and)i(S.)f(Saro\013.)k(P)o(erformance)12 b(ev)m(aluation)h(of)g (the)g(CM-5)h(in)o(terconnec-)43 539 y(tion)i(net)o(w)o(ork.)j(T)l(ec)o (hnical)13 b(rep)q(ort,)j(Departmen)o(t)e(of)h(Computer)g(Science,)e(Univ)o (ersit)o(y)g(of)i(Minnesota,)43 605 y(1992.)-57 713 
y([49])24 b(R.)16 b(P)o(onn)o(usam)o(y)l(,)f(A.)h(Choudhary)l(,)h(and)g(G.)g(F)l(o)o (x.)22 b(Comm)o(unicati)o(on)15 b(o)o(v)o(erhead)g(on)j(CM-5:)k(An)16 b(exp)q(er-)43 779 y(imen)o(tal)f(p)q(erformance)i(ev)m(aluation.)25 b(In)17 b Fg(Pr)n(o)n(c)n(e)n(e)n(dings)g(of)i(the)g(F)l(ourth)f(Symp)n (osium)g(on)g(the)i(F)l(r)n(ontiers)43 845 y(of)e(Massively)f(Par)n(al)r(lel) i(Computation)p Fm(,)d(pages)h(108{115.)i(IEEE)d(Computer)f(So)q(ciet)o(y)h (Press,)g(1992.)-57 953 y([50])24 b(Y.)e(Saad)h(and)g(M.)f(H.)f(Sc)o(h)o (ultz.)38 b(P)o(arallel)22 b(direct)f(metho)q(ds)h(for)h(solving)f(banded)h (linear)f(systems.)43 1019 y(T)l(ec)o(hnical)11 b(Rep)q(ort)i(Y)l (ALEU/DCS/RR-387,)i(Departmen)o(t)c(of)i(Computer)e(Science,)g(Y)l(ale)h (Univ)o(ersit)o(y)l(,)43 1085 y(1985.)-57 1193 y([51])24 b(S.)17 b(R.)g(Seidel.)22 b(Broadcasting)c(on)g(linear)f(arra)o(ys)g(and)h(meshes.)23 b(T)l(ec)o(hnical)15 b(Rep)q(ort)j(TM-12356,)h(Oak)43 1259 y(Ridge)d(National)g(Lab)q(oratory)l(,)i(April)d(1993.)-57 1367 y([52])24 b(A.)d(Skjellum)e(and)k(A.)e(Leung.)38 b(LU)22 b(factorization)g(of)g(sparse,)h(unsymmetric,)c(Jacobian)k(matrices)43 1433 y(on)f(m)o(ulticom)o(puters.)34 b(In)22 b(D.)f(W.)g(W)l(alk)o(er)g(and)h (Q.)f(F.)g(Stout,)i(editors,)f Fg(Pr)n(o)n(c)n(e)n(e)n(dings)f(of)h(the)h (Fifth)43 1499 y(Distribute)n(d)18 b(Memory)e(Concurr)n(ent)i(Computing)g (Confer)n(enc)n(e)p Fm(,)f(pages)g(328{337.)h(IEEE)f(Press,)f(1990.)-57 1607 y([53])24 b(Thinking)16 b(Mac)o(hines)g(Corp)q(oration,)h(Cam)o(bridge,) e(MA.)20 b Fg(CM-5)d(T)l(e)n(chnic)n(al)i(Summary)p Fm(,)c(1991.)-57 1715 y([54])24 b(R.)14 b(A.)f(v)m(an)i(de)f(Geijn.)k(Massiv)o(ely)12 b(parallel)i(LINP)l(A)o(CK)f(b)q(enc)o(hmark)g(on)i(the)f(In)o(tel)e(Touc)o (hstone)j(Delta)43 1781 y(and)i(iPSC/860)h(systems.)i(Computer)15 b(Science)g(rep)q(ort)i(TR-91-28,)h(Univ.)c(of)j(T)l(exas,)f(1991.)-57 1889 y([55])24 b(E.)13 b(F.)f(V)l(an)h(de)g(V)l(elde.)h(Data)g (redistribution)e(and)h(concurrency)l(.)i Fg(Par)n(al)r(lel)h(Computing)p 
Fm(,)d(16,)h(Decem)o(b)q(er)43 1955 y(1990.)-57 2063 y([56])24 b(J.)14 b(J.)g(H.)f(W)l(ang.)19 b Fg(Gener)n(alize)n(d)d(Moment)g(Metho)n(ds) f(in)g(Ele)n(ctr)n(omagnetics)p Fm(.)20 b(John)15 b(Wiley)d(&)j(Sons,)g(New) 43 2129 y(Y)l(ork,)h(1991.)-57 2237 y([57])24 b(J.)19 b(Wilkinson)g(and)g(C.) g(Reinsc)o(h.)29 b Fg(Handb)n(o)n(ok)20 b(for)g(A)o(utomatic)g(Computation:) 28 b(V)l(olume)22 b(II)d(-)i(Line)n(ar)43 2303 y(A)o(lgebr)n(a)p Fm(.)h(Springer-V)l(erlag,)15 b(New)h(Y)l(ork,)g(1971.)939 2825 y(40)p eop %%Page: 41 43 42 bop 228 442 1470 2 v 228 1951 2 1509 v 250 509 a Fm(p)q(col=)14 b Ff(q)412 516 y Fe(0)250 575 y Fm(pro)o(w=)h Ff(p)431 582 y Fe(0)250 642 y Fm(do)i(k=)d(0)p Ff(;)8 b Fm(min)f(\()p Ff(M)598 649 y Fd(b)615 642 y Ff(;)h(N)676 649 y Fd(b)693 642 y Fm(\))j Fl(\000)g Fm(1)p 299 672 1245 2 v 299 1111 2 439 v 322 739 a(do)16 b(i=)e(0)p Ff(;)8 b(r)k Fl(\000)f Fm(1)370 806 y(if)16 b(\()p Ff(q)f Fm(=p)q(col\))i(\014nd)f(piv)o(ot)g(v)m(alue)g(and)h(lo)q (cation)370 872 y(broadcast)h(piv)o(ot)e(v)m(alue)g(and)g(lo)q(cation)h(to)g (all)e(pro)q(cesses)370 938 y(exc)o(hange)h(piv)o(ot)g(ro)o(ws)370 1004 y(if)g(\()p Ff(q)f Fm(=p)q(col\))i(divide)e(column)f(r)j(b)q(elo)o(w)f (diagonal)h(b)o(y)f(piv)o(ot)322 1070 y(end)g(do)p 1542 1111 V 299 1113 1245 2 v 299 1131 V 299 1437 2 307 v 322 1198 a(if)f(\()p Ff(p)g Fm(=pro)o(w\))h(then)370 1264 y(broadcast)i Ff(L)626 1271 y Fe(0)662 1264 y Fm(to)f(all)e(pro)q(cess)i(in)f(same)f(template)g(ro)o (w)370 1330 y(solv)o(e)h Ff(L)523 1337 y Fe(0)543 1330 y Ff(U)576 1337 y Fe(1)610 1330 y Fm(=)d Ff(C)322 1397 y Fm(end)j(if)p 1542 1437 V 299 1439 1245 2 v 299 1457 V 299 1697 2 241 v 322 1524 a(broadcast)h Ff(L)577 1531 y Fe(1)613 1524 y Fm(to)g(all)f(pro)q (cesses)g(in)g(same)g(template)e(ro)o(w)322 1591 y(broadcast)j Ff(U)577 1598 y Fe(1)613 1591 y Fm(to)g(all)f(pro)q(cesses)g(in)g(same)g (template)e(column)322 1657 y(up)q(date)j Ff(E)f Fl( )e Ff(E)g Fl(\000)d Ff(L)735 1664 y Fe(1)755 1657 y Ff(U)788 1664 y Fe(1)p 1542 1697 V 
299 1699 1245 2 v 299 1778 a Fm(p)q(col=)j(\(p)q(col)d(+)g(1\))j (mo)q(d)g Ff(Q)299 1844 y Fm(pro)o(w=)g(\(pro)o(w)e(+)f(1\))k(mo)q(d)e Ff(P)250 1910 y Fm(end)k(do)p 1696 1951 2 1509 v 228 1953 1470 2 v -57 2060 a(Figure)f(15:)21 b(Pseudo)q(co)q(de)d(for)e(the)g(basic)g (parallel)f(blo)q(c)o(k-partitioned)h(LU)g(factorization)g(algorithm.)k(This) -57 2126 y(co)q(de)13 b(is)g(executed)e(b)o(y)h(eac)o(h)g(pro)q(cess.)21 b(The)13 b(\014rst)g(b)q(o)o(x)g(inside)f(the)g Ff(k)j Fm(lo)q(op)f(factors)f (the)g Ff(k)r Fm(th)f(column)g(of)h(blo)q(c)o(ks.)-57 2193 y(The)i(second)h(b)q(o)o(x)g(solv)o(es)f(a)g(lo)o(w)o(er)g(triangular)h (system)e(to)i(ev)m(aluate)f(the)g Ff(k)r Fm(th)g(ro)o(w)h(of)g(blo)q(c)o(ks) f(of)h Ff(U)5 b Fm(,)15 b(and)h(the)-57 2259 y(third)i(b)q(o)o(x)g(up)q (dates)h(the)f(trailing)g(submatrix.)25 b(The)18 b(template)f(o\013set)h(is)g (giv)o(en)g(b)o(y)f(\()p Ff(p)1593 2266 y Fe(0)1613 2259 y Ff(;)8 b(q)1657 2266 y Fe(0)1677 2259 y Fm(\),)18 b(and)g(\()p Ff(p;)8 b(q)r Fm(\))18 b(is)-57 2325 y(p)q(osition)f(of)f(a)h(pro)q(cess)g (in)f(the)g(template.)939 2825 y(41)p eop %%Page: 42 44 43 bop 689 1209 a Fm(\(a\))16 b(Broadcast)h(along)g(ro)o(ws.)649 1885 y(\(b\))f(Broadcast)h(along)g(columns.)-57 1923 y @beginspecial @setspecial %%BeginDocument: broadcast.ps /arrowdict 13 dict def % Local storage for the procedure % ``arrow.'' /arrow % The procedure ``arrow'' adds an { arrowdict begin % arrow shape to the current path. 
/headlength exch def % It takes seven arguments: the x /halfheadthickness exch 2 div def % and y coordinates of the tail /halfthickness exch 2 div def % (imagine that a line has been /tipy exch def /tipx exch def % drawn down the center of the /taily exch def /tailx exch def % arrow from the tip to the tail, % then x and y lie on this line), % the x and y coordinates of the % tip of the arrow, the thickness % of the arrow in the tail % portion, the thickness of the % arrow at the widest part of the % arrowhead and the length of the % arrowhead. /dx tipx tailx sub def % Compute the differences in x and /dy tipy taily sub def % y for the tip and tail. These /arrowlength dx dx mul dy dy mul add % will be used to compute the sqrt def % length of the arrow and to /angle dy dx atan def % compute the angle of direction % that the arrow is facing with % respect to the current user % coordinate system origin. /base arrowlength headlength sub def % Compute where the base of the % arrowhead will be. /savematrix matrix currentmatrix def % Save the current user coordinate % system. We are using the same % strategy to localize the effect % of transformations as was used % in the program to draw an % ellipse. tailx taily translate % Translate to the starting point % of the tail. angle rotate % Rotate the x-axis to correspond % with the center line of the % arrow. 0 halfthickness neg moveto % Add the arrow shape to the % current path. base halfthickness neg lineto base halfheadthickness neg lineto arrowlength 0 lineto base halfheadthickness lineto base halfthickness lineto 0 halfthickness lineto closepath savematrix setmatrix % Restore the current user % coordinate system. 
end } def /Box { /height exch def /length exch def length 0 rlineto 0 height rlineto length neg 0 rlineto closepath } def /Grid { /ny exch def /nx exch def /dely exch def /delx exch def /leny { ny dely mul} def /lenx { nx delx mul} def currentpoint /ypos exch def /xpos exch def /y ypos def /x xpos def 0 1 ny { pop x y moveto lenx 0 rlineto stroke /y y dely add def} for /y ypos def /x xpos def 0 1 nx { pop x y moveto 0 leny rlineto stroke /x x delx add def} for } def /P 4 def /Q 6 def /Del 28.36 def 5.7 72 mul Q Del 2 mul mul 70 add sub 2 div 30 translate 0 Del P mul 40 add translate Del 2 mul 0 moveto Del Del P mul Box gsave 0.85 setgray fill grestore 0 0 moveto Del Del Q P Grid /Mid Del 2.5 mul def /Offset 7 def /Helvetica findfont 18 scalefont setfont Mid (A) stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show Mid (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show Mid (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show Mid (D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show /Twid (A) stringwidth pop 2 div def Mid Twid add 3 add P 0.5 sub Del mul Mid Twid add 18 add P 0.5 sub Del mul 2 7 6 arrow Mid Twid sub 3 sub P 0.5 sub Del mul Mid Twid sub 18 sub P 0.5 sub Del mul 2 7 6 arrow fill Mid Twid add 3 add P 1.5 sub Del mul Mid Twid add 18 add P 1.5 sub Del mul 2 7 6 arrow Mid Twid sub 3 sub P 1.5 sub Del mul Mid Twid sub 18 sub P 1.5 sub Del mul 2 7 6 arrow fill Mid Twid add 3 add P 2.5 sub Del mul Mid Twid add 18 add P 2.5 sub Del mul 2 7 6 arrow Mid Twid sub 3 sub P 2.5 sub Del mul Mid Twid sub 18 sub P 2.5 sub Del mul 2 7 6 arrow fill Mid Twid add 3 add P 3.5 sub Del mul Mid Twid add 18 add P 3.5 sub Del mul 2 7 6 arrow Mid Twid sub 3 sub P 3.5 sub Del mul Mid Twid sub 18 sub P 3.5 sub Del mul 2 7 6 arrow fill Q Del mul 15 add P 2 div Del mul Q Del mul 55 add P 2 div Del mul 8 16 12 arrow stroke 70 Del Q mul add 0 translate 0 0 moveto Del Del Q P Grid 0.5 Del mul (A) 
stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show 1.5 Del mul (A) stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show 2.5 Del mul (A) stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show 3.5 Del mul (A) stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show 4.5 Del mul (A) stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show 5.5 Del mul (A) stringwidth pop 2 div sub P 0.5 sub Del mul Offset sub moveto (A) show 0.5 Del mul (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show 1.5 Del mul (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show 2.5 Del mul (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show 3.5 Del mul (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show 4.5 Del mul (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show 5.5 Del mul (B) stringwidth pop 2 div sub P 1.5 sub Del mul Offset sub moveto (B) show 0.5 Del mul (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show 1.5 Del mul (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show 2.5 Del mul (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show 3.5 Del mul (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show 4.5 Del mul (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show 5.5 Del mul (C) stringwidth pop 2 div sub P 2.5 sub Del mul Offset sub moveto (C) show 0.5 Del mul (D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show 1.5 Del mul (D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show 2.5 Del mul (D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show 3.5 Del mul (D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show 4.5 Del mul (D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show 5.5 Del mul 
(D) stringwidth pop 2 div sub P 3.5 sub Del mul Offset sub moveto (D) show 70 Del Q mul add neg Del P mul 40 add neg translate 0 Del 2 mul moveto Del Q mul Del Box gsave 0.85 setgray fill grestore 0 0 moveto Del Del Q P Grid /Mid Del 2.5 mul def /Offset 7 def /Helvetica findfont 18 scalefont setfont Del 0.5 mul (R) stringwidth pop 2 div sub Mid Offset sub moveto (R) show Del 1.5 mul (S) stringwidth pop 2 div sub Mid Offset sub moveto (S) show Del 2.5 mul (T) stringwidth pop 2 div sub Mid Offset sub moveto (T) show Del 3.5 mul (U) stringwidth pop 2 div sub Mid Offset sub moveto (U) show Del 4.5 mul (V) stringwidth pop 2 div sub Mid Offset sub moveto (V) show Del 5.5 mul (W) stringwidth pop 2 div sub Mid Offset sub moveto (W) show Del 0.5 mul 3 Del mul 4 sub Del 0.5 mul 3 Del mul 11 add 2 7 6 arrow fill Del 0.5 mul 2 Del mul 4 add Del 0.5 mul 2 Del mul 11 sub 2 7 6 arrow fill Del 1.5 mul 3 Del mul 4 sub Del 1.5 mul 3 Del mul 11 add 2 7 6 arrow fill Del 1.5 mul 2 Del mul 4 add Del 1.5 mul 2 Del mul 11 sub 2 7 6 arrow fill Del 2.5 mul 3 Del mul 4 sub Del 2.5 mul 3 Del mul 11 add 2 7 6 arrow fill Del 2.5 mul 2 Del mul 4 add Del 2.5 mul 2 Del mul 11 sub 2 7 6 arrow fill Del 3.5 mul 3 Del mul 4 sub Del 3.5 mul 3 Del mul 11 add 2 7 6 arrow fill Del 3.5 mul 2 Del mul 4 add Del 3.5 mul 2 Del mul 11 sub 2 7 6 arrow fill Del 4.5 mul 3 Del mul 4 sub Del 4.5 mul 3 Del mul 11 add 2 7 6 arrow fill Del 4.5 mul 2 Del mul 4 add Del 4.5 mul 2 Del mul 11 sub 2 7 6 arrow fill Del 5.5 mul 3 Del mul 4 sub Del 5.5 mul 3 Del mul 11 add 2 7 6 arrow fill Del 5.5 mul 2 Del mul 4 add Del 5.5 mul 2 Del mul 11 sub 2 7 6 arrow fill Q Del mul 15 add P 2 div Del mul Q Del mul 55 add P 2 div Del mul 8 16 12 arrow stroke 70 Del Q mul add 0 translate 0 0 moveto Del Del Q P Grid 0 1 P 1 sub { 0.5 add /mm exch def 0.5 Del mul (R) stringwidth pop 2 div sub P mm sub Del mul Offset sub moveto (R) show } for 0 1 P 1 sub { 0.5 add /mm exch def 1.5 Del mul (S) stringwidth pop 2 div sub P mm sub Del mul Offset 
sub moveto (S) show } for 0 1 P 1 sub { 0.5 add /mm exch def 2.5 Del mul (T) stringwidth pop 2 div sub P mm sub Del mul Offset sub moveto (T) show } for 0 1 P 1 sub { 0.5 add /mm exch def 3.5 Del mul (U) stringwidth pop 2 div sub P mm sub Del mul Offset sub moveto (U) show } for 0 1 P 1 sub { 0.5 add /mm exch def 4.5 Del mul (V) stringwidth pop 2 div sub P mm sub Del mul Offset sub moveto (V) show } for 0 1 P 1 sub { 0.5 add /mm exch def 5.5 Del mul (W) stringwidth pop 2 div sub P mm sub Del mul Offset sub moveto (W) show } for %%EndDocument @endspecial 95 x(Figure)j(16:)32 b(Sc)o(hematic)18 b(represen)o(tation)i(of) h(broadcast)h(along)g(ro)o(ws)f(and)h(columns)d(of)i(a)h(4)14 b Fl(\002)g Fm(6)21 b(pro)q(cess)-57 2084 y(template.)e(In)14 b(\(a\),)h(eac)o(h)f(shaded)h(pro)q(cess)h(broadcasts)g(to)f(the)f(pro)q (cesses)i(in)e(the)g(same)g(ro)o(w)h(of)g(the)f(pro)q(cess)-57 2150 y(template.)31 b(In)20 b(\(b\),)g(eac)o(h)g(shaded)g(pro)q(cess)h (broadcasts)h(to)e(the)g(pro)q(cesses)h(in)e(the)h(same)f(column)g(of)h(the) -57 2217 y(pro)q(cess)d(template.)939 2825 y(42)p eop %%Page: 43 45 44 bop -57 1691 a @beginspecial @setspecial %%BeginDocument: choi1.ps /dwdict 100 dict def dwdict begin /PlotAxes % ptsize.....point size of numbers on axes { % ticksize...length of tick marks /ytick exch def % xlen.......length of x axis /xtick exch def % ylen.......length of y axis /ystart exch def % xinc.......increment between ticks on x axis /xstart exch def % yinc.......increment between ticks on y axis /yinc exch def % xstart.....starting value on x axis /xinc exch def % ystart.....starting value on y axis /ylen exch def % xtick......number of ticks on x axis /xlen exch def % ytick......number of ticks on y axis /ticksize exch def /ptsize exch def /yflag exch def % yflag......if yflag=1 truncate numbers to integers /xflag exch def % xflag......if xflag=1 truncate numbers to integers /BoxFlag exch def % BoxFlag....if 1 then draw 4 axes, <0 ticks inside. 
% +/-2 then don't label x ticks, % +/-3 then don't label y ticks. newpath xlen 0 moveto 0 0 lineto 0 ylen lineto stroke BoxFlag 0 ne {newpath xlen 0 moveto xlen ylen lineto 0 ylen lineto stroke} if /Times-Roman findfont ptsize scalefont setfont /xscale xlen xinc xtick 1 sub mul div def /yscale ylen yinc ytick 1 sub mul div def /str 10 string def /xpos 0 def /inc xlen xtick 1 sub div def 1 1 xtick { 1 sub xinc mul xstart add /val exch def newpath xpos 0 moveto 0 ticksize neg BoxFlag 0 lt {neg} if rlineto stroke BoxFlag 0 ne {newpath xpos ylen moveto 0 ticksize BoxFlag 0 lt {neg} if rlineto stroke} if BoxFlag 2 ne BoxFlag -2 ne and{ xflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop 2 div xpos exch sub ptsize ticksize add BoxFlag 0 lt {ticksize sub} if neg moveto show} if /xpos xpos inc add def } for /ypos 0 def /inc ylen ytick 1 sub div def 1 1 ytick { 1 sub yinc mul ystart add /val exch def newpath 0 ypos moveto ticksize neg BoxFlag 0 lt {neg} if 0 rlineto stroke BoxFlag 0 ne {newpath xlen ypos moveto ticksize BoxFlag 0 lt {neg} if 0 rlineto stroke} if BoxFlag 3 ne BoxFlag -3 ne and{ yflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop ticksize add BoxFlag 0 lt {ticksize sub} if ptsize 3 div add neg ypos ypos 0 eq {ptsize 4 div} {ptsize 3 div} ifelse sub moveto show} if /ypos ypos inc add def } for } def /PlotPoints % stack: PlotArray isymbol LogFlag LineFlag=> nothing { /LineFlag exch def % LineFlag<0 no line, 0 solid, >0 dashed lines /LogFlag exch def % LogFlag = 1 for log10x, 2 for log10y, 3 both, <0 ln /isymbol exch def % isymbol = 1.....open circle /PlotArray exch def % = 2.....plus (+) % = 3.....cross (x) % = 4.....open triangle % = 5.....filled circle % = 6.....filled triangle % = 7.....asterisk (*) % = 8.....square =9...filled square % PlotArray = [ [x0 y0] [x1 y1] [x2 y2]...[xn yn] ] LineFlag 1 eq {[1 1] 0 setdash} if LineFlag 2 eq {[2 2] 0 setdash} if LineFlag 3 eq {[3 3] 0 setdash} if LineFlag 4 eq {[4 4] 0 
setdash} if LineFlag 5 eq {[5 5] 0 setdash} if LineFlag 6 eq {[6 6] 0 setdash} if LineFlag 7 eq {[7 7] 0 setdash} if LineFlag 8 eq {[8 8] 0 setdash} if LineFlag 9 eq {[3 2 1 2] 0 setdash} if /icount 1 def PlotArray { /xypoint exch def xypoint aload pop /y exch def /x exch def LogFlag 1 eq LogFlag 3 eq or { /x x log def} if LogFlag 2 eq LogFlag 3 eq or { /y y log def} if LogFlag -1 eq LogFlag -3 eq or { /x x ln def} if LogFlag -2 eq LogFlag -3 eq or { /y y ln def} if /xv {x xstart sub xscale mul} def /yv {y ystart sub yscale mul} def LineFlag 0 ge icount 1 gt and { newpath xl yl moveto xv yv lineto stroke} if /icount icount 1 add def /xl xv def /yl yv def } forall [] 0 setdash PlotArray { /xypoint exch def xypoint aload pop /y exch def /x exch def LogFlag 1 eq LogFlag 3 eq or { /x x log def} if LogFlag 2 eq LogFlag 3 eq or { /y y log def} if LogFlag -1 eq LogFlag -3 eq or { /x x ln def} if LogFlag -2 eq LogFlag -3 eq or { /y y ln def} if /xv {x xstart sub xscale mul} def /yv {y ystart sub yscale mul} def xv yv newpath isymbol 1 eq { ticksize 2 div Circle gsave 1.0 setgray fill grestore stroke } if isymbol 2 eq { Plus } if isymbol 3 eq { Cross } if isymbol 4 eq { Triangle gsave 1.0 setgray fill grestore stroke } if isymbol 5 eq { ticksize 2 div Circle fill } if isymbol 6 eq { Triangle fill } if isymbol 7 eq { 2 copy Plus Cross } if isymbol 8 eq { Square gsave 1.0 setgray fill grestore stroke } if isymbol 9 eq { Square fill } if } forall } def /Plus % stack: xcen ycen => ??? Draws + centered on (xcen,ycen) { 2 copy newpath moveto ticksize 2 div 0 rmoveto ticksize neg 0 rlineto stroke newpath moveto 0 ticksize 2 div rmoveto 0 ticksize neg rlineto stroke } def /Square % stack: xcen ycen => ??? Draws square centered on (xcen,ycen) { moveto ticksize 2 div dup rmoveto ticksize neg 0 rlineto 0 ticksize neg rlineto ticksize 0 rlineto closepath } def /Cross % stack: xcen ycen => ??? 
Draws x centered on (xcen,ycen) { /tinc ticksize 2 sqrt div 2 div def 2 copy newpath moveto tinc tinc rmoveto tinc 2 mul neg dup rlineto stroke newpath moveto tinc neg tinc rmoveto tinc 2 mul dup neg rlineto stroke } def /Triangle % stack: xcen ycen => ??? Draws triangle centered on (xcen ycen) { /tinc ticksize 2 div def /cdis 30 cos tinc mul def /sdis 30 sin tinc mul def moveto cdis neg sdis neg rmoveto cdis 2 mul 0 rlineto cdis neg sdis tinc add rlineto closepath } def /Circle % stack: xcen ycen radius => ??? Draws circle centered on (xcen ycen) { 0 360 arc } def 72 5.7 mul 260 sub 2 div 25 translate 0.5 setlinewidth 1 1 0 10.0 5.0 260.0 200.0 100 5 0 0 6 9 PlotAxes [ [ 10 7.8802 ] [ 11 8.3266 ] [ 12 10.4285 ] [ 13 10.7145 ] [ 14 12.7985 ] [ 15 12.9162 ] [ 16 14.7100 ] [ 17 14.4500 ] [ 18 16.3499 ] [ 19 15.9735 ] [ 20 17.7070 ] [ 21 17.5581 ] [ 22 19.0688 ] [ 25 22.7141 ] [ 30 26.1274 ] [ 40 29.7349 ] [ 50 29.4530 ] [ 60 31.8633 ] [ 80 33.3598 ] [ 100 34.3263 ] [ 120 35.1738 ] [ 125 35.1054 ] [ 126 35.2825 ] [ 127 33.9203 ] [ 129 33.8719 ] [ 130 33.7758 ] [ 135 34.2275 ] [ 140 34.4516 ] [ 150 34.9564 ] [ 160 35.0822 ] [ 180 35.4506 ] [ 200 35.6157 ] [ 250 36.0786 ] [ 300 36.0410 ] [ 350 36.3308 ] [ 400 36.1210 ] [ 450 36.2189 ] [ 500 36.7203 ] ] 0 0 0 PlotPoints [ [ 10 5.0276 ] [ 20 13.5112 ] [ 40 26.8750 ] [ 60 30.4801 ] [ 80 32.8443 ] [ 100 32.6842 ] [ 120 34.6017 ] [ 140 34.3580 ] [ 160 34.9687 ] [ 180 34.6021 ] [ 200 35.4632 ] [ 250 36.3307 ] [ 300 36.1390 ] [ 350 36.1199 ] [ 400 36.2650 ] [ 450 36.1900 ] [ 500 36.5075 ] ] 0 0 3 PlotPoints [ [ 10 2.2604 ] [ 20 11.4058 ] [ 40 20.9068 ] [ 60 28.0906 ] [ 80 30.1549 ] [ 100 31.2846 ] [ 120 32.6557 ] [ 140 33.1458 ] [ 160 33.7460 ] [ 180 34.3786 ] [ 200 34.7263 ] [ 250 35.0089 ] [ 300 34.9900 ] [ 350 35.4920 ] [ 400 35.7016 ] [ 450 35.9241 ] [ 500 36.4087 ] ] 0 0 9 PlotPoints /Symbol findfont 10 scalefont setfont 120 225 140 sub moveto [] 0 setdash 144 225 140 sub lineto stroke (A) dup stringwidth pop 160 exch sub 
225 143 sub moveto show /Symbol findfont 7 scalefont setfont (\050 M \264 M \051) dup stringwidth pop 194 exch sub 225 143 sub moveto show /Symbol findfont 10 scalefont setfont 196 225 143 sub moveto (\327 B) show /Symbol findfont 7 scalefont setfont 210 225 143 sub moveto (\050 M \264 M \051) show /Symbol findfont 10 scalefont setfont 120 225 154 sub moveto [3 3] 0 setdash 144 225 154 sub lineto stroke (A) dup stringwidth pop 160 exch sub 225 157 sub moveto show /Symbol findfont 7 scalefont setfont (\050 M \264 M/2 \051) dup stringwidth pop 194 exch sub 225 157 sub moveto show /Symbol findfont 10 scalefont setfont 196 225 157 sub moveto (\327 B) show /Symbol findfont 7 scalefont setfont 210 225 157 sub moveto (\050 M/2 \264 M \051) show /Symbol findfont 10 scalefont setfont 120 225 168 sub moveto [3 2 1 2] 0 setdash 144 225 168 sub lineto stroke (A) dup stringwidth pop 160 exch sub 225 171 sub moveto show /Symbol findfont 7 scalefont setfont (\050 M/2 \264 M \051) dup stringwidth pop 194 exch sub 225 171 sub moveto show /Symbol findfont 10 scalefont setfont 196 225 171 sub moveto (\327 B) show /Symbol findfont 7 scalefont setfont 210 225 171 sub moveto (\050 M \264 M/2 \051) show /Times-Roman findfont 12 scalefont setfont (Matrix Size, M) dup stringwidth pop 260.0 exch sub 2 div -30 moveto show 90 rotate (Mflops) dup stringwidth pop 200.0 exch sub 2 div 25 moveto show end %%EndDocument @endspecial 108 x Fm(Figure)24 b(17:)38 b(P)o(erformance)23 b(of)h(the)g(assem)o(bly-co)q(ded)g(Lev)o(el)f(3)i(BLAS)f(matrix)e(m)o (ultiplic)o(ation)g(routine)-57 1865 y(DGEMM)d(on)h(one)g(i860)g(pro)q (cessor)g(of)g(the)f(In)o(tel)f(Delta)h(system.)30 b(Results)19 b(for)g(square)h(and)g(rectangular)-57 1932 y(matrices)e(are)j(sho)o(wn.)33 b(Note)20 b(that)h(the)f(p)q(eak)g(p)q(erformance)g(of)g(ab)q(out)i(35)f (M\015ops)f(is)g(attained)h(only)f(for)-57 1998 y(matrices)d(whose)j (smallest)e(dimension)f(exceeds)h(100.)31 b(Th)o(us,)20 b(p)q(erformance)e 
(is)h(impro)o(v)o(ed)e(if)h(a)i(few)f(large)-57 2064 y(matrices)14 b(are)j(m)o(ultipli)o(ed)c(b)o(y)j(eac)o(h)g(pro)q(cess,)g(rather)h(than)f (man)o(y)f(small)g(ones.)939 2825 y(43)p eop %%Page: 44 46 45 bop 228 308 1470 2 v 228 813 2 505 v 250 375 a Fm(do)17 b Ff(b)d Fm(=)g(0)p Ff(;)8 b(b)472 382 y Fe(max)550 375 y Fl(\000)j Fm(1)299 441 y(do)17 b Ff(d)d Fm(=)g(0)p Ff(;)8 b(d)529 448 y Fe(max)608 441 y Fl(\000)j Fm(1)348 507 y(do)17 b Ff(i)c Fm(=)h(0)p Ff(;)8 b(r)13 b Fl(\000)d Fm(1)397 574 y(do)17 b Ff(j)f Fm(=)e(0)p Ff(;)8 b(r)13 b Fl(\000)e Fm(1)446 640 y(do)16 b Ff(k)g Fm(=)e(0)p Ff(;)8 b(r)13 b Fl(\000)d Fm(1)494 706 y Ff(E)s Fm(\()p Ff(b;)e(d)p Fm(;)g Ff(i;)g(j)s Fm(\))13 b(=)h Ff(E)s Fm(\()p Ff(b;)8 b(d)p Fm(;)g Ff(i;)g(j)s Fm(\))i Fl(\000)h Ff(L)1110 713 y Fe(1)1130 706 y Fm(\()p Ff(b;)d(d)p Fm(;)g Ff(i;)g(k)r Fm(\))g Ff(U)1365 713 y Fe(1)1385 706 y Fm(\()p Ff(b;)g(d)p Fm(;)g Ff(k)r(;)g(j)s Fm(\))250 772 y(end)17 b(all)e(do)i(lo)q (ops)p 1696 813 V 228 815 1470 2 v 643 893 a(\(a\))g(Blo)q(c)o(k-blo)q(c)o(k) d(m)o(ultiplication)p 228 936 V 228 1441 2 505 v 250 1003 a(do)j Ff(k)f Fm(=)e(0)p Ff(;)8 b(r)13 b Fl(\000)d Fm(1)299 1070 y(do)17 b Ff(b)d Fm(=)f(0)p Ff(;)8 b(b)520 1077 y Fe(max)599 1070 y Fl(\000)j Fm(1)348 1136 y(do)17 b Ff(j)g Fm(=)c(0)p Ff(;)8 b(r)13 b Fl(\000)e Fm(1)397 1202 y(do)17 b Ff(d)d Fm(=)g(0)p Ff(;)8 b(d)627 1209 y Fe(max)706 1202 y Fl(\000)j Fm(1)446 1268 y(do)16 b Ff(i)e Fm(=)g(0)p Ff(;)8 b(r)k Fl(\000)f Fm(1)494 1334 y Ff(E)s Fm(\()p Ff(b;)d(d)p Fm(;)g Ff(i;)g(j)s Fm(\))13 b(=)h Ff(E)s Fm(\()p Ff(b;)8 b(d)p Fm(;)g Ff(i;)g(j)s Fm(\))i Fl(\000)h Ff(L)1110 1341 y Fe(1)1130 1334 y Fm(\()p Ff(b;)d(d)p Fm(;)g Ff(i;)g(k)r Fm(\))g Ff(U)1365 1341 y Fe(1)1385 1334 y Fm(\()p Ff(b;)g(d)p Fm(;)g Ff(k)r(;)g(j)s Fm(\))250 1401 y(end)17 b(all)e(do)i(lo)q(ops)p 1696 1441 V 228 1443 1470 2 v 590 1522 a(\(b\))g(In)o(termediate)c(form)i(of)i(algorithm)p 228 1565 V 228 1937 2 373 v 250 1632 a(do)g Ff(k)f Fm(=)e(0)p Ff(;)8 b(r)13 b Fl(\000)d Fm(1)299 1698 
y(do)17 b Ff(x)d Fm(=)f(0)p Ff(;)8 b(r)q(b)550 1705 y Fe(max)629 1698 y Fl(\000)j Fm(1)348 1764 y(do)17 b Ff(y)e Fm(=)f(0)p Ff(;)8 b(r)q(d)601 1771 y Fe(max)681 1764 y Fl(\000)j Fm(1)397 1830 y Ff(E)s Fm(\()p Ff(x;)d(y)r Fm(\))13 b(=)h Ff(E)s Fm(\()p Ff(x;)8 b(y)r Fm(\))i Fl(\000)h Ff(L)861 1837 y Fe(1)880 1830 y Fm(\()p Ff(x;)d(k)r Fm(\))g Ff(U)1036 1837 y Fe(1)1056 1830 y Fm(\()p Ff(k)r(;)g(y)r Fm(\))250 1897 y(end)17 b(all)e(do)i(lo)q(ops)p 1696 1937 V 228 1939 1470 2 v 575 2018 a(\(c\))f(Outer)g(pro)q(duct)h(form)e(of)h (algorithm)-57 2125 y(Figure)c(18:)19 b(Pseudo)q(co)q(de)14 b(for)e(di\013eren)o(t)f(v)o(ersions)h(of)g(the)g(rank-)p Ff(r)i Fm(up)q(date,)f Ff(E)k Fl( )c Ff(E)5 b Fl(\000)r Ff(L)1560 2132 y Fe(1)1581 2125 y Ff(U)1614 2132 y Fe(1)1634 2125 y Fm(,)12 b(for)h(one)f(pro)q(cess.)-57 2192 y(The)18 b(n)o(um)o(b)q(er)e(of)i(ro)o(w)g (and)g(column)e(blo)q(c)o(ks)i(p)q(er)f(pro)q(cess)i(is)e(giv)o(en)g(b)o(y)g Ff(b)1321 2199 y Fe(max)1407 2192 y Fm(and)h Ff(d)1528 2199 y Fe(max)1596 2192 y Fm(,)g(resp)q(ectiv)o(ely;)d Ff(r)k Fm(is)-57 2258 y(the)c(blo)q(c)o(k)g(size.)20 b(Blo)q(c)o(ks)15 b(are)h(indexed)e(b)o (y)h(\()p Ff(b;)8 b(d)p Fm(\),)15 b(and)h(elemen)o(ts)d(within)i(a)h(blo)q(c) o(k)f(b)o(y)g(\()p Ff(i;)8 b(j)s Fm(\).)21 b(In)15 b(v)o(ersion)g(\(a\))-57 2324 y(the)k Ff(r)14 b Fl(\002)f Ff(r)20 b Fm(blo)q(c)o(ks)f(are)g(m)o (ultiplie)o(d)d(one)k(at)f(a)h(time,)d(giving)h(an)i(inner)e(lo)q(op)i(of)g (length)e Ff(r)q Fm(.)30 b(\(b\))19 b(sho)o(ws)h(the)-57 2390 y(lo)q(ops)15 b(rearranged)g(b)q(efore)f(merging)f(the)g Ff(i)h Fm(and)h Ff(d)f Fm(lo)q(ops,)h(and)g(the)f Ff(j)j Fm(and)d Ff(b)g Fm(lo)q(ops.)22 b(This)14 b(leads)g(to)g(the)g(outer)-57 2456 y(pro)q(duct)j(form)e(of)i(the)f(algorithm)f(sho)o(wn)i(in)f(\(c\))g(in) f(whic)o(h)h(the)g(inner)g(lo)q(op)h(is)f(no)o(w)g(of)h(length)f Ff(r)q(d)1799 2463 y Fe(max)1867 2456 y Fm(.)939 2825 y(44)p eop %%Page: 45 47 46 bop -57 1319 a @beginspecial @setspecial %%BeginDocument: delta_256.ps /dwdict 100 
dict def dwdict begin /PlotAxes % ptsize.....point size of numbers on axes { % ticksize...length of tick marks /ytick exch def % xlen.......length of x axis /xtick exch def % ylen.......length of y axis /ystart exch def % xinc.......increment between ticks on x axis /xstart exch def % yinc.......increment between ticks on y axis /yinc exch def % xstart.....starting value on x axis /xinc exch def % ystart.....starting value on y axis /ylen exch def % xtick......number of ticks on x axis /xlen exch def % ytick......number of ticks on y axis /ticksize exch def /ptsize exch def /yflag exch def % yflag......if yflag=1 truncate numbers to integers /xflag exch def % xflag......if xflag=1 truncate numbers to integers /BoxFlag exch def % BoxFlag....if 1 then draw 4 axes, <0 ticks inside. % +/-2 then don't label x ticks, % +/-3 then don't label y ticks. newpath xlen 0 moveto 0 0 lineto 0 ylen lineto stroke BoxFlag 0 ne {newpath xlen 0 moveto xlen ylen lineto 0 ylen lineto stroke} if /Times-Roman findfont ptsize scalefont setfont /xscale xlen xinc xtick 1 sub mul div def /yscale ylen yinc ytick 1 sub mul div def /str 10 string def /xpos 0 def /inc xlen xtick 1 sub div def 1 1 xtick { 1 sub xinc mul xstart add /val exch def newpath xpos 0 moveto 0 ticksize neg BoxFlag 0 lt {neg} if rlineto stroke BoxFlag 0 ne {newpath xpos ylen moveto 0 ticksize BoxFlag 0 lt {neg} if rlineto stroke} if BoxFlag 2 ne BoxFlag -2 ne and{ xflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop 2 div xpos exch sub ptsize ticksize add BoxFlag 0 lt {ticksize sub} if neg moveto show} if /xpos xpos inc add def } for /ypos 0 def /inc ylen ytick 1 sub div def 1 1 ytick { 1 sub yinc mul ystart add /val exch def newpath 0 ypos moveto ticksize neg BoxFlag 0 lt {neg} if 0 rlineto stroke BoxFlag 0 ne {newpath xlen ypos moveto ticksize BoxFlag 0 lt {neg} if 0 rlineto stroke} if BoxFlag 3 ne BoxFlag -3 ne and{ yflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop ticksize add 
BoxFlag 0 lt {ticksize sub} if ptsize 3 div add neg ypos ypos 0 eq {ptsize 4 div} {ptsize 3 div} ifelse sub moveto show} if /ypos ypos inc add def } for } def /PlotPoints % stack: PlotArray isymbol LogFlag LineFlag=> nothing { /LineFlag exch def % LineFlag<0 no line, 0 solid, >0 dashed lines /LogFlag exch def % LogFlag = 1 for log10x, 2 for log10y, 3 both, <0 ln /isymbol exch def % isymbol = 1.....open circle /PlotArray exch def % = 2.....plus (+) % = 3.....cross (x) % = 4.....open triangle % = 5.....filled circle % = 6.....filled triangle % = 7.....asterisk (*) % = 8.....square =9...filled square % PlotArray = [ [x0 y0] [x1 y1] [x2 y2]...[xn yn] ] LineFlag 1 eq {[1 1] 0 setdash} if LineFlag 2 eq {[2 2] 0 setdash} if LineFlag 3 eq {[3 3] 0 setdash} if LineFlag 4 eq {[4 4] 0 setdash} if LineFlag 5 eq {[5 5] 0 setdash} if LineFlag 6 eq {[6 6] 0 setdash} if LineFlag 7 eq {[7 7] 0 setdash} if LineFlag 8 eq {[8 8] 0 setdash} if LineFlag 9 eq {[9 9] 0 setdash} if /icount 1 def PlotArray { /xypoint exch def xypoint aload pop /y exch def /x exch def LogFlag 1 eq LogFlag 3 eq or { /x x log def} if LogFlag 2 eq LogFlag 3 eq or { /y y log def} if LogFlag -1 eq LogFlag -3 eq or { /x x ln def} if LogFlag -2 eq LogFlag -3 eq or { /y y ln def} if /xv {x xstart sub xscale mul} def /yv {y ystart sub yscale mul} def LineFlag 0 ge icount 1 gt and { newpath xl yl moveto xv yv lineto stroke} if /icount icount 1 add def /xl xv def /yl yv def } forall [] 0 setdash PlotArray { /xypoint exch def xypoint aload pop /y exch def /x exch def LogFlag 1 eq LogFlag 3 eq or { /x x log def} if LogFlag 2 eq LogFlag 3 eq or { /y y log def} if LogFlag -1 eq LogFlag -3 eq or { /x x ln def} if LogFlag -2 eq LogFlag -3 eq or { /y y ln def} if /xv {x xstart sub xscale mul} def /yv {y ystart sub yscale mul} def xv yv newpath isymbol 1 eq { ticksize 2 div Circle gsave 1.0 setgray fill grestore stroke } if isymbol 2 eq { Plus } if isymbol 3 eq { Cross } if isymbol 4 eq { Triangle gsave 1.0 setgray fill 
grestore stroke } if isymbol 5 eq { ticksize 2 div Circle fill } if isymbol 6 eq { Triangle fill } if isymbol 7 eq { 2 copy Plus Cross } if isymbol 8 eq { Square gsave 1.0 setgray fill grestore stroke } if isymbol 9 eq { Square fill } if } forall } def /Plus % stack: xcen ycen => ??? Draws + centered on (xcen,ycen) { 2 copy newpath moveto ticksize 2 div 0 rmoveto ticksize neg 0 rlineto stroke newpath moveto 0 ticksize 2 div rmoveto 0 ticksize neg rlineto stroke } def /Square % stack: xcen ycen => ??? Draws square centered on 9xcen,ycen) { moveto ticksize 2 div dup rmoveto ticksize neg 0 rlineto 0 ticksize neg rlineto ticksize 0 rlineto closepath } def /Cross % stack: xcen ycen => ??? Draws x centered on (xcen,ycen) { /tinc ticksize 2 sqrt div 2 div def 2 copy newpath moveto tinc tinc rmoveto tinc 2 mul neg dup rlineto stroke newpath moveto tinc neg tinc rmoveto tinc 2 mul dup neg rlineto stroke } def /Triangle % stack: xcen ycen => ??? Draws triangle centered on (xcen ycen) { /tinc ticksize 2 div def /cdis 30 cos tinc mul def /sdis 30 sin tinc mul def moveto cdis neg sdis neg rmoveto cdis 2 mul 0 rlineto cdis neg sdis tinc add rlineto closepath } def /Circle % stack: xcen ycen radius => ??? 
Draws circle centered on (xcen ycen) { 0 360 arc } def 5.7 72 mul 300 sub 2 div 25 translate .5 setlinewidth 1 1 1 10.0 5.0 300.0 225.0 3000 1 0 0 7 7 PlotAxes [ % 4x64 [ 1000 0.3912] [ 2000 1.0378] [ 3000 1.6841] [ 4000 2.2291] [ 5000 2.7003] [ 6000 3.1397] [ 7000 3.5025] [ 8000 3.7999] [ 9000 4.0397] [10000 4.3308] [11000 4.5189] [12000 4.7168] [13000 4.8764] [14000 5.0468] [15000 5.1878] [16000 5.3428] [17000 5.4457] [18000 5.5534] ] 0 0 3 PlotPoints [ % 8x32 [ 1000 0.4295] [ 2000 1.2543] [ 3000 2.0505] [ 4000 2.6849] [ 5000 3.2223] [ 6000 3.6362] [ 7000 4.0059] [ 8000 4.3213] [ 9000 4.5717] [10000 4.8044] [11000 5.0045] [12000 5.1797] [13000 5.3354] [14000 5.4930] [15000 5.6369] [16000 5.7594] [17000 5.8572] [18000 5.9600] ] 0 0 0 PlotPoints [ % 12x21 [ 1000 0.3002] [ 2000 0.9200] [ 3000 1.6020] [ 4000 2.2255] [ 5000 2.7595] [ 6000 3.2035] [ 7000 3.5842] [ 8000 3.8479] [ 9000 4.1574] [10000 4.4058] [11000 4.5876] [12000 4.7954] [13000 4.9699] [14000 5.1213] [15000 5.2718] [16000 5.3662] [17000 5.4893] [18000 5.6113] ] 0 0 1 PlotPoints [ % 16x16 [ 1000 0.3152] [ 2000 0.9387] [ 3000 1.5780] [ 4000 2.1262] [ 5000 2.5682] [ 6000 2.8791] [ 7000 3.2004] [ 8000 3.4355] [ 9000 3.7168] [10000 3.9302] [11000 4.1393] [12000 4.3118] [13000 4.4804] [14000 4.6445] [15000 4.7893] [16000 4.9204] [17000 5.0382] [18000 5.1529] ] 0 0 4 PlotPoints /Times-Roman findfont 12 scalefont setfont (Matrix Size, M) dup stringwidth pop 300.0 exch sub 2 div -30 moveto show 90 rotate (Gflop/s) dup stringwidth pop 225.0 exch sub 2 div 20 moveto show -90 rotate /Symbol findfont 12 scalefont setfont 10 225 14 sub moveto [3 3] 0 setdash 30 225 14 sub lineto stroke (4) dup stringwidth pop 35 exch sub 7 add 225 18 sub moveto show 38 7 add 225 18 sub moveto (\264 64) show 10 225 26 sub moveto [] 0 setdash 30 225 26 sub lineto stroke (8) dup stringwidth pop 35 exch sub 7 add 225 30 sub moveto show 38 7 add 225 30 sub moveto (\264 32) show 10 225 38 sub moveto [1 1] 0 setdash 30 225 38 sub lineto 
stroke (12) dup stringwidth pop 35 exch sub 7 add 225 42 sub moveto show 38 7 add 225 42 sub moveto (\264 21) show 10 225 50 sub moveto [4 4] 0 setdash 30 225 50 sub lineto stroke (16) dup stringwidth pop 35 exch sub 7 add 225 54 sub moveto show 38 7 add 225 54 sub moveto (\264 16) show end %%EndDocument @endspecial 107 x Fm(Figure)14 b(19:)20 b(P)o(erformance)13 b(of)h(LU)g(factorization)g(on)g(the)g(In)o(tel)f(Delta)g(as)i(a)f(function)g (of)g(square)h(matrix)d(size)-57 1492 y(for)h(di\013eren)o(t)g(pro)q(cessor)h (templates)d(con)o(taining)i(appro)o(ximately)e(256)j(pro)q(cessors.)22 b(The)13 b(b)q(est)g(p)q(erformance)-57 1559 y(is)j(for)h(an)f(asp)q(ect)h (ratio)f(of)h(1/4,)g(though)g(the)f(dep)q(endence)g(on)h(asp)q(ect)f(ratio)h (is)f(rather)g(w)o(eak.)p 228 1701 1470 2 v 228 2339 2 638 v 250 1768 a(if)g(\()p Ff(q)f Fm(=p)q(col\))i(then)299 1835 y(do)g(i=)c(0)p Ff(;)8 b(r)13 b Fl(\000)e Fm(1)348 1901 y(\014nd)17 b(piv)o(ot)e(v)m(alue)h(and)h(lo)q(cation)348 1967 y(exc)o(hange)f(piv)o(ot)f (ro)o(ws)i(lying)f(within)f(panel)348 2033 y(divide)g(column)g(r)h(b)q(elo)o (w)g(diagonal)h(b)o(y)f(piv)o(ot)299 2100 y(end)g(do)250 2166 y(end)h(if)250 2232 y(broadcast)h(piv)o(ot)e(information)f(for)h Ff(r)i Fm(piv)o(ots)e(along)h(template)d(ro)o(ws)250 2298 y(exc)o(hange)i (piv)o(ot)g(ro)o(ws)h(lying)e(outside)h(the)g(panel)g(for)h(eac)o(h)f(of)g Ff(r)i Fm(piv)o(ots)p 1696 2339 V 228 2341 1470 2 v -57 2448 a(Figure)i(20:)30 b(Pseudo)q(co)q(de)22 b(fragmen)o(t)d(for)i(partial)f(piv)o (oting)g(o)o(v)o(er)g(ro)o(ws.)34 b(This)21 b(ma)o(y)e(b)q(e)h(regarded)h(as) g(re-)-57 2514 y(placing)16 b(the)h(\014rst)g(b)q(o)o(x)g(inside)f(the)g Ff(k)j Fm(lo)q(op)e(in)g(Figure)f(15.)23 b(In)17 b(the)f(ab)q(o)o(v)o(e)h(co) q(de)g(piv)o(ot)f(information)f(is)i(\014rst)-57 2581 y(disseminated)11 b(within)h(the)g(template)e(column)h(doing)i(the)f(panel)g(factorization.)20 b(The)13 b(piv)o(oting)e(of)i(the)f(parts)-57 2647 y(of)17 
b(the)f(ro)o(ws)g(lying)g(outside)g(the)g(panel)g(is)g(deferred)g(un)o(til)f (the)h(panel)g(factorization)g(has)h(b)q(een)f(completed.)939 2825 y(45)p eop %%Page: 46 48 47 bop -57 1059 a @beginspecial @setspecial %%BeginDocument: delta_results.ps /dwdict 100 dict def dwdict begin /PlotAxes % ptsize.....point size of numbers on axes { % ticksize...length of tick marks /ytick exch def % xlen.......length of x axis /xtick exch def % ylen.......length of y axis /ystart exch def % xinc.......increment between ticks on x axis /xstart exch def % yinc.......increment between ticks on y axis /yinc exch def % xstart.....starting value on x axis /xinc exch def % ystart.....starting value on y axis /ylen exch def % xtick......number of ticks on x axis /xlen exch def % ytick......number of ticks on y axis /ticksize exch def /ptsize exch def /yflag exch def % yflag......if yflag=1 truncate numbers to integers /xflag exch def % xflag......if xflag=1 truncate numbers to integers /BoxFlag exch def % BoxFlag....if 1 then draw 4 axes, <0 ticks inside. % +/-2 then don't label x ticks, % +/-3 then don't label y ticks. 
newpath xlen 0 moveto 0 0 lineto 0 ylen lineto stroke BoxFlag 0 ne {newpath xlen 0 moveto xlen ylen lineto 0 ylen lineto stroke} if /Times-Roman findfont ptsize scalefont setfont /xscale xlen xinc xtick 1 sub mul div def /yscale ylen yinc ytick 1 sub mul div def /str 10 string def /xpos 0 def /inc xlen xtick 1 sub div def 1 1 xtick { 1 sub xinc mul xstart add /val exch def newpath xpos 0 moveto 0 ticksize neg BoxFlag 0 lt {neg} if rlineto stroke BoxFlag 0 ne {newpath xpos ylen moveto 0 ticksize BoxFlag 0 lt {neg} if rlineto stroke} if BoxFlag 2 ne BoxFlag -2 ne and{ xflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop 2 div xpos exch sub ptsize ticksize add BoxFlag 0 lt {ticksize sub} if neg moveto show} if /xpos xpos inc add def } for /ypos 0 def /inc ylen ytick 1 sub div def 1 1 ytick { 1 sub yinc mul ystart add /val exch def newpath 0 ypos moveto ticksize neg BoxFlag 0 lt {neg} if 0 rlineto stroke BoxFlag 0 ne {newpath xlen ypos moveto ticksize BoxFlag 0 lt {neg} if 0 rlineto stroke} if BoxFlag 3 ne BoxFlag -3 ne and{ yflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop ticksize add BoxFlag 0 lt {ticksize sub} if ptsize 3 div add neg ypos ypos 0 eq {ptsize 4 div} {ptsize 3 div} ifelse sub moveto show} if /ypos ypos inc add def } for } def /PlotPoints % stack: PlotArray isymbol LogFlag LineFlag=> nothing { /LineFlag exch def % LineFlag<0 no line, 0 solid, >0 dashed lines /LogFlag exch def % LogFlag = 1 for log10x, 2 for log10y, 3 both, <0 ln /isymbol exch def % isymbol = 1.....open circle /PlotArray exch def % = 2.....plus (+) % = 3.....cross (x) % = 4.....open triangle % = 5.....filled circle % = 6.....filled triangle % = 7.....asterisk (*) % = 8.....square =9...filled square % PlotArray = [ [x0 y0] [x1 y1] [x2 y2]...[xn yn] ] LineFlag 1 eq {[1 1] 0 setdash} if LineFlag 2 eq {[2 2] 0 setdash} if LineFlag 3 eq {[3 3] 0 setdash} if LineFlag 4 eq {[4 4] 0 setdash} if LineFlag 5 eq {[5 5] 0 setdash} if LineFlag 6 eq {[6 6] 0 
setdash} if LineFlag 7 eq {[7 7] 0 setdash} if LineFlag 8 eq {[8 8] 0 setdash} if LineFlag 9 eq {[9 9] 0 setdash} if /icount 1 def PlotArray { /xypoint exch def xypoint aload pop /y exch def /x exch def LogFlag 1 eq LogFlag 3 eq or { /x x log def} if LogFlag 2 eq LogFlag 3 eq or { /y y log def} if LogFlag -1 eq LogFlag -3 eq or { /x x ln def} if LogFlag -2 eq LogFlag -3 eq or { /y y ln def} if /xv {x xstart sub xscale mul} def /yv {y ystart sub yscale mul} def LineFlag 0 ge icount 1 gt and { newpath xl yl moveto xv yv lineto stroke} if /icount icount 1 add def /xl xv def /yl yv def } forall [] 0 setdash PlotArray { /xypoint exch def xypoint aload pop /y exch def /x exch def LogFlag 1 eq LogFlag 3 eq or { /x x log def} if LogFlag 2 eq LogFlag 3 eq or { /y y log def} if LogFlag -1 eq LogFlag -3 eq or { /x x ln def} if LogFlag -2 eq LogFlag -3 eq or { /y y ln def} if /xv {x xstart sub xscale mul} def /yv {y ystart sub yscale mul} def xv yv newpath isymbol 1 eq { ticksize 2 div Circle gsave 1.0 setgray fill grestore stroke } if isymbol 2 eq { Plus } if isymbol 3 eq { Cross } if isymbol 4 eq { Triangle gsave 1.0 setgray fill grestore stroke } if isymbol 5 eq { ticksize 2 div Circle fill } if isymbol 6 eq { Triangle fill } if isymbol 7 eq { 2 copy Plus Cross } if isymbol 8 eq { Square gsave 1.0 setgray fill grestore stroke } if isymbol 9 eq { Square fill } if } forall } def /Plus % stack: xcen ycen => ??? Draws + centered on (xcen,ycen) { 2 copy newpath moveto ticksize 2 div 0 rmoveto ticksize neg 0 rlineto stroke newpath moveto 0 ticksize 2 div rmoveto 0 ticksize neg rlineto stroke } def /Square % stack: xcen ycen => ??? Draws square centered on 9xcen,ycen) { moveto ticksize 2 div dup rmoveto ticksize neg 0 rlineto 0 ticksize neg rlineto ticksize 0 rlineto closepath } def /Cross % stack: xcen ycen => ??? 
Draws x centered on (xcen,ycen) { /tinc ticksize 2 sqrt div 2 div def 2 copy newpath moveto tinc tinc rmoveto tinc 2 mul neg dup rlineto stroke newpath moveto tinc neg tinc rmoveto tinc 2 mul dup neg rlineto stroke } def /Triangle % stack: xcen ycen => ??? Draws triangle centered on (xcen ycen) { /tinc ticksize 2 div def /cdis 30 cos tinc mul def /sdis 30 sin tinc mul def moveto cdis neg sdis neg rmoveto cdis 2 mul 0 rlineto cdis neg sdis tinc add rlineto closepath } def /Circle % stack: xcen ycen radius => ??? Draws circle centered on (xcen ycen) { 0 360 arc } def 5.7 72 mul 240 sub 2 div 25 translate 0.5 setlinewidth 0.8 0.8 scale 1 1 1 12.0 5.0 350.0 260.0 4000 2 0 0 8 7 PlotAxes [ [ 1000 0.2706] [ 2000 0.4687] [ 3000 0.5843] [ 4000 0.6635] [ 5000 0.7164] [ 6000 0.7565] ] 0 0 0 PlotPoints /Symbol findfont 12 scalefont setfont 6300 xscale mul 0.6 yscale mul moveto (2 \264 16) show [ [ 1000 0.3706] [ 2000 0.7361] [ 3000 0.9459] [ 4000 1.1395] [ 5000 1.2602] [ 6000 1.3514] [ 7000 1.4264] [ 8000 1.4870] [ 9000 1.5310] ] 0 0 0 PlotPoints /Symbol findfont 12 scalefont setfont 9300 xscale mul 1.4 yscale mul moveto (4 \264 16) show [ [ 1000 0.3812] [ 2000 0.9313] [ 3000 1.3684] [ 4000 1.6985] [ 5000 1.9744] [ 6000 2.1861] [ 7000 2.3604] [ 8000 2.5144] [ 9000 2.6366] [10000 2.7939] [11000 2.8301] [12000 2.9131] [13000 2.9363] ] 0 0 0 PlotPoints /Symbol findfont 12 scalefont setfont 13300 xscale mul 2.8 yscale mul moveto (4 \264 32) show [ [ 1000 0.4295] [ 2000 1.2543] [ 3000 2.0505] [ 4000 2.6849] [ 5000 3.2223] [ 6000 3.6362] [ 7000 4.0059] [ 8000 4.3213] [ 9000 4.5717] [10000 4.8044] [11000 5.0045] [12000 5.1797] [13000 5.3354] [14000 5.4930] [15000 5.6369] [16000 5.7594] [17000 5.8572] [18000 5.9600] ] 0 0 0 PlotPoints /Symbol findfont 12 scalefont setfont 18300 xscale mul 5.8 yscale mul moveto (8 \264 32) show [ [ 1000 0.4582] [ 4000 3.5106] [ 7000 6.0104] [10000 7.6360] [13000 8.9478] [16000 9.9107] [19000 10.6223] [22000 11.1653] [25000 11.6282] [26000 11.7564] ] 0 
0 0 PlotPoints /Symbol findfont 12 scalefont setfont 24000 xscale mul 10.8 yscale mul moveto (8 \264 64) show /Times-Roman findfont 14 scalefont setfont (Matrix Size, M) dup stringwidth pop 300.0 exch sub 2 div -30 moveto show 90 rotate (Gflops) dup stringwidth pop 225.0 exch sub 2 div 20 moveto show end %%EndDocument @endspecial 108 x Fm(Figure)14 b(21:)20 b(P)o(erformance)13 b(of)h(LU)g(factorization)g(on)g(the)g(In)o(tel)f(Delta)g(as)i(a)f(function)g (of)g(square)h(matrix)d(size)-57 1233 y(for)18 b(di\013eren)o(t)f(n)o(um)o(b) q(ers)g(of)h(pro)q(cessors.)28 b(F)l(or)18 b(eac)o(h)g(curv)o(e,)f(results)g (are)i(sho)o(wn)f(for)h(the)e(pro)q(cess)i(template)-57 1300 y(con\014guration)e(that)g(ga)o(v)o(e)f(the)g(b)q(est)g(p)q(erformance)f(for) i(that)g(n)o(um)o(b)q(er)d(of)j(pro)q(cessors.)-57 2360 y @beginspecial @setspecial %%BeginDocument: iso.ps /dwdict 100 dict def dwdict begin /PlotAxes % ptsize.....point size of numbers on axes { % ticksize...length of tick marks /ytick exch def % xlen.......length of x axis /xtick exch def % ylen.......length of y axis /ystart exch def % xinc.......increment between ticks on x axis /xstart exch def % yinc.......increment between ticks on y axis /yinc exch def % xstart.....starting value on x axis /xinc exch def % ystart.....starting value on y axis /ylen exch def % xtick......number of ticks on x axis /xlen exch def % ytick......number of ticks on y axis /ticksize exch def /ptsize exch def /yflag exch def % yflag......if yflag=1 truncate numbers to integers /xflag exch def % xflag......if xflag=1 truncate numbers to integers /BoxFlag exch def % BoxFlag....if 1 then draw 4 axes, <0 ticks inside. % +/-2 then don't label x ticks, % +/-3 then don't label y ticks. 
% NOTE(review): the next line continues a procedure body that opens above this
% span; it is carried over token-for-token and closes with "} for } def".
newpath xlen 0 moveto 0 0 lineto 0 ylen lineto stroke BoxFlag 0 ne {newpath xlen 0 moveto xlen ylen lineto 0 ylen lineto stroke} if /Times-Roman findfont ptsize scalefont setfont /xscale xlen xinc xtick 1 sub mul div def /yscale ylen yinc ytick 1 sub mul div def /str 10 string def /xpos 0 def /inc xlen xtick 1 sub div def 1 1 xtick { 1 sub xinc mul xstart add /val exch def newpath xpos 0 moveto 0 ticksize neg BoxFlag 0 lt {neg} if rlineto stroke BoxFlag 0 ne {newpath xpos ylen moveto 0 ticksize BoxFlag 0 lt {neg} if rlineto stroke} if BoxFlag 2 ne BoxFlag -2 ne and{ xflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop 2 div xpos exch sub ptsize ticksize add BoxFlag 0 lt {ticksize sub} if neg moveto show} if /xpos xpos inc add def } for /ypos 0 def /inc ylen ytick 1 sub div def 1 1 ytick { 1 sub yinc mul ystart add /val exch def newpath 0 ypos moveto ticksize neg BoxFlag 0 lt {neg} if 0 rlineto stroke BoxFlag 0 ne {newpath xlen ypos moveto ticksize BoxFlag 0 lt {neg} if 0 rlineto stroke} if BoxFlag 3 ne BoxFlag -3 ne and{ yflag 1 eq { val cvi str cvs} { val str cvs} ifelse dup stringwidth pop ticksize add BoxFlag 0 lt {ticksize sub} if ptsize 3 div add neg ypos ypos 0 eq {ptsize 4 div} {ptsize 3 div} ifelse sub moveto show} if /ypos ypos inc add def } for } def

% ---------------------------------------------------------------------------
% PlotPoints
%   stack: PlotArray isymbol LogFlag LineFlag => -
%
%   PlotArray = [ [x0 y0] [x1 y1] [x2 y2]...[xn yn] ]
%   isymbol   = 1.....open circle      2.....plus (+)
%             = 3.....cross (x)        4.....open triangle
%             = 5.....filled circle    6.....filled triangle
%             = 7.....asterisk (*)     8.....square
%             = 9.....filled square    anything else: no marker is drawn
%   LogFlag   = 1 for log10x, 2 for log10y, 3 both; -1/-2/-3 same with ln
%   LineFlag  < 0 no line; 0 line in the current dash pattern;
%               1..9 dashed line with pattern [n n]
%
%   Reads xstart, ystart, xscale, yscale and ticksize from the dictionary
%   stack (xscale/yscale are set by the axis code above).  Makes two passes
%   over PlotArray: first the connecting line segments, then the markers, so
%   that markers are painted over the line.  The data-to-plot transformation
%   is repeated in both passes rather than factored into a helper so that no
%   additional key is added to the enclosing dictionary.
% ---------------------------------------------------------------------------
/PlotPoints
{
  /LineFlag  exch def
  /LogFlag   exch def
  /isymbol   exch def
  /PlotArray exch def

  % Dash pattern [n n] when LineFlag equals n for n = 1..9; any other value
  % leaves the current dash pattern untouched.
  1 1 9 { dup LineFlag eq { [ exch dup ] 0 setdash } { pop } ifelse } for

  % ---- pass 1: connecting line, one stroked segment per pair of points ----
  /icount 1 def
  PlotArray
  {
    /xypoint exch def
    xypoint aload pop /y exch def /x exch def
    LogFlag  1 eq LogFlag  3 eq or { /x x log def } if
    LogFlag  2 eq LogFlag  3 eq or { /y y log def } if
    LogFlag -1 eq LogFlag -3 eq or { /x x ln  def } if
    LogFlag -2 eq LogFlag -3 eq or { /y y ln  def } if
    /xv {x xstart sub xscale mul} def
    /yv {y ystart sub yscale mul} def
    % xl/yl hold the previous point; they are only valid from the 2nd point on
    LineFlag 0 ge icount 1 gt and
      { newpath xl yl moveto xv yv lineto stroke } if
    /icount icount 1 add def
    /xl xv def
    /yl yv def
  } forall

  % markers are always drawn with solid outlines
  [] 0 setdash

  % ---- pass 2: markers ----
  PlotArray
  {
    /xypoint exch def
    xypoint aload pop /y exch def /x exch def
    LogFlag  1 eq LogFlag  3 eq or { /x x log def } if
    LogFlag  2 eq LogFlag  3 eq or { /y y log def } if
    LogFlag -1 eq LogFlag -3 eq or { /x x ln  def } if
    LogFlag -2 eq LogFlag -3 eq or { /y y ln  def } if
    /xv {x xstart sub xscale mul} def
    /yv {y ystart sub yscale mul} def

    % The marker procedures consume the centre coordinates.  The mark lets us
    % discard them when isymbol selects no marker, so nothing is left behind
    % on the operand stack in that case.
    mark xv yv
    newpath
    isymbol 1 eq { ticksize 2 div Circle gsave 1.0 setgray fill grestore stroke } if
    isymbol 2 eq { Plus } if
    isymbol 3 eq { Cross } if
    isymbol 4 eq { Triangle gsave 1.0 setgray fill grestore stroke } if
    isymbol 5 eq { ticksize 2 div Circle fill } if
    isymbol 6 eq { Triangle fill } if
    isymbol 7 eq { 2 copy Plus Cross } if
    isymbol 8 eq { Square gsave 1.0 setgray fill grestore stroke } if
    isymbol 9 eq { Square fill } if
    cleartomark
  } forall
} def

% Plus
%   stack: xcen ycen => -
%   Strokes a + of width and height ticksize centered on (xcen,ycen).
/Plus
{
  2 copy
  newpath moveto ticksize 2 div 0 rmoveto ticksize neg 0 rlineto stroke
  newpath moveto 0 ticksize 2 div rmoveto 0 ticksize neg rlineto stroke
} def

% Square
%   stack: xcen ycen => -
%   Appends a closed square of side ticksize centered on (xcen,ycen) to the
%   current path; the caller fills and/or strokes it.
/Square
{
  moveto
  ticksize 2 div dup rmoveto
  ticksize neg 0 rlineto
  0 ticksize neg rlineto
  ticksize 0 rlineto
  closepath
} def

% Cross
%   stack: xcen ycen => -
%   Strokes an x centered on (xcen,ycen); each arm has length ticksize.
%   Side effect: defines tinc in the current dictionary.
/Cross
{
  /tinc ticksize 2 sqrt div 2 div def
  2 copy
  newpath moveto tinc tinc rmoveto tinc 2 mul neg dup rlineto stroke
  newpath moveto tinc neg tinc rmoveto tinc 2 mul dup neg rlineto stroke
} def

% Triangle
%   stack: xcen ycen => -
%   Appends a closed triangle centered on (xcen,ycen) to the current path,
%   apex up, with vertices at distance ticksize/2 from the centre; the caller
%   fills and/or strokes it.
%   Side effect: defines tinc, cdis and sdis in the current dictionary.
/Triangle
{
  /tinc ticksize 2 div def
  /cdis 30 cos tinc mul def
  /sdis 30 sin tinc mul def
  moveto
  cdis neg sdis neg rmoveto
  cdis 2 mul 0 rlineto
  cdis neg sdis tinc add rlineto
  closepath
} def

% Circle
%   stack: xcen ycen radius => -
%   Appends a full circle centered on (xcen,ycen) to the current path; the
%   caller fills and/or strokes it.
/Circle
{
  0 360 arc
} def

% ---------------------------------------------------------------------------
% Figure body: four isogranularity curves.  Each curve is drawn with cross
% markers joined by a line, then labelled just past its last data point with
% the granularity value computed from the formula in the comment (in units of
% 10^6).
% ---------------------------------------------------------------------------
0.5 setlinewidth
5.7 72 mul 240 sub 2 div 25 translate
0.8 0.8 scale
1 1 1 12.0 5.0 350.0 260.0 100 2 0 0 7 7 PlotAxes

[ % granularity = 25000**2/512
[ 32 0.76] [ 64 1.52] [ 128 2.92] [ 256 5.92] [ 512 11.63]
] 3 0 0 PlotPoints
520 xscale mul 11.63 yscale mul 4 sub moveto (1.221) show

[ % granularity = 10000**2/512
[ 32 0.52] [ 64 1.05] [ 128 1.974] [ 256 4.0] [ 512 7.636]
] 3 0 0 PlotPoints
520 xscale mul 7.636 yscale mul 4 sub moveto (\0400.195) show

[ % granularity = 7000**2/512
[ 32 0.32] [ 64 0.85] [ 128 1.520] [ 256 3.222] [ 512 4.991]
] 3 0 0 PlotPoints
520 xscale mul 4.991 yscale mul 4 sub moveto (\0400.096) show

[ % granularity = 16000**2/512
[ 32 0.66] [ 64 1.33] [ 128 2.51] [ 256 5.06] [ 512 9.91]
] 3 0 0 PlotPoints
520 xscale mul 9.91 yscale mul 4 sub moveto (\0400.500) show

% axis titles
/Times-Roman findfont 14 scalefont setfont
(Number of Processors) dup stringwidth pop 300.0 exch sub 2 div -30 moveto show
90 rotate
(Gflops) dup stringwidth pop 250.0 exch sub 2 div 20 moveto show
end
%%EndDocument
@endspecial 107 x(Figure)e(22:)21 b(Isogran)o(ularit)o(y)15 b(curv)o(es)f(in)g(the)h(\()p
Ff(N)834 2474 y Fd(p)854 2467 y Ff(;)8 b(G)p Fm(\))15 b(plane)g(for)h(the)e(LU)h(factorization)g(of)h
(square)f(matrices)-57 2533 y(on)22 b(the)g(In)o(tel)e(Delta)i(system.)36
b(The)22 b(curv)o(es)f(are)h(lab)q(eled)f(b)o(y)g(the)h(gran)o(ularit)o(y)f
(in)h(units)f(of)h(10)1798 2515 y Fe(6)1841 2533 y Fm(matrix)-57 2600
y(elemen)o(ts)c(p)q(er)i(pro)q(cessor.)35 b(The)20 b(linearit)o(y)f(of)i
(the)f(plots)g(for)h(gran)o(ularities)f(exceeding)f(ab)q(out)j(0)p
Ff(:)p Fm(2)14 b Fl(\002)g Fm(10)1963 2582 y Fe(6)-57 2666 y Fm(indicates)h(that)i(the)f(LU)g(factorization)h(algorithm)e(scales)h(w)o
(ell)f(on)i(the)f(Delta.)939 2825 y(46)p eop
%%Trailer
end
userdict /end-hook known{end-hook}if
%%EOF .