ParFriendsExt.h
/****************************************************************/
/* Parallel Combinatorial BLAS Library (for Graph Computations) */
/* version 1.6 -------------------------------------------------*/
/* date: 6/15/2017 ---------------------------------------------*/
/* authors: Ariful Azad, Aydin Buluc --------------------------*/
/****************************************************************/
/*
 Copyright (c) 2010-2017, The Regents of the University of California

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
 in the Software without restriction, including without limitation the rights
 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 copies of the Software, and to permit persons to whom the Software is
 furnished to do so, subject to the following conditions:

 The above copyright notice and this permission notice shall be included in
 all copies or substantial portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 */


#ifndef _PAR_FRIENDS_EXT_H_
#define _PAR_FRIENDS_EXT_H_

#include "mpi.h"
#include <iostream>
#include "SpParMat.h"
#include "SpParHelper.h"
#include "MPIType.h"
#include "Friends.h"

namespace combblas {

template <class IT, class NT, class DER>
class SpParMat;

/*************************************************************************************************/
/**************************** FRIEND FUNCTIONS FOR PARALLEL CLASSES ******************************/
/************************** EXTENDED SET (NOT COMMONLY USED FUNCTIONS) ***************************/
/*************************************************************************************************/

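/**
 * Parallel C = A * B routine over a 2D process grid, using one-sided MPI
 * communication with generalized active-target synchronization
 * (MPI_Win_post / MPI_Win_start / MPI_Win_complete / MPI_Win_wait).
 * SR is a CombBLAS-style semiring class; A and B must be distributed over
 * compatible process grids. NOTE: the original documentation block is not
 * part of this listing; this summary and the sketch below are reconstructed
 * from the code and are illustrative only.
 *
 *   typedef PlusTimesSRing<double, double> PTDD;
 *   typedef SpParMat< int64_t, double, SpDCCols<int64_t, double> > PSpMat;
 *   PSpMat C = Mult_AnXBn_ActiveTarget<PTDD>(A, B);   // A, B are filled PSpMat objects
 */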
template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_ActiveTarget
        (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B)
{
    typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
    typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;

    if(A.getncol() != B.getnrow())
    {
        std::cout << "Cannot multiply, dimensions do not match" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
    }
    int stages, Aoffset, Boffset;   // stages = inner dimension of matrix blocks
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);

    IU C_m = A.spSeq->getnrow();
    IU C_n = B.spSeq->getncol();

    UDERA A1seq, A2seq;
    (A.spSeq)->Split( A1seq, A2seq);

    // ABAB: It should be able to perform split/merge with the transpose option [single function call]
    const_cast< UDERB* >(B.spSeq)->Transpose();

    UDERB B1seq, B2seq;
    (B.spSeq)->Split( B1seq, B2seq);
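    // The two halves set up a software pipeline: while one half is being
    // multiplied, the fetch of the other half is already in flight, so
    // communication overlaps with computation (double buffering).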

    // Create row and column windows (collective operation, i.e. everybody exposes its window to others)
    std::vector<MPI_Win> rowwins1, rowwins2, colwins1, colwins2;
    SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A1seq, rowwins1);
    SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A2seq, rowwins2);
    SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B1seq, colwins1);
    SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B2seq, colwins2);

    // ABAB: We can optimize the call to create windows in the absence of passive synchronization
    // MPI_Info info;
    // MPI_Info_create ( &info );
    // MPI_Info_set( info, "no_locks", "true" );
    // MPI_Win_create( . . ., info, . . . );
    // MPI_Info_free( &info );

    IU ** ARecvSizes1 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** ARecvSizes2 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes1 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
    IU ** BRecvSizes2 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);

    SpParHelper::GetSetSizes( A1seq, ARecvSizes1, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes( A2seq, ARecvSizes2, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes( B1seq, BRecvSizes1, (B.commGrid)->GetColWorld());
    SpParHelper::GetSetSizes( B2seq, BRecvSizes2, (B.commGrid)->GetColWorld());

    // Remotely fetched matrices are stored as pointers
    UDERA * ARecv1, * ARecv2;
    UDERB * BRecv1, * BRecv2;
    std::vector< SpTuples<IU,N_promote> *> tomerge;

    MPI_Group row_group, col_group;
    MPI_Comm_group((A.commGrid)->GetRowWorld(), &row_group);
    MPI_Comm_group((B.commGrid)->GetColWorld(), &col_group);

    int Aself = (A.commGrid)->GetRankInProcRow();
    int Bself = (B.commGrid)->GetRankInProcCol();

#ifdef SPGEMMDEBUG
    MPI_Barrier(GridC->GetWorld());
    SpParHelper::Print("Writing to file\n");
    std::ofstream oput;
    GridC->OpenDebugFile("deb", oput);
    oput << "A1seq: " << A1seq.getnrow() << " " << A1seq.getncol() << " " << A1seq.getnnz() << std::endl;
    oput << "A2seq: " << A2seq.getnrow() << " " << A2seq.getncol() << " " << A2seq.getnnz() << std::endl;
    oput << "B1seq: " << B1seq.getnrow() << " " << B1seq.getncol() << " " << B1seq.getnnz() << std::endl;
    oput << "B2seq: " << B2seq.getnrow() << " " << B2seq.getncol() << " " << B2seq.getnnz() << std::endl;
    SpParHelper::Print("Wrote to file\n");
    MPI_Barrier(GridC->GetWorld());
#endif

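    // Generalized active-target (post-start-complete-wait) synchronization:
    // every process first posts an exposure epoch on its own windows to its
    // row/column group; matching access epochs are then opened per stage by
    // AccessNFetch, closed with MPI_Win_complete, and the exposure epochs
    // are closed at the very end by WaitNFree.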
    SpParHelper::PostExposureEpoch(Aself, rowwins1, row_group);
    SpParHelper::PostExposureEpoch(Aself, rowwins2, row_group);
    SpParHelper::PostExposureEpoch(Bself, colwins1, col_group);
    SpParHelper::PostExposureEpoch(Bself, colwins2, col_group);

    MPI_Barrier(GridC->GetWorld());
    SpParHelper::Print("Exposure epochs posted\n");
    MPI_Barrier(GridC->GetWorld());

    int Aowner = (0+Aoffset) % stages;
    int Bowner = (0+Boffset) % stages;
    SpParHelper::AccessNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
    SpParHelper::AccessNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);    // Start prefetching next half
    for(int j=0; j< rowwins1.size(); ++j)   // wait for the first half to complete
        MPI_Win_complete(rowwins1[j]);

    SpParHelper::AccessNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
    SpParHelper::AccessNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);    // Start prefetching next half

    for(int j=0; j< colwins1.size(); ++j)
        MPI_Win_complete(colwins1[j]);

    for(int i = 1; i < stages; ++i)
    {

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Stage starting\n");
#endif

        SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Multiplied\n");
#endif

        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Pushed back\n");
        MPI_Barrier(GridC->GetWorld());
#endif

        delete ARecv1;  // free the memory of the previous first half
        for(int j=0; j< rowwins2.size(); ++j)   // wait for the previous second half to complete
            MPI_Win_complete(rowwins2[j]);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Completed A\n");
#endif

        delete BRecv1;
        for(int j=0; j< colwins2.size(); ++j)   // wait for the previous second half to complete
            MPI_Win_complete(colwins2[j]);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Completed B\n");
        MPI_Barrier(GridC->GetWorld());
#endif

        Aowner = (i+Aoffset) % stages;
        Bowner = (i+Boffset) % stages;

        // start fetching the current first half
        SpParHelper::AccessNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
        SpParHelper::AccessNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Fetched next\n");
        MPI_Barrier(GridC->GetWorld());
#endif

        // while multiplying the already completed previous second halves
        C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Multiplied and pushed\n");
        MPI_Barrier(GridC->GetWorld());
#endif

        delete ARecv2;  // free memory of the previous second half
        delete BRecv2;

        for(int j=0; j< rowwins1.size(); ++j)   // wait for the current first half to complete
            MPI_Win_complete(rowwins1[j]);
        for(int j=0; j< colwins1.size(); ++j)
            MPI_Win_complete(colwins1[j]);

#ifdef SPGEMMDEBUG
        SpParHelper::Print("Completed next\n");
        MPI_Barrier(GridC->GetWorld());
#endif

        // start prefetching the current second half
        SpParHelper::AccessNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);
        SpParHelper::AccessNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);
    }
    SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
    if(!C_cont->isZero())
        tomerge.push_back(C_cont);

    delete ARecv1;  // free the memory of the last fetched first half
    for(int j=0; j< rowwins2.size(); ++j)   // wait for the last second half to complete
        MPI_Win_complete(rowwins2[j]);
    delete BRecv1;
    for(int j=0; j< colwins2.size(); ++j)   // wait for the last second half to complete
        MPI_Win_complete(colwins2[j]);

    C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
    if(!C_cont->isZero())
        tomerge.push_back(C_cont);

    delete ARecv2;
    delete BRecv2;

    SpHelper::deallocate2D(ARecvSizes1, UDERA::esscount);
    SpHelper::deallocate2D(ARecvSizes2, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes1, UDERB::esscount);
    SpHelper::deallocate2D(BRecvSizes2, UDERB::esscount);

    DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL);    // First get the result in SpTuples, then convert to UDER
    for(int i=0; i<tomerge.size(); ++i)
    {
        delete tomerge[i];
    }

    // MPI_Win_wait() works like a barrier as it waits for all origins to finish their remote memory operations on "this" window
    SpParHelper::WaitNFree(rowwins1);
    SpParHelper::WaitNFree(rowwins2);
    SpParHelper::WaitNFree(colwins1);
    SpParHelper::WaitNFree(colwins2);

    (A.spSeq)->Merge(A1seq, A2seq);
    (B.spSeq)->Merge(B1seq, B2seq);

    MPI_Group_free(&row_group);
    MPI_Group_free(&col_group);
    const_cast< UDERB* >(B.spSeq)->Transpose();     // transpose back to original
    return SpParMat<IU,N_promote,DER_promote> (C, GridC);  // return the result object
}

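/**
 * Parallel C = A * B routine, identical in structure to
 * Mult_AnXBn_ActiveTarget but synchronized with passive-target one-sided
 * MPI (lock/unlock epochs via LockNFetch / UnlockWindows). NOTE: the
 * original documentation block is not part of this listing; this summary
 * is reconstructed from the code below.
 */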
template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_PassiveTarget
        (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B)
{
    typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
    typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;

    if(A.getncol() != B.getnrow())
    {
        std::cout << "Cannot multiply, dimensions do not match" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
    }
    int stages, Aoffset, Boffset;   // stages = inner dimension of matrix blocks
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);

    IU C_m = A.spSeq->getnrow();
    IU C_n = B.spSeq->getncol();

    UDERA A1seq, A2seq;
    (A.spSeq)->Split( A1seq, A2seq);

    // ABAB: It should be able to perform split/merge with the transpose option [single function call]
    const_cast< UDERB* >(B.spSeq)->Transpose();

    UDERB B1seq, B2seq;
    (B.spSeq)->Split( B1seq, B2seq);

    // Create row and column windows (collective operation, i.e. everybody exposes its window to others)
    std::vector<MPI_Win> rowwins1, rowwins2, colwins1, colwins2;
    SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A1seq, rowwins1);
    SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), A2seq, rowwins2);
    SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B1seq, colwins1);
    SpParHelper::SetWindows((B.commGrid)->GetColWorld(), B2seq, colwins2);

    IU ** ARecvSizes1 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** ARecvSizes2 = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes1 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);
    IU ** BRecvSizes2 = SpHelper::allocate2D<IU>(UDERB::esscount, stages);

    SpParHelper::GetSetSizes( A1seq, ARecvSizes1, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes( A2seq, ARecvSizes2, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes( B1seq, BRecvSizes1, (B.commGrid)->GetColWorld());
    SpParHelper::GetSetSizes( B2seq, BRecvSizes2, (B.commGrid)->GetColWorld());

    // Remotely fetched matrices are stored as pointers
    UDERA * ARecv1, * ARecv2;
    UDERB * BRecv1, * BRecv2;
    std::vector< SpTuples<IU,N_promote> *> tomerge;     // sorted triples to be merged

    MPI_Group row_group, col_group;
    MPI_Comm_group((A.commGrid)->GetRowWorld(), &row_group);
    MPI_Comm_group((B.commGrid)->GetColWorld(), &col_group);

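    // Passive-target synchronization: as the helper names suggest,
    // LockNFetch opens a lock epoch on the owner's windows and starts the
    // Get, while UnlockWindows closes the epoch and thereby completes the
    // transfer. No matching call is needed on the target side, which is
    // what makes this variant "passive target".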
    int Aowner = (0+Aoffset) % stages;
    int Bowner = (0+Boffset) % stages;

    SpParHelper::LockNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
    SpParHelper::LockNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);  // Start prefetching next half
    SpParHelper::LockNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);
    SpParHelper::LockNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);  // Start prefetching next half

    // Finish the first halves
    SpParHelper::UnlockWindows(Aowner, rowwins1);
    SpParHelper::UnlockWindows(Bowner, colwins1);

    for(int i = 1; i < stages; ++i)
    {
        SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);

        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

        delete ARecv1;  // free the memory of the previous first half
        delete BRecv1;

        SpParHelper::UnlockWindows(Aowner, rowwins2);   // Finish the second half
        SpParHelper::UnlockWindows(Bowner, colwins2);

        Aowner = (i+Aoffset) % stages;
        Bowner = (i+Boffset) % stages;

        // start fetching the current first half
        SpParHelper::LockNFetch(ARecv1, Aowner, rowwins1, row_group, ARecvSizes1);
        SpParHelper::LockNFetch(BRecv1, Bowner, colwins1, col_group, BRecvSizes1);

        // while multiplying the already completed previous second halves
        C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

        delete ARecv2;  // free memory of the previous second half
        delete BRecv2;

        // wait for the current first half to complete
        SpParHelper::UnlockWindows(Aowner, rowwins1);
        SpParHelper::UnlockWindows(Bowner, colwins1);

        // start prefetching the current second half
        SpParHelper::LockNFetch(ARecv2, Aowner, rowwins2, row_group, ARecvSizes2);
        SpParHelper::LockNFetch(BRecv2, Bowner, colwins2, col_group, BRecvSizes2);
    }

    SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv1, *BRecv1, false, true);
    if(!C_cont->isZero())
        tomerge.push_back(C_cont);

    delete ARecv1;  // free the memory of the last fetched first half
    delete BRecv1;

    SpParHelper::UnlockWindows(Aowner, rowwins2);
    SpParHelper::UnlockWindows(Bowner, colwins2);

    C_cont = MultiplyReturnTuples<SR>(*ARecv2, *BRecv2, false, true);
    if(!C_cont->isZero())
        tomerge.push_back(C_cont);

    delete ARecv2;
    delete BRecv2;

    SpHelper::deallocate2D(ARecvSizes1, UDERA::esscount);
    SpHelper::deallocate2D(ARecvSizes2, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes1, UDERB::esscount);
    SpHelper::deallocate2D(BRecvSizes2, UDERB::esscount);

    DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL);    // First get the result in SpTuples, then convert to UDER
    for(int i=0; i<tomerge.size(); ++i)
    {
        delete tomerge[i];
    }

    SpParHelper::FreeWindows(rowwins1);
    SpParHelper::FreeWindows(rowwins2);
    SpParHelper::FreeWindows(colwins1);
    SpParHelper::FreeWindows(colwins2);

    (A.spSeq)->Merge(A1seq, A2seq);
    (B.spSeq)->Merge(B1seq, B2seq);

    MPI_Group_free(&row_group);
    MPI_Group_free(&col_group);
    const_cast< UDERB* >(B.spSeq)->Transpose();     // transpose back to original
    return SpParMat<IU,N_promote,DER_promote> (C, GridC);  // return the result object
}

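/**
 * Parallel C = A * B routine synchronized with MPI_Win_fence. Two complete
 * sets of windows are created and alternated between consecutive stages so
 * that a fence closing one epoch does not stall the Gets already issued on
 * the other set. NOTE: the original documentation block is not part of this
 * listing; this summary is reconstructed from the code below.
 */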
template <typename SR, typename IU, typename NU1, typename NU2, typename UDERA, typename UDERB>
SpParMat<IU,typename promote_trait<NU1,NU2>::T_promote,typename promote_trait<UDERA,UDERB>::T_promote> Mult_AnXBn_Fence
        (const SpParMat<IU,NU1,UDERA> & A, const SpParMat<IU,NU2,UDERB> & B)
{
    typedef typename promote_trait<NU1,NU2>::T_promote N_promote;
    typedef typename promote_trait<UDERA,UDERB>::T_promote DER_promote;

    if(A.getncol() != B.getnrow())
    {
        std::cout << "Cannot multiply, dimensions do not match" << std::endl;
        MPI_Abort(MPI_COMM_WORLD, DIMMISMATCH);
    }

    int stages, Aoffset, Boffset;   // stages = inner dimension of matrix blocks
    std::shared_ptr<CommGrid> GridC = ProductGrid((A.commGrid).get(), (B.commGrid).get(), stages, Aoffset, Boffset);

    std::ofstream oput;
    GridC->OpenDebugFile("deb", oput);
    const_cast< UDERB* >(B.spSeq)->Transpose();

    // set row & col window handles
    std::vector<MPI_Win> rowwindows, colwindows;
    std::vector<MPI_Win> rowwinnext, colwinnext;
    SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), *(A.spSeq), rowwindows);
    SpParHelper::SetWindows((B.commGrid)->GetColWorld(), *(B.spSeq), colwindows);
    SpParHelper::SetWindows((A.commGrid)->GetRowWorld(), *(A.spSeq), rowwinnext);
    SpParHelper::SetWindows((B.commGrid)->GetColWorld(), *(B.spSeq), colwinnext);
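    // Two identical window sets on purpose: the loop below alternates
    // between them (odd stages fence on rowwindows/colwindows, even stages
    // on rowwinnext/colwinnext), giving fence-based double buffering.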

    IU ** ARecvSizes = SpHelper::allocate2D<IU>(UDERA::esscount, stages);
    IU ** BRecvSizes = SpHelper::allocate2D<IU>(UDERB::esscount, stages);

    SpParHelper::GetSetSizes( *(A.spSeq), ARecvSizes, (A.commGrid)->GetRowWorld());
    SpParHelper::GetSetSizes( *(B.spSeq), BRecvSizes, (B.commGrid)->GetColWorld());

    UDERA * ARecv, * ARecvNext;
    UDERB * BRecv, * BRecvNext;
    std::vector< SpTuples<IU,N_promote> *> tomerge;

    // Prefetch first
    for(int j=0; j< rowwindows.size(); ++j)
        MPI_Win_fence(MPI_MODE_NOPRECEDE, rowwindows[j]);
    for(int j=0; j< colwindows.size(); ++j)
        MPI_Win_fence(MPI_MODE_NOPRECEDE, colwindows[j]);

    for(int j=0; j< rowwinnext.size(); ++j)
        MPI_Win_fence(MPI_MODE_NOPRECEDE, rowwinnext[j]);
    for(int j=0; j< colwinnext.size(); ++j)
        MPI_Win_fence(MPI_MODE_NOPRECEDE, colwinnext[j]);

    int Aownind = (0+Aoffset) % stages;
    int Bownind = (0+Boffset) % stages;
    if(Aownind == (A.commGrid)->GetRankInProcRow())
    {
        ARecv = A.spSeq;    // shallow-copy
    }
    else
    {
        std::vector<IU> ess1(UDERA::esscount);  // pack essentials to a vector
        for(int j=0; j< UDERA::esscount; ++j)
        {
            ess1[j] = ARecvSizes[j][Aownind];
        }
        ARecv = new UDERA();    // create the object first

        oput << "For A (out), Fetching " << (void*)rowwindows[0] << std::endl;
        SpParHelper::FetchMatrix(*ARecv, ess1, rowwindows, Aownind);    // fetch its elements later
    }
    if(Bownind == (B.commGrid)->GetRankInProcCol())
    {
        BRecv = B.spSeq;    // shallow-copy
    }
    else
    {
        std::vector<IU> ess2(UDERB::esscount);  // pack essentials to a vector
        for(int j=0; j< UDERB::esscount; ++j)
        {
            ess2[j] = BRecvSizes[j][Bownind];
        }
        BRecv = new UDERB();

        oput << "For B (out), Fetching " << (void*)colwindows[0] << std::endl;
        SpParHelper::FetchMatrix(*BRecv, ess2, colwindows, Bownind);    // no-lock version, only Get
    }

    int Aownprev = Aownind;
    int Bownprev = Bownind;

    for(int i = 1; i < stages; ++i)
    {
        Aownind = (i+Aoffset) % stages;
        Bownind = (i+Boffset) % stages;

        if(i % 2 == 1)  // Fetch RecvNext via winnext, fence on Recv via windows
        {
            if(Aownind == (A.commGrid)->GetRankInProcRow())
            {
                ARecvNext = A.spSeq;    // shallow-copy
            }
            else
            {
                std::vector<IU> ess1(UDERA::esscount);  // pack essentials to a vector
                for(int j=0; j< UDERA::esscount; ++j)
                {
                    ess1[j] = ARecvSizes[j][Aownind];
                }
                ARecvNext = new UDERA();    // create the object first

                oput << "For A, Fetching " << (void*) rowwinnext[0] << std::endl;
                SpParHelper::FetchMatrix(*ARecvNext, ess1, rowwinnext, Aownind);
            }

            if(Bownind == (B.commGrid)->GetRankInProcCol())
            {
                BRecvNext = B.spSeq;    // shallow-copy
            }
            else
            {
                std::vector<IU> ess2(UDERB::esscount);  // pack essentials to a vector
                for(int j=0; j< UDERB::esscount; ++j)
                {
                    ess2[j] = BRecvSizes[j][Bownind];
                }
                BRecvNext = new UDERB();

                oput << "For B, Fetching " << (void*)colwinnext[0] << std::endl;
                SpParHelper::FetchMatrix(*BRecvNext, ess2, colwinnext, Bownind);    // no-lock version, only Get
            }

            oput << "Fencing " << (void*) rowwindows[0] << std::endl;
            oput << "Fencing " << (void*) colwindows[0] << std::endl;

            for(int j=0; j< rowwindows.size(); ++j)
                MPI_Win_fence(MPI_MODE_NOSTORE, rowwindows[j]);     // Synch using "other" windows
            for(int j=0; j< colwindows.size(); ++j)
                MPI_Win_fence(MPI_MODE_NOSTORE, colwindows[j]);

            SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv, *BRecv, false, true);
            if(!C_cont->isZero())
                tomerge.push_back(C_cont);

            if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecv;
            if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecv;

            Aownprev = Aownind;
            Bownprev = Bownind;
        }
        else    // fetch to Recv via windows, fence on RecvNext via winnext
        {
            if(Aownind == (A.commGrid)->GetRankInProcRow())
            {
                ARecv = A.spSeq;    // shallow-copy
            }
            else
            {
                std::vector<IU> ess1(UDERA::esscount);  // pack essentials to a vector
                for(int j=0; j< UDERA::esscount; ++j)
                {
                    ess1[j] = ARecvSizes[j][Aownind];
                }
                ARecv = new UDERA();    // create the object first

                oput << "For A, Fetching " << (void*) rowwindows[0] << std::endl;
                SpParHelper::FetchMatrix(*ARecv, ess1, rowwindows, Aownind);
            }

            if(Bownind == (B.commGrid)->GetRankInProcCol())
            {
                BRecv = B.spSeq;    // shallow-copy
            }
            else
            {
                std::vector<IU> ess2(UDERB::esscount);  // pack essentials to a vector
                for(int j=0; j< UDERB::esscount; ++j)
                {
                    ess2[j] = BRecvSizes[j][Bownind];
                }
                BRecv = new UDERB();

                oput << "For B, Fetching " << (void*)colwindows[0] << std::endl;
                SpParHelper::FetchMatrix(*BRecv, ess2, colwindows, Bownind);    // no-lock version, only Get
            }

            oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
            oput << "Fencing " << (void*) colwinnext[0] << std::endl;

            for(int j=0; j< rowwinnext.size(); ++j)
                MPI_Win_fence(MPI_MODE_NOSTORE, rowwinnext[j]);     // Synch using "other" windows
            for(int j=0; j< colwinnext.size(); ++j)
                MPI_Win_fence(MPI_MODE_NOSTORE, colwinnext[j]);

            SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecvNext, *BRecvNext, false, true);
            if(!C_cont->isZero())
                tomerge.push_back(C_cont);

            if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecvNext;
            if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecvNext;

            Aownprev = Aownind;
            Bownprev = Bownind;
        }

    }

    if(stages % 2 == 1)     // fence on Recv via windows
    {
        oput << "Fencing " << (void*) rowwindows[0] << std::endl;
        oput << "Fencing " << (void*) colwindows[0] << std::endl;

        for(int j=0; j< rowwindows.size(); ++j)
            MPI_Win_fence(MPI_MODE_NOSUCCEED, rowwindows[j]);   // Synch using "prev" windows
        for(int j=0; j< colwindows.size(); ++j)
            MPI_Win_fence(MPI_MODE_NOSUCCEED, colwindows[j]);

        SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecv, *BRecv, false, true);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

        if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecv;
        if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecv;
    }
    else    // fence on RecvNext via winnext
    {
        oput << "Fencing " << (void*) rowwinnext[0] << std::endl;
        oput << "Fencing " << (void*) colwinnext[0] << std::endl;

        for(int j=0; j< rowwinnext.size(); ++j)
            MPI_Win_fence(MPI_MODE_NOSUCCEED, rowwinnext[j]);   // Synch using "prev" windows
        for(int j=0; j< colwinnext.size(); ++j)
            MPI_Win_fence(MPI_MODE_NOSUCCEED, colwinnext[j]);

        SpTuples<IU,N_promote> * C_cont = MultiplyReturnTuples<SR>(*ARecvNext, *BRecvNext, false, true);
        if(!C_cont->isZero())
            tomerge.push_back(C_cont);

        if(Aownprev != (A.commGrid)->GetRankInProcRow()) delete ARecvNext;
        if(Bownprev != (B.commGrid)->GetRankInProcCol()) delete BRecvNext;
    }
    for(int i=0; i< rowwindows.size(); ++i)
    {
        MPI_Win_free(&rowwindows[i]);
        MPI_Win_free(&rowwinnext[i]);
    }
    for(int i=0; i< colwindows.size(); ++i)
    {
        MPI_Win_free(&colwindows[i]);
        MPI_Win_free(&colwinnext[i]);
    }
    MPI_Barrier(GridC->GetWorld());

    IU C_m = A.spSeq->getnrow();
    IU C_n = B.spSeq->getncol();
    DER_promote * C = new DER_promote(MergeAll<SR>(tomerge, C_m, C_n), false, NULL);    // First get the result in SpTuples, then convert to UDER
    for(int i=0; i<tomerge.size(); ++i)
    {
        delete tomerge[i];
    }
    SpHelper::deallocate2D(ARecvSizes, UDERA::esscount);
    SpHelper::deallocate2D(BRecvSizes, UDERB::esscount);

    const_cast< UDERB* >(B.spSeq)->Transpose();     // transpose back to original
    return SpParMat<IU,N_promote,DER_promote> (C, GridC);  // return the result object
}

}

#endif