PROGRAM possibly_recoverable_simulation USE, INTRINSIC :: ISO_FORTRAN_ENV, ONLY:TEAM_TYPE, STAT_FAILED_IMAGE IMPLICIT NONE INTEGER, ALLOCATABLE :: failures (:) ! Indices of the failed images. INTEGER, ALLOCATABLE :: old_failures(:) ! Previous failures. INTEGER, ALLOCATABLE :: map(:) ! For each spare image k in use, ! map(k) holds the index of the failed image it replaces. INTEGER :: images_spare ! No. spare images. Not altered in main loop. INTEGER :: images_used ! Max index of image in use INTEGER :: failed ! Index of a failed image INTEGER :: i, j, k ! Temporaries INTEGER :: status ! stat= value INTEGER :: team_number [*] ! 1 if in working team; 2 otherwise. INTEGER :: local_index [*] ! Index of the image in the team TYPE (TEAM_TYPE) :: simulation_team LOGICAL :: read_checkpoint ! If read_checkpoint true on ! entering simulation_procedure, go back to previous check point. LOGICAL :: done [*] ! True if computation finished on the image LOGICAL :: start ! True initially. ! Keep 1% spare images if we have a lot, just 1 if 10-199 images, ! 0 if <10. images_spare = MAX(INT(0.01*NUM_IMAGES()),0,MIN(NUM_IMAGES()-10,1)) images_used = NUM_IMAGES () - images_spare ALLOCATE ( old_failures(0), map(images_used+1:NUM_IMAGES()) ) start = .true. outer : DO local_index = THIS_IMAGE () team_number = MERGE (1, 2, local_index<=images_used) SYNC ALL (STAT = status) IF (status/=0 .AND. status/=STAT_FAILED_IMAGE) EXIT outer IF (IMAGE_STATUS (1) == STAT_FAILED_IMAGE) & ERROR STOP "cannot recover" IF (THIS_IMAGE () == 1) THEN ! For each newly failed image in team 1, move into team 1 a ! non-failed image of team 2. failures = FAILED_IMAGES () ! Note that the values ! returned by FAILED_IMAGES increase monotonically. k = images_used j = 1 DO i = 1, SIZE (failures) IF (failures(i) > images_used) EXIT ! This failed image and ! all further failed images are in team 2 and do not matter. failed = failures(i) ! Check whether this is an old failed image. IF (j <= SIZE (old_failures)) THEN IF (failed == old_failures(j) ) THEN j = j+1 CYCLE ! No action needed for old failed image. END IF END IF ! Allow for the failed image being a replacement image IF( failed > NUM_IMAGES()-images_spare ) failed = map(failed) ! Seek a non-failed image DO k = k+1, NUM_IMAGES () IF (IMAGE_STATUS (k) == 0) EXIT END DO IF (k > NUM_IMAGES ()) ERROR STOP "cannot recover" local_index [k] = failed team_number [k] = 1 map(k) = failed END DO old_failures = failures images_used = k ! Find the local indices of team 2 j = 0 DO k = k+1, NUM_IMAGES () IF (IMAGE_STATUS (k) == 0) THEN j = j+1 local_index[k] = j END IF END DO END IF SYNC ALL (STAT = status) IF (status/=0 .AND. status/=STAT_FAILED_IMAGE) EXIT outer ! ! Set up a simulation team of constant size. ! Team 2 is the set of spares, so does not participate. FORM TEAM (team_number, simulation_team, NEW_INDEX=local_index, & STAT=status) IF (status/=0 .AND. status/=STAT_FAILED_IMAGE) EXIT outer simulation : CHANGE TEAM (simulation_team, STAT=status) IF (status == STAT_FAILED_IMAGE) EXIT simulation IF (start) read_checkpoint = .FALSE. start = .FALSE. IF (team_number == 1) THEN iter : DO CALL simulation_procedure (read_checkpoint, status, done) ! The simulation_procedure: ! - sets up and performs some part of the simulation; ! - resets to the last checkpoint if requested; ! - sets status from its internal synchronizations; ! - sets done to .TRUE. when the simulation has completed. IF (status == STAT_FAILED_IMAGE) THEN read_checkpoint = .TRUE. EXIT simulation ELSE IF (done) THEN EXIT iter END IF read_checkpoint = .FALSE. END DO iter END IF END TEAM (STAT=status) simulation SYNC ALL (STAT=status) IF (THIS_IMAGE () > images_used) done = done[1] IF (done) EXIT outer END DO outer IF (status/=0 .AND. status/=STAT_FAILED_IMAGE) & PRINT *,'Unexpected failure',status END PROGRAM possibly_recoverable_simulation