summary | refs | log | tree | commit | diff
path: root/net/mpich/files/patch-l0-fallback
blob: 35f18dc272a5b577c1cfd875eb1d8d153f444f32 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
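Do not abort MPI initialization when MPL_gpu_init() fails with
MPL_ERR_GPU_INTERNAL (e.g. the GPU runtime is not installed); continue
without GPU support instead. Without this patch:

$ pkg delete intel-compute-runtime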
$ mpivars
PCI: Failed to initialize libpciaccess with pci_system_init(): 6 (Permission denied)
Abort(268484367) on node 0 (rank 0 in comm 0): Fatal error in PMPI_Init_thread: Other MPI error, error stack:
MPIR_Init_thread(153):  gpu_init failed
[unset]: write_line error; fd=-1 buf=:cmd=abort exitcode=268484367
:
system msg for write_line failure : Bad file descriptor
Attempting to use an MPI routine before initializing MPICH

--- src/mpi/init/initthread.c.orig	2021-05-25 17:37:05 UTC
+++ src/mpi/init/initthread.c
@@ -150,7 +150,9 @@ int MPIR_Init_thread(int *argc, char ***argv, int user
      * inside MPID_Init */
     if (MPIR_CVAR_ENABLE_GPU) {
         int mpl_errno = MPL_gpu_init();
-        MPIR_ERR_CHKANDJUMP(mpl_errno != MPL_SUCCESS, mpi_errno, MPI_ERR_OTHER, "**gpu_init");
+        MPIR_ERR_CHKANDJUMP(
+            mpl_errno != MPL_SUCCESS && mpl_errno != MPL_ERR_GPU_INTERNAL,
+            mpi_errno, MPI_ERR_OTHER, "**gpu_init");
     }
 
     MPL_atomic_store_int(&MPIR_Process.mpich_state, MPICH_MPI_STATE__IN_INIT);
--- src/mpid/ch4/netmod/ofi/ofi_init.c.orig	2021-05-25 17:37:05 UTC
+++ src/mpid/ch4/netmod/ofi/ofi_init.c
@@ -731,7 +731,6 @@ int MPIDI_OFI_mpi_init_hook(int rank, int size, int ap
             MPL_gpu_malloc_host(&(MPIDI_OFI_global.am_bufs[i]), MPIDI_OFI_AM_BUFF_SZ);
             MPIDI_OFI_global.am_reqs[i].event_id = MPIDI_OFI_EVENT_AM_RECV;
             MPIDI_OFI_global.am_reqs[i].index = i;
-            MPIR_Assert(MPIDI_OFI_global.am_bufs[i]);
             MPIDI_OFI_global.am_iov[i].iov_base = MPIDI_OFI_global.am_bufs[i];
             MPIDI_OFI_global.am_iov[i].iov_len = MPIDI_OFI_AM_BUFF_SZ;
             MPIDI_OFI_global.am_msg[i].msg_iov = &MPIDI_OFI_global.am_iov[i];
--- src/mpl/src/gpu/mpl_gpu_ze.c.orig	2021-05-25 17:37:05 UTC
+++ src/mpl/src/gpu/mpl_gpu_ze.c
@@ -33,7 +33,7 @@ int MPL_gpu_get_dev_count(int *dev_cnt, int *dev_id)
 {
     int ret = MPL_SUCCESS;
     if (!gpu_initialized) {
-        ret = MPL_gpu_init();
+        MPL_gpu_init();
     }
 
     *dev_cnt = device_count;