1 /*
2  * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3  *                         University Research and Technology
4  *                         Corporation.  All rights reserved.
5  * Copyright (c) 2004-2011 The University of Tennessee and The University
6  *                         of Tennessee Research Foundation.  All rights
7  *                         reserved.
8  * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9  *                         University of Stuttgart.  All rights reserved.
10  * Copyright (c) 2004-2005 The Regents of the University of California.
11  *                         All rights reserved.
12  * Copyright (c) 2006-2013 Los Alamos National Security, LLC.
13  *                         All rights reserved.
14  * Copyright (c) 2009-2012 Cisco Systems, Inc.  All rights reserved.
15  * Copyright (c) 2011      Oak Ridge National Labs.  All rights reserved.
16  * Copyright (c) 2013-2018 Intel, Inc.  All rights reserved.
17  * Copyright (c) 2015      Mellanox Technologies, Inc.  All rights reserved.
18  * $COPYRIGHT$
19  *
20  * Additional copyrights may follow
21  *
22  * $HEADER$
23  *
24  */
25 
26 #define _GNU_SOURCE
27 #include <stdbool.h>
28 #include <stdio.h>
29 #include <stdlib.h>
30 #include <unistd.h>
31 #include <time.h>
32 
33 #include <pmix_tool.h>
34 
35 /* define a structure for collecting returned
36  * info from a query */
37 typedef struct {
38     volatile bool active;
39     pmix_info_t *info;
40     size_t ninfo;
41 } myquery_data_t;
42 
43 
44 static pmix_proc_t myproc;
45 
46 /* this is a callback function for the PMIx_Query
47  * API. The query will callback with a status indicating
48  * if the request could be fully satisfied, partially
49  * satisfied, or completely failed. The info parameter
50  * contains an array of the returned data, with the
51  * info->key field being the key that was provided in
52  * the query call. Thus, you can correlate the returned
53  * data in the info->value field to the requested key.
54  *
55  * Once we have dealt with the returned data, we must
56  * call the release_fn so that the PMIx library can
57  * cleanup */
cbfunc(pmix_status_t status,pmix_info_t * info,size_t ninfo,void * cbdata,pmix_release_cbfunc_t release_fn,void * release_cbdata)58 static void cbfunc(pmix_status_t status,
59                    pmix_info_t *info, size_t ninfo,
60                    void *cbdata,
61                    pmix_release_cbfunc_t release_fn,
62                    void *release_cbdata)
63 {
64     myquery_data_t *mq = (myquery_data_t*)cbdata;
65     size_t n;
66 
67     /* save the returned info - it will be
68      * released in the release_fn */
69     if (0 < ninfo) {
70         PMIX_INFO_CREATE(mq->info, ninfo);
71         mq->ninfo = ninfo;
72         for (n=0; n < ninfo; n++) {
73             fprintf(stderr, "Transferring %s\n", info[n].key);
74             PMIX_INFO_XFER(&mq->info[n], &info[n]);
75         }
76     }
77 
78     /* let the library release the data */
79     if (NULL != release_fn) {
80         release_fn(release_cbdata);
81     }
82 
83     /* release the block */
84     mq->active = false;
85 }
86 
87 /* this is the event notification function we pass down below
88  * when registering for general events - i.e.,, the default
89  * handler. We don't technically need to register one, but it
90  * is usually good practice to catch any events that occur */
notification_fn(size_t evhdlr_registration_id,pmix_status_t status,const pmix_proc_t * source,pmix_info_t info[],size_t ninfo,pmix_info_t results[],size_t nresults,pmix_event_notification_cbfunc_fn_t cbfunc,void * cbdata)91 static void notification_fn(size_t evhdlr_registration_id,
92                             pmix_status_t status,
93                             const pmix_proc_t *source,
94                             pmix_info_t info[], size_t ninfo,
95                             pmix_info_t results[], size_t nresults,
96                             pmix_event_notification_cbfunc_fn_t cbfunc,
97                             void *cbdata)
98 {
99     if (NULL != cbfunc) {
100         cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
101     }
102 }
103 
104 /* event handler registration is done asynchronously because it
105  * may involve the PMIx server registering with the host RM for
106  * external events. So we provide a callback function that returns
107  * the status of the request (success or an error), plus a numerical index
108  * to the registered event. The index is used later on to deregister
109  * an event handler - if we don't explicitly deregister it, then the
110  * PMIx server will do so when it see us exit */
evhandler_reg_callbk(pmix_status_t status,size_t evhandler_ref,void * cbdata)111 static void evhandler_reg_callbk(pmix_status_t status,
112                                  size_t evhandler_ref,
113                                  void *cbdata)
114 {
115     volatile int *active = (volatile int*)cbdata;
116 
117     if (PMIX_SUCCESS != status) {
118         fprintf(stderr, "Client %s:%d EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
119                    myproc.nspace, myproc.rank, status, (unsigned long)evhandler_ref);
120     }
121     *active = status;
122 }
123 
main(int argc,char ** argv)124 int main(int argc, char **argv)
125 {
126     pmix_status_t rc;
127     pmix_value_t *val;
128     pmix_proc_t proc;
129     pmix_info_t *info;
130     size_t ninfo;
131     volatile int active;
132     pmix_query_t *query;
133     size_t nq, n;
134     myquery_data_t myquery_data;
135 
136 fprintf(stderr, "I AM HERE\n");
137 fflush(stderr);
138     sleep(10);
139     exit(0);
140 
141     /* init us - since we were launched by the RM, our connection info
142      * will have been provided at startup. */
143     if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
144         fprintf(stderr, "Debugger daemon ns %s rank %d: PMIx_tool_init failed: %d\n", myproc.nspace, myproc.rank, rc);
145         exit(0);
146     }
147     fprintf(stderr, "Debugger daemon ns %s rank %d: Running\n", myproc.nspace, myproc.rank);
148 
149 
150     /* register our default event handler */
151     active = -1;
152     PMIx_Register_event_handler(NULL, 0, NULL, 0,
153                                 notification_fn, evhandler_reg_callbk, (void*)&active);
154     while (-1 == active) {
155         usleep(10);
156     }
157     if (0 != active) {
158         exit(active);
159     }
160 
161     /* get the nspace of the job we are to debug */
162     (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
163     proc.rank = PMIX_RANK_WILDCARD;
164     if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) {
165         fprintf(stderr, "[%s:%d] Failed to get job being debugged - error %d\n", myproc.nspace, myproc.rank, rc);
166         goto done;
167     }
168     if (NULL == val) {
169         fprintf(stderr, "Got NULL return\n");
170         goto done;
171     }
172     fprintf(stderr, "[%s:%d] Debugging %s\n", myproc.nspace, myproc.rank, val->data.string);
173 
174     /* get our local proctable - for scalability reasons, we don't want to
175      * have our "root" debugger process get the proctable for everybody and
176      * send it out to us. So ask the local PMIx server for the pid's of
177      * our local target processes */
178     nq = 1;
179     PMIX_QUERY_CREATE(query, nq);
180     PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_LOCAL_PROC_TABLE);
181     query[0].nqual = 1;
182     PMIX_INFO_CREATE(query[0].qualifiers, 1);
183     PMIX_INFO_LOAD(&query[0].qualifiers[0], PMIX_NSPACE, val->data.string, PMIX_STRING);  // the nspace we are enquiring about
184     /* setup the caddy to retrieve the data */
185     myquery_data.info = NULL;
186     myquery_data.ninfo = 0;
187     myquery_data.active = true;
188     /* execute the query */
189     if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) {
190         fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
191         goto done;
192     }
193     while (myquery_data.active) {
194         usleep(10);
195     }
196     fprintf(stderr, "[%s:%d] Local proctable received\n", myproc.nspace, myproc.rank);
197 
198 
199     /* now that we have the proctable for our local processes, we can do our
200      * magic debugger stuff and attach to them. We then send a "release" event
201      * to them - i.e., it's the equivalent to setting the MPIR breakpoint. We
202      * do this with the event notification system */
203     (void)strncpy(proc.nspace, val->data.string, PMIX_MAX_NSLEN);
204     proc.rank = PMIX_RANK_WILDCARD;
205     /* we send the notification to just the local procs of the job being debugged */
206     ninfo = 1;
207     PMIX_INFO_CREATE(info, ninfo);
208     PMIX_INFO_LOAD(&info[0], PMIX_EVENT_CUSTOM_RANGE, &proc, PMIX_PROC);  // deliver to the target nspace
209     fprintf(stderr, "[%s:%u] Sending release\n", myproc.nspace, myproc.rank);
210     PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE,
211                       NULL, PMIX_RANGE_LOCAL,
212                       info, ninfo, NULL, NULL);
213 
214     /* do some debugger magic */
215     n = 0;
216     fprintf(stderr, "[%s:%u] Hanging around awhile, doing debugger magic\n", myproc.nspace, myproc.rank);
217     while (n < 5) {
218         usleep(1000);
219         ++n;
220     }
221 
222   done:
223     /* finalize us */
224     fprintf(stderr, "Debugger daemon ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
225     if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
226         fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
227     } else {
228         fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
229     }
230     fflush(stderr);
231     return(0);
232 }
233