1 /*
2 * Copyright (c) 2004-2010 The Trustees of Indiana University and Indiana
3 * University Research and Technology
4 * Corporation. All rights reserved.
5 * Copyright (c) 2004-2011 The University of Tennessee and The University
6 * of Tennessee Research Foundation. All rights
7 * reserved.
8 * Copyright (c) 2004-2005 High Performance Computing Center Stuttgart,
9 * University of Stuttgart. All rights reserved.
10 * Copyright (c) 2004-2005 The Regents of the University of California.
11 * All rights reserved.
12 * Copyright (c) 2006-2013 Los Alamos National Security, LLC.
13 * All rights reserved.
14 * Copyright (c) 2009-2012 Cisco Systems, Inc. All rights reserved.
15 * Copyright (c) 2011 Oak Ridge National Labs. All rights reserved.
16 * Copyright (c) 2013-2018 Intel, Inc. All rights reserved.
17 * Copyright (c) 2015 Mellanox Technologies, Inc. All rights reserved.
18 * $COPYRIGHT$
19 *
20 * Additional copyrights may follow
21 *
22 * $HEADER$
23 *
24 */
25
26 #define _GNU_SOURCE
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

#include <pmix_tool.h>
34
35 /* define a structure for collecting returned
36 * info from a query */
37 typedef struct {
38 volatile bool active;
39 pmix_info_t *info;
40 size_t ninfo;
41 } myquery_data_t;
42
43
44 static pmix_proc_t myproc;
45
46 /* this is a callback function for the PMIx_Query
47 * API. The query will callback with a status indicating
48 * if the request could be fully satisfied, partially
49 * satisfied, or completely failed. The info parameter
50 * contains an array of the returned data, with the
51 * info->key field being the key that was provided in
52 * the query call. Thus, you can correlate the returned
53 * data in the info->value field to the requested key.
54 *
55 * Once we have dealt with the returned data, we must
56 * call the release_fn so that the PMIx library can
57 * cleanup */
cbfunc(pmix_status_t status,pmix_info_t * info,size_t ninfo,void * cbdata,pmix_release_cbfunc_t release_fn,void * release_cbdata)58 static void cbfunc(pmix_status_t status,
59 pmix_info_t *info, size_t ninfo,
60 void *cbdata,
61 pmix_release_cbfunc_t release_fn,
62 void *release_cbdata)
63 {
64 myquery_data_t *mq = (myquery_data_t*)cbdata;
65 size_t n;
66
67 /* save the returned info - it will be
68 * released in the release_fn */
69 if (0 < ninfo) {
70 PMIX_INFO_CREATE(mq->info, ninfo);
71 mq->ninfo = ninfo;
72 for (n=0; n < ninfo; n++) {
73 fprintf(stderr, "Transferring %s\n", info[n].key);
74 PMIX_INFO_XFER(&mq->info[n], &info[n]);
75 }
76 }
77
78 /* let the library release the data */
79 if (NULL != release_fn) {
80 release_fn(release_cbdata);
81 }
82
83 /* release the block */
84 mq->active = false;
85 }
86
87 /* this is the event notification function we pass down below
88 * when registering for general events - i.e.,, the default
89 * handler. We don't technically need to register one, but it
90 * is usually good practice to catch any events that occur */
notification_fn(size_t evhdlr_registration_id,pmix_status_t status,const pmix_proc_t * source,pmix_info_t info[],size_t ninfo,pmix_info_t results[],size_t nresults,pmix_event_notification_cbfunc_fn_t cbfunc,void * cbdata)91 static void notification_fn(size_t evhdlr_registration_id,
92 pmix_status_t status,
93 const pmix_proc_t *source,
94 pmix_info_t info[], size_t ninfo,
95 pmix_info_t results[], size_t nresults,
96 pmix_event_notification_cbfunc_fn_t cbfunc,
97 void *cbdata)
98 {
99 if (NULL != cbfunc) {
100 cbfunc(PMIX_EVENT_ACTION_COMPLETE, NULL, 0, NULL, NULL, cbdata);
101 }
102 }
103
104 /* event handler registration is done asynchronously because it
105 * may involve the PMIx server registering with the host RM for
106 * external events. So we provide a callback function that returns
107 * the status of the request (success or an error), plus a numerical index
108 * to the registered event. The index is used later on to deregister
109 * an event handler - if we don't explicitly deregister it, then the
110 * PMIx server will do so when it see us exit */
evhandler_reg_callbk(pmix_status_t status,size_t evhandler_ref,void * cbdata)111 static void evhandler_reg_callbk(pmix_status_t status,
112 size_t evhandler_ref,
113 void *cbdata)
114 {
115 volatile int *active = (volatile int*)cbdata;
116
117 if (PMIX_SUCCESS != status) {
118 fprintf(stderr, "Client %s:%d EVENT HANDLER REGISTRATION FAILED WITH STATUS %d, ref=%lu\n",
119 myproc.nspace, myproc.rank, status, (unsigned long)evhandler_ref);
120 }
121 *active = status;
122 }
123
main(int argc,char ** argv)124 int main(int argc, char **argv)
125 {
126 pmix_status_t rc;
127 pmix_value_t *val;
128 pmix_proc_t proc;
129 pmix_info_t *info;
130 size_t ninfo;
131 volatile int active;
132 pmix_query_t *query;
133 size_t nq, n;
134 myquery_data_t myquery_data;
135
136 fprintf(stderr, "I AM HERE\n");
137 fflush(stderr);
138 sleep(10);
139 exit(0);
140
141 /* init us - since we were launched by the RM, our connection info
142 * will have been provided at startup. */
143 if (PMIX_SUCCESS != (rc = PMIx_tool_init(&myproc, NULL, 0))) {
144 fprintf(stderr, "Debugger daemon ns %s rank %d: PMIx_tool_init failed: %d\n", myproc.nspace, myproc.rank, rc);
145 exit(0);
146 }
147 fprintf(stderr, "Debugger daemon ns %s rank %d: Running\n", myproc.nspace, myproc.rank);
148
149
150 /* register our default event handler */
151 active = -1;
152 PMIx_Register_event_handler(NULL, 0, NULL, 0,
153 notification_fn, evhandler_reg_callbk, (void*)&active);
154 while (-1 == active) {
155 usleep(10);
156 }
157 if (0 != active) {
158 exit(active);
159 }
160
161 /* get the nspace of the job we are to debug */
162 (void)strncpy(proc.nspace, myproc.nspace, PMIX_MAX_NSLEN);
163 proc.rank = PMIX_RANK_WILDCARD;
164 if (PMIX_SUCCESS != (rc = PMIx_Get(&proc, PMIX_DEBUG_JOB, NULL, 0, &val))) {
165 fprintf(stderr, "[%s:%d] Failed to get job being debugged - error %d\n", myproc.nspace, myproc.rank, rc);
166 goto done;
167 }
168 if (NULL == val) {
169 fprintf(stderr, "Got NULL return\n");
170 goto done;
171 }
172 fprintf(stderr, "[%s:%d] Debugging %s\n", myproc.nspace, myproc.rank, val->data.string);
173
174 /* get our local proctable - for scalability reasons, we don't want to
175 * have our "root" debugger process get the proctable for everybody and
176 * send it out to us. So ask the local PMIx server for the pid's of
177 * our local target processes */
178 nq = 1;
179 PMIX_QUERY_CREATE(query, nq);
180 PMIX_ARGV_APPEND(rc, query[0].keys, PMIX_QUERY_LOCAL_PROC_TABLE);
181 query[0].nqual = 1;
182 PMIX_INFO_CREATE(query[0].qualifiers, 1);
183 PMIX_INFO_LOAD(&query[0].qualifiers[0], PMIX_NSPACE, val->data.string, PMIX_STRING); // the nspace we are enquiring about
184 /* setup the caddy to retrieve the data */
185 myquery_data.info = NULL;
186 myquery_data.ninfo = 0;
187 myquery_data.active = true;
188 /* execute the query */
189 if (PMIX_SUCCESS != (rc = PMIx_Query_info_nb(query, nq, cbfunc, (void*)&myquery_data))) {
190 fprintf(stderr, "PMIx_Query_info failed: %d\n", rc);
191 goto done;
192 }
193 while (myquery_data.active) {
194 usleep(10);
195 }
196 fprintf(stderr, "[%s:%d] Local proctable received\n", myproc.nspace, myproc.rank);
197
198
199 /* now that we have the proctable for our local processes, we can do our
200 * magic debugger stuff and attach to them. We then send a "release" event
201 * to them - i.e., it's the equivalent to setting the MPIR breakpoint. We
202 * do this with the event notification system */
203 (void)strncpy(proc.nspace, val->data.string, PMIX_MAX_NSLEN);
204 proc.rank = PMIX_RANK_WILDCARD;
205 /* we send the notification to just the local procs of the job being debugged */
206 ninfo = 1;
207 PMIX_INFO_CREATE(info, ninfo);
208 PMIX_INFO_LOAD(&info[0], PMIX_EVENT_CUSTOM_RANGE, &proc, PMIX_PROC); // deliver to the target nspace
209 fprintf(stderr, "[%s:%u] Sending release\n", myproc.nspace, myproc.rank);
210 PMIx_Notify_event(PMIX_ERR_DEBUGGER_RELEASE,
211 NULL, PMIX_RANGE_LOCAL,
212 info, ninfo, NULL, NULL);
213
214 /* do some debugger magic */
215 n = 0;
216 fprintf(stderr, "[%s:%u] Hanging around awhile, doing debugger magic\n", myproc.nspace, myproc.rank);
217 while (n < 5) {
218 usleep(1000);
219 ++n;
220 }
221
222 done:
223 /* finalize us */
224 fprintf(stderr, "Debugger daemon ns %s rank %d: Finalizing\n", myproc.nspace, myproc.rank);
225 if (PMIX_SUCCESS != (rc = PMIx_Finalize(NULL, 0))) {
226 fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize failed: %d\n", myproc.nspace, myproc.rank, rc);
227 } else {
228 fprintf(stderr, "Debugger daemon ns %s rank %d:PMIx_Finalize successfully completed\n", myproc.nspace, myproc.rank);
229 }
230 fflush(stderr);
231 return(0);
232 }
233