1 #include "HalideRuntimeOpenGL.h"
2 #include "device_interface.h"
3 #include "mini_opengl.h"
4 #include "printer.h"
5 
6 // This constant is used to indicate that the application will take
7 // responsibility for binding the output render target before calling the
8 // Halide function.
9 #define HALIDE_OPENGL_RENDER_TARGET ((uint64_t)-1)
10 
// Implementation note: all functions that directly or indirectly access the
// runtime state in global_state must be declared as WEAK, otherwise
// the behavior at runtime is undefined.
14 
// List of all OpenGL functions used by the runtime. The list is used to
// declare and initialize the dispatch table in GlobalState below: each
// GLFUNC(type, name) entry becomes a function-pointer member of GlobalState,
// is reset to NULL by GlobalState::init(), and is loaded under the name
// "gl<name>" with required=true by halide_opengl_init(), which fails if any
// of them cannot be resolved.
//
// NOTE(review): Enable, Uniform4iv and Uniform4fv are declared with the
// pointer typedefs of Disable, Uniform2iv and Uniform1fv respectively --
// presumably because the signatures are identical; confirm against
// mini_opengl.h.
#define USED_GL_FUNCTIONS                                                \
    GLFUNC(PFNGLDELETETEXTURESPROC, DeleteTextures);                     \
    GLFUNC(PFNGLGENTEXTURESPROC, GenTextures);                           \
    GLFUNC(PFNGLBINDTEXTUREPROC, BindTexture);                           \
    GLFUNC(PFNGLGETERRORPROC, GetError);                                 \
    GLFUNC(PFNGLVIEWPORTPROC, Viewport);                                 \
    GLFUNC(PFNGLGENBUFFERSPROC, GenBuffers);                             \
    GLFUNC(PFNGLDELETEBUFFERSPROC, DeleteBuffers);                       \
    GLFUNC(PFNGLBINDBUFFERPROC, BindBuffer);                             \
    GLFUNC(PFNGLBUFFERDATAPROC, BufferData);                             \
    GLFUNC(PFNGLTEXPARAMETERIPROC, TexParameteri);                       \
    GLFUNC(PFNGLTEXIMAGE2DPROC, TexImage2D);                             \
    GLFUNC(PFNGLTEXSUBIMAGE2DPROC, TexSubImage2D);                       \
    GLFUNC(PFNGLDISABLEPROC, Disable);                                   \
    GLFUNC(PFNGLDISABLEPROC, Enable);                                    \
    GLFUNC(PFNGLCREATESHADERPROC, CreateShader);                         \
    GLFUNC(PFNGLACTIVETEXTUREPROC, ActiveTexture);                       \
    GLFUNC(PFNGLSHADERSOURCEPROC, ShaderSource);                         \
    GLFUNC(PFNGLCOMPILESHADERPROC, CompileShader);                       \
    GLFUNC(PFNGLGETSHADERIVPROC, GetShaderiv);                           \
    GLFUNC(PFNGLGETSHADERINFOLOGPROC, GetShaderInfoLog);                 \
    GLFUNC(PFNGLDELETESHADERPROC, DeleteShader);                         \
    GLFUNC(PFNGLCREATEPROGRAMPROC, CreateProgram);                       \
    GLFUNC(PFNGLATTACHSHADERPROC, AttachShader);                         \
    GLFUNC(PFNGLLINKPROGRAMPROC, LinkProgram);                           \
    GLFUNC(PFNGLGETPROGRAMIVPROC, GetProgramiv);                         \
    GLFUNC(PFNGLGETPROGRAMINFOLOGPROC, GetProgramInfoLog);               \
    GLFUNC(PFNGLUSEPROGRAMPROC, UseProgram);                             \
    GLFUNC(PFNGLDELETEPROGRAMPROC, DeleteProgram);                       \
    GLFUNC(PFNGLGETUNIFORMLOCATIONPROC, GetUniformLocation);             \
    GLFUNC(PFNGLUNIFORM1IVPROC, Uniform1iv);                             \
    GLFUNC(PFNGLUNIFORM2IVPROC, Uniform2iv);                             \
    GLFUNC(PFNGLUNIFORM2IVPROC, Uniform4iv);                             \
    GLFUNC(PFNGLUNIFORM1FVPROC, Uniform1fv);                             \
    GLFUNC(PFNGLUNIFORM1FVPROC, Uniform4fv);                             \
    GLFUNC(PFNGLGENFRAMEBUFFERSPROC, GenFramebuffers);                   \
    GLFUNC(PFNGLDELETEFRAMEBUFFERSPROC, DeleteFramebuffers);             \
    GLFUNC(PFNGLCHECKFRAMEBUFFERSTATUSPROC, CheckFramebufferStatus);     \
    GLFUNC(PFNGLBINDFRAMEBUFFERPROC, BindFramebuffer);                   \
    GLFUNC(PFNGLFRAMEBUFFERTEXTURE2DPROC, FramebufferTexture2D);         \
    GLFUNC(PFNGLGETATTRIBLOCATIONPROC, GetAttribLocation);               \
    GLFUNC(PFNGLVERTEXATTRIBPOINTERPROC, VertexAttribPointer);           \
    GLFUNC(PFNGLDRAWELEMENTSPROC, DrawElements);                         \
    GLFUNC(PFNGLENABLEVERTEXATTRIBARRAYPROC, EnableVertexAttribArray);   \
    GLFUNC(PFNGLDISABLEVERTEXATTRIBARRAYPROC, DisableVertexAttribArray); \
    GLFUNC(PFNGLGETVERTEXATTRIBIVPROC, GetVertexAttribiv);               \
    GLFUNC(PFNGLPIXELSTOREIPROC, PixelStorei);                           \
    GLFUNC(PFNGLREADPIXELS, ReadPixels);                                 \
    GLFUNC(PFNGLGETSTRINGPROC, GetString);                               \
    GLFUNC(PFNGLGETINTEGERV, GetIntegerv);                               \
    GLFUNC(PFNGLGETBOOLEANV, GetBooleanv);                               \
    GLFUNC(PFNGLFINISHPROC, Finish);
69 
// List of all OpenGL functions used by the runtime which may not
// exist due to an older or less capable version of GL. Before using any
// of these functions, code must test whether the pointer is NULL.
// They are loaded with required=false by init_extensions(), so a missing
// entry is left as NULL instead of failing initialization.
// Unlike USED_GL_FUNCTIONS, the last entry has no trailing semicolon; every
// expansion site in this file supplies it.
#define OPTIONAL_GL_FUNCTIONS                            \
    GLFUNC(PFNGLGENVERTEXARRAYS, GenVertexArrays);       \
    GLFUNC(PFNGLBINDVERTEXARRAY, BindVertexArray);       \
    GLFUNC(PFNGLDELETEVERTEXARRAYS, DeleteVertexArrays); \
    GLFUNC(PFNDRAWBUFFERS, DrawBuffers)
78 
79 // ---------- Types ----------
80 
81 using namespace Halide::Runtime::Internal;
82 
83 namespace Halide {
84 namespace Runtime {
85 namespace Internal {
86 namespace OpenGL {
87 
88 extern WEAK halide_device_interface_t opengl_device_interface;
89 
// Translate an OpenGL error code into the name of the matching GL_* constant.
// Any code that is not listed yields "<unknown GL error>". The result always
// points at a string literal, so it must not be freed.
WEAK const char *gl_error_name(int32_t err) {
    switch (err) {
    case 0x500:
        return "GL_INVALID_ENUM";
    case 0x501:
        return "GL_INVALID_VALUE";
    case 0x502:
        return "GL_INVALID_OPERATION";
    case 0x503:
        return "GL_STACK_OVERFLOW";
    case 0x504:
        return "GL_STACK_UNDERFLOW";
    case 0x505:
        return "GL_OUT_OF_MEMORY";
    case 0x506:
        return "GL_INVALID_FRAMEBUFFER_OPERATION";
    case 0x507:
        return "GL_CONTEXT_LOST";
    case 0x8031:
        return "GL_TABLE_TOO_LARGE";
    default:
        return "<unknown GL error>";
    }
}
126 
127 struct HalideMalloc {
HalideMallocHalide::Runtime::Internal::OpenGL::HalideMalloc128     ALWAYS_INLINE HalideMalloc(void *user_context, size_t size)
129         : user_context(user_context), ptr(halide_malloc(user_context, size)) {
130     }
~HalideMallocHalide::Runtime::Internal::OpenGL::HalideMalloc131     ALWAYS_INLINE ~HalideMalloc() {
132         halide_free(user_context, ptr);
133     }
134     void *const user_context;
135     void *const ptr;
136 };
137 
// Which flavor of the API the current context implements. halide_opengl_init()
// selects OpenGLES when the GL_VERSION string starts with "OpenGL ES " and
// OpenGL otherwise.
enum OpenGLProfile {
    OpenGL,
    OpenGLES
};
142 
// One argument of a GLSL kernel, as declared by a marker line in the comment
// header of the kernel source (see create_kernel()). Arguments of a kernel
// form a singly linked list through `next`.
struct Argument {
    // The kind of data stored in an argument
    enum Kind {
        Invalid,
        Uniform,  // uniform variable
        Varying,  // varying attribute
        Inbuf,    // input texture
        Outbuf    // output texture
    };

    // The elementary data type of the argument
    enum Type {
        Void,
        Bool,
        Float,
        Int8,
        Int16,
        Int32,
        UInt8,
        UInt16,
        UInt32
    };

    char *name;      // heap-allocated copy of the name; freed by delete_kernel()
    Kind kind;       // set by create_kernel() from the marker that introduced it
    Type type;       // set by parse_argument() from the declared type keyword
    Argument *next;  // next argument of the same kernel, or NULL
};
171 
// Everything the runtime keeps for one GLSL kernel. Instances are built by
// create_kernel() and destroyed by delete_kernel().
struct KernelInfo {
    char *name;           // taken from the "/// KERNEL " marker line
    char *source;         // heap-allocated copy of the kernel source text
    Argument *arguments;  // linked list of parsed arguments, in declaration order
    GLuint shader_id;     // NOTE(review): not assigned anywhere in the visible code
    GLuint program_id;    // 0 after create_kernel(); passed to DeleteProgram on deletion
};
179 
// Per-module runtime state: the kernel belonging to the module plus a link to
// the next module in the global state_list.
struct ModuleState {
    KernelInfo *kernel;  // set to NULL by halide_opengl_device_release()
    ModuleState *next;   // next entry of state_list, or NULL
};
184 
// All persistent state maintained by the runtime. A single instance,
// global_state, is defined below.
struct GlobalState {
    // Reset the fields to their defaults and every GL function pointer to NULL.
    void init();
    // Poll glGetError(); report a pending error (tagged with `location`) and
    // return true, or return false if there is none.
    bool CheckAndReportError(void *user_context, const char *location);

    // True once halide_opengl_init() has completed successfully.
    bool initialized;

    // Information about the OpenGL platform we're running on.
    OpenGLProfile profile;
    int major_version, minor_version;
    bool have_vertex_array_objects;
    bool have_texture_rg;
    bool have_texture_float;
    bool have_texture_rgb8_rgba8;

    // Various objects shared by all filter kernels
    GLuint framebuffer_id;
    GLuint vertex_array_object;
    GLuint vertex_buffer;
    GLuint element_buffer;

    // Declare one function pointer member per OpenGL function used by the
    // runtime; the member is named after the function without its "gl" prefix.
#define GLFUNC(PTYPE, VAR) PTYPE VAR
    USED_GL_FUNCTIONS;
    OPTIONAL_GL_FUNCTIONS;
#undef GLFUNC
};
212 
CheckAndReportError(void * user_context,const char * location)213 WEAK bool GlobalState::CheckAndReportError(void *user_context, const char *location) {
214     GLenum err = GetError();
215     if (err != GL_NO_ERROR) {
216         error(user_context) << "OpenGL error " << gl_error_name(err) << "(" << (int)err << ")"
217                             << " at " << location << ".\n";
218         return true;
219     }
220     return false;
221 }
222 
223 WEAK GlobalState global_state;
224 
// Saves & restores OpenGL state: the constructor calls save() to record the
// state listed below, and the destructor calls restore() to put it back.
class GLStateSaver {
public:
    ALWAYS_INLINE GLStateSaver() {
        save();
    }
    ALWAYS_INLINE ~GLStateSaver() {
        restore();
    }

private:
    // The state variables
    GLint active_texture;
    GLint array_buffer_binding;
    GLint element_array_buffer_binding;
    GLint framebuffer_binding;
    GLint program;
    // Only recorded and restored when global_state.have_vertex_array_objects.
    GLint vertex_array_binding;
    GLint viewport[4];
    GLboolean cull_face;
    GLboolean depth_test;
    // Number of entries in texture_2d_binding.
    int max_combined_texture_image_units;
    // GL_TEXTURE_BINDING_2D of every texture unit; allocated by save() and
    // freed by restore().
    GLint *texture_2d_binding;
    // Number of entries in vertex_attrib_array_enabled.
    int max_vertex_attribs;
    // GL_VERTEX_ATTRIB_ARRAY_ENABLED of every attribute index; allocated by
    // save() and freed by restore().
    GLint *vertex_attrib_array_enabled;

    // Define these out-of-line as WEAK, to avoid LLVM error "MachO doesn't support COMDATs"
    void save();
    void restore();
};
255 
save()256 WEAK void GLStateSaver::save() {
257     global_state.GetIntegerv(GL_ACTIVE_TEXTURE, &active_texture);
258     global_state.GetIntegerv(GL_ARRAY_BUFFER_BINDING, &array_buffer_binding);
259     global_state.GetIntegerv(GL_ELEMENT_ARRAY_BUFFER_BINDING, &element_array_buffer_binding);
260     global_state.GetIntegerv(GL_FRAMEBUFFER_BINDING, &framebuffer_binding);
261     global_state.GetIntegerv(GL_CURRENT_PROGRAM, &program);
262     global_state.GetBooleanv(GL_CULL_FACE, &cull_face);
263     global_state.GetBooleanv(GL_DEPTH_TEST, &depth_test);
264     global_state.GetIntegerv(GL_VIEWPORT, viewport);
265 
266     global_state.GetIntegerv(GL_MAX_COMBINED_TEXTURE_IMAGE_UNITS, &max_combined_texture_image_units);
267     texture_2d_binding = (GLint *)malloc(max_combined_texture_image_units * sizeof(GLint));
268     for (int i = 0; i < max_combined_texture_image_units; i++) {
269         global_state.ActiveTexture(GL_TEXTURE0 + i);
270         global_state.GetIntegerv(GL_TEXTURE_BINDING_2D, &texture_2d_binding[i]);
271     }
272 
273     global_state.GetIntegerv(GL_MAX_VERTEX_ATTRIBS, &max_vertex_attribs);
274     vertex_attrib_array_enabled = (GLint *)malloc(max_vertex_attribs * sizeof(GLint));
275     for (int i = 0; i < max_vertex_attribs; i++) {
276         global_state.GetVertexAttribiv(i, GL_VERTEX_ATTRIB_ARRAY_ENABLED, &vertex_attrib_array_enabled[i]);
277     }
278 
279     if (global_state.have_vertex_array_objects) {
280         global_state.GetIntegerv(GL_VERTEX_ARRAY_BINDING, &vertex_array_binding);
281     }
282 
283 #ifdef DEBUG_RUNTIME
284     debug(NULL) << "Saved OpenGL state\n";
285 #endif
286 }
287 
restore()288 WEAK void GLStateSaver::restore() {
289 #ifdef DEBUG_RUNTIME
290     debug(NULL) << "Restoring OpenGL state\n";
291 #endif
292 
293     for (int i = 0; i < max_combined_texture_image_units; i++) {
294         global_state.ActiveTexture(GL_TEXTURE0 + i);
295         global_state.BindTexture(GL_TEXTURE_2D, texture_2d_binding[i]);
296     }
297     free(texture_2d_binding);
298 
299     for (int i = 0; i < max_vertex_attribs; i++) {
300         if (vertex_attrib_array_enabled[i])
301             global_state.EnableVertexAttribArray(i);
302         else
303             global_state.DisableVertexAttribArray(i);
304     }
305     free(vertex_attrib_array_enabled);
306 
307     if (global_state.have_vertex_array_objects) {
308         global_state.BindVertexArray(vertex_array_binding);
309     }
310 
311     global_state.ActiveTexture(active_texture);
312     global_state.BindFramebuffer(GL_FRAMEBUFFER, framebuffer_binding);
313     global_state.BindBuffer(GL_ARRAY_BUFFER, array_buffer_binding);
314     global_state.BindBuffer(GL_ELEMENT_ARRAY_BUFFER, element_array_buffer_binding);
315     global_state.UseProgram(program);
316     global_state.Viewport(viewport[0], viewport[1], viewport[2], viewport[3]);
317     (cull_face ? global_state.Enable : global_state.Disable)(GL_CULL_FACE);
318     (depth_test ? global_state.Enable : global_state.Disable)(GL_DEPTH_TEST);
319 }
320 
// A list of module-specific state. Each module corresponds to a single Halide filter
WEAK ModuleState *state_list;

// Line prefixes recognized by create_kernel() in the comment header at the
// start of a GLSL kernel source. The kernel marker is followed by the kernel
// name; each of the others is followed by a "type name" declaration that is
// handed to parse_argument().
WEAK const char *kernel_marker = "/// KERNEL ";
WEAK const char *input_marker = "/// IN_BUFFER ";
WEAK const char *output_marker = "/// OUT_BUFFER ";
WEAK const char *uniform_marker = "/// UNIFORM ";
WEAK const char *varying_marker = "/// VARYING ";
329 
330 // ---------- Helper functions ----------
331 
// Return a newly malloc'ed, NUL-terminated copy of the first n bytes of s, or
// NULL if the allocation fails. Exactly n bytes are copied with memcpy (there
// is no early stop at a NUL inside s), so s must have at least n readable
// bytes. The caller releases the result with free().
WEAK char *strndup(const char *s, size_t n) {
    char *p = (char *)malloc(n + 1);
    if (!p) {
        // Out of memory: report it to the caller instead of writing through NULL.
        return NULL;
    }
    memcpy(p, s, n);
    p[n] = '\0';
    return p;
}
338 
// Cut a string off at its first space, newline or NUL character, looking at
// no more than the first n characters. The terminator is written in place
// (at index n if none of those characters occurs earlier, so str must have
// room for n + 1 characters) and str itself is returned.
WEAK char *strstrip(char *str, size_t n) {
    size_t cut = 0;
    for (; cut < n; cut++) {
        const char c = str[cut];
        if (c == '\0' || c == '\n' || c == ' ') {
            break;
        }
    }
    str[cut] = '\0';
    return str;
}
349 
debug_buffer(void * user_context,halide_buffer_t * buf)350 WEAK void debug_buffer(void *user_context, halide_buffer_t *buf) {
351     debug(user_context) << *buf << "\n";
352 }
353 
make_shader(void * user_context,GLenum type,const char * source,GLint * length)354 WEAK GLuint make_shader(void *user_context, GLenum type,
355                         const char *source, GLint *length) {
356 #ifdef DEBUG_RUNTIME
357     {
358         debug(user_context) << ((type == GL_VERTEX_SHADER) ? "GL_VERTEX_SHADER" : "GL_FRAGMENT_SHADER")
359                             << " SOURCE:\n";
360         // debug() will go thru Printer<> which has a fixed, non-growing size.
361         // Just pass the source directly to halide_print instead, so it won't get clipped.
362         halide_print(user_context, source);
363     }
364 #endif
365 
366     GLuint shader = global_state.CreateShader(type);
367     if (global_state.CheckAndReportError(user_context, "make_shader(1)")) {
368         return 1;
369     }
370     if (*source == '\0') {
371         debug(user_context) << "Halide GLSL: passed shader source is empty, using default.\n";
372         const char *default_shader = "varying vec2 pixcoord;\n void main() { }";
373         global_state.ShaderSource(shader, 1, (const GLchar **)&default_shader, NULL);
374     } else {
375         global_state.ShaderSource(shader, 1, (const GLchar **)&source, length);
376     }
377     if (global_state.CheckAndReportError(user_context, "make_shader(2)")) {
378         return 1;
379     }
380     global_state.CompileShader(shader);
381     if (global_state.CheckAndReportError(user_context, "make_shader(3)")) {
382         return 1;
383     }
384 
385     GLint shader_ok = 0;
386     global_state.GetShaderiv(shader, GL_COMPILE_STATUS, &shader_ok);
387     if (!shader_ok) {
388         print(user_context) << "Could not compile shader:\n";
389         GLint log_len;
390         global_state.GetShaderiv(shader, GL_INFO_LOG_LENGTH, &log_len);
391         HalideMalloc log_tmp(user_context, log_len);
392         if (log_tmp.ptr) {
393             char *log = (char *)log_tmp.ptr;
394             global_state.GetShaderInfoLog(shader, log_len, NULL, log);
395             print(user_context) << log << "\n";
396         }
397         global_state.DeleteShader(shader);
398         return 0;
399     }
400     return shader;
401 }
402 
// Test whether the string s begins with `prefix`.
// On a match, returns a pointer to the first character of s after the
// prefix; otherwise returns NULL.
WEAK const char *match_prefix(const char *s, const char *prefix) {
    const size_t prefix_len = strlen(prefix);
    const bool matches = (strncmp(s, prefix, prefix_len) == 0);
    return matches ? s + prefix_len : NULL;
}
411 
412 // Parse declaration of the form "type name" and construct matching Argument.
parse_argument(void * user_context,const char * src,const char * end)413 WEAK Argument *parse_argument(void *user_context, const char *src,
414                               const char *end) {
415     const char *name;
416     Argument::Type type = Argument::Void;
417     if ((name = match_prefix(src, "float "))) {
418         type = Argument::Float;
419     } else if ((name = match_prefix(src, "bool "))) {
420         type = Argument::Bool;
421     } else if ((name = match_prefix(src, "int8_t "))) {
422         type = Argument::Int8;
423     } else if ((name = match_prefix(src, "int16_t "))) {
424         type = Argument::Int16;
425     } else if ((name = match_prefix(src, "int32_t "))) {
426         type = Argument::Int32;
427     } else if ((name = match_prefix(src, "uint8_t "))) {
428         type = Argument::UInt8;
429     } else if ((name = match_prefix(src, "uint16_t "))) {
430         type = Argument::UInt16;
431     } else if ((name = match_prefix(src, "uint32_t "))) {
432         type = Argument::UInt32;
433     }
434     if (type == Argument::Void) {
435         error(user_context) << "Internal error: argument type not supported";
436         return NULL;
437     }
438 
439     Argument *arg = (Argument *)malloc(sizeof(Argument));
440     arg->name = strndup(name, end - name);
441     arg->type = type;
442     arg->kind = Argument::Invalid;
443     arg->next = 0;
444     return arg;
445 }
446 
447 // Create KernelInfo for a piece of GLSL code
create_kernel(void * user_context,const char * src,int size)448 WEAK KernelInfo *create_kernel(void *user_context, const char *src, int size) {
449     KernelInfo *kernel = (KernelInfo *)malloc(sizeof(KernelInfo));
450 
451     kernel->source = strndup(src, size);
452     kernel->arguments = NULL;
453     kernel->program_id = 0;
454 
455     debug(user_context) << "Compiling GLSL kernel (size = " << size << "):\n";
456 
457     // Parse initial comment block
458     const char *line = kernel->source;
459     while (*line) {
460         const char *next_line = strchr(line, '\n') + 1;
461         if (!next_line)
462             next_line = line + size;
463 
464         const char *args;
465         if ((args = match_prefix(line, kernel_marker))) {
466             // set name
467             kernel->name = strstrip(strndup(args, next_line - args), next_line - args);
468         } else if ((args = match_prefix(line, uniform_marker))) {
469             if (Argument *arg =
470                     parse_argument(user_context, args, next_line - 1)) {
471                 arg->kind = Argument::Uniform;
472                 arg->next = kernel->arguments;
473                 kernel->arguments = arg;
474             } else {
475                 halide_error(user_context, "Invalid VAR marker");
476                 goto error;
477             }
478         } else if ((args = match_prefix(line, varying_marker))) {
479             if (Argument *arg =
480                     parse_argument(user_context, args, next_line - 1)) {
481                 arg->kind = Argument::Varying;
482                 arg->next = kernel->arguments;
483                 kernel->arguments = arg;
484             } else {
485                 halide_error(user_context, "Invalid VARYING marker");
486                 goto error;
487             }
488         } else if ((args = match_prefix(line, input_marker))) {
489             if (Argument *arg = parse_argument(user_context, args, next_line - 1)) {
490                 arg->kind = Argument::Inbuf;
491                 arg->next = kernel->arguments;
492                 kernel->arguments = arg;
493             } else {
494                 error(user_context) << "Invalid IN_BUFFER marker";
495                 goto error;
496             }
497         } else if ((args = match_prefix(line, output_marker))) {
498             if (Argument *arg = parse_argument(user_context, args, next_line - 1)) {
499                 arg->kind = Argument::Outbuf;
500                 arg->next = kernel->arguments;
501                 kernel->arguments = arg;
502             } else {
503                 error(user_context) << "Invalid OUT_BUFFER marker";
504                 goto error;
505             }
506         } else {
507             // Stop parsing if we encounter something we don't recognize
508             break;
509         }
510         line = next_line;
511     }
512 
513     // Arguments are currently in reverse order, flip the list.
514     {
515         Argument *cur = kernel->arguments;
516         kernel->arguments = NULL;
517         while (cur) {
518             Argument *next = cur->next;
519             cur->next = kernel->arguments;
520             kernel->arguments = cur;
521             cur = next;
522         }
523     }
524 
525     return kernel;
526 error:
527     free(kernel);
528     return NULL;
529 }
530 
531 // Delete all data associated with a kernel. Also release associated OpenGL
532 // shader and program.
delete_kernel(void * user_context,KernelInfo * kernel)533 WEAK void delete_kernel(void *user_context, KernelInfo *kernel) {
534     global_state.DeleteProgram(kernel->program_id);
535 #if 0  // TODO figure out why this got deleted.
536     global_state.DeleteShader(kernel->shader_id);
537 #endif
538 
539     Argument *arg = kernel->arguments;
540     while (arg) {
541         Argument *next = arg->next;
542         free(arg->name);
543         free(arg);
544         arg = next;
545     }
546     free(kernel->source);
547     free(kernel->name);
548     free(kernel);
549 }
550 
551 // Vertices and their order in a triangle strip for rendering a quad
552 // ranging from (-1,-1) to (1,1).
553 WEAK GLfloat quad_vertices[] = {
554     -1.0f, -1.0f, 1.0f, -1.0f,
555     -1.0f, 1.0f, 1.0f, 1.0f};
556 WEAK GLuint quad_indices[] = {0, 1, 2, 3};
557 
init()558 WEAK void GlobalState::init() {
559     initialized = false;
560     profile = OpenGL;
561     major_version = 2;
562     minor_version = 0;
563     framebuffer_id = 0;
564     vertex_array_object = vertex_buffer = element_buffer = 0;
565     have_vertex_array_objects = false;
566     have_texture_rg = false;
567     have_texture_rgb8_rgba8 = false;
568     // Initialize all GL function pointers to NULL
569 #define GLFUNC(type, name) name = NULL;
570     USED_GL_FUNCTIONS;
571     OPTIONAL_GL_FUNCTIONS;
572 #undef GLFUNC
573 }
574 
load_gl_func(void * user_context,const char * name,void ** ptr,bool required)575 WEAK int load_gl_func(void *user_context, const char *name, void **ptr, bool required) {
576     void *p = halide_opengl_get_proc_address(user_context, name);
577     if (!p && required) {
578         error(user_context) << "Could not load function pointer for " << name;
579         return -1;
580     }
581     *ptr = p;
582     return 0;
583 }
584 
extension_supported(void * user_context,const char * name)585 WEAK bool extension_supported(void *user_context, const char *name) {
586     // Iterate over space delimited extension strings. Note that glGetStringi
587     // is not part of GL ES 2.0, and not reliable in all implementations of
588     // GL ES 3.0.
589     const char *start = (const char *)global_state.GetString(GL_EXTENSIONS);
590     if (!start) {
591         return false;
592     }
593     while (const char *pos = strstr(start, name)) {
594         const char *end = pos + strlen(name);
595         // Ensure the found match is a full word, not a substring.
596         if ((pos == start || pos[-1] == ' ') &&
597             (*end == ' ' || *end == '\0')) {
598             return true;
599         }
600         start = end;
601     }
602 
603     return false;
604 }
605 
606 // Check for availability of various version- and extension-specific features
607 // and hook up functions pointers as necessary
init_extensions(void * user_context)608 WEAK void init_extensions(void *user_context) {
609     if (global_state.major_version >= 3) {  // This is likely valid for both OpenGL and OpenGL ES
610         load_gl_func(user_context, "glGenVertexArrays", (void **)&global_state.GenVertexArrays, false);
611         load_gl_func(user_context, "glBindVertexArray", (void **)&global_state.BindVertexArray, false);
612         load_gl_func(user_context, "glDeleteVertexArrays", (void **)&global_state.DeleteVertexArrays, false);
613         if (global_state.GenVertexArrays && global_state.BindVertexArray && global_state.DeleteVertexArrays) {
614             global_state.have_vertex_array_objects = true;
615         }
616     }
617     load_gl_func(user_context, "glDrawBuffers", (void **)&global_state.DrawBuffers, false);
618 
619     global_state.have_texture_rg =
620         global_state.major_version >= 3 ||
621         (global_state.profile == OpenGL &&
622          extension_supported(user_context, "GL_ARB_texture_rg")) ||
623         (global_state.profile == OpenGLES &&
624          extension_supported(user_context, "GL_EXT_texture_rg"));
625 
626     global_state.have_texture_rgb8_rgba8 =
627         global_state.major_version >= 3 ||
628         (global_state.profile == OpenGLES &&
629          extension_supported(user_context, "GL_OES_rgb8_rgba8"));
630 
631     global_state.have_texture_float =
632         (global_state.major_version >= 3) ||
633         (global_state.profile == OpenGL &&
634          extension_supported(user_context, "GL_ARB_texture_float")) ||
635         (global_state.profile == OpenGLES &&
636          extension_supported(user_context, "GL_OES_texture_float"));
637 }
638 
// Parse a run of decimal digits at the start of str.
//
// On success stores the value in *val and returns a pointer to the first
// character after the digits. If str does not start with a digit, returns
// NULL and leaves *val untouched. Values too large for an int saturate at
// the largest int instead of overflowing.
WEAK const char *parse_int(const char *str, int *val) {
    const int int_max = (int)(~0u >> 1);
    int v = 0;
    size_t i = 0;
    while (str[i] >= '0' && str[i] <= '9') {
        const int digit = str[i] - '0';
        if (v > (int_max - digit) / 10) {
            // 10 * v + digit would not fit; clamp and keep consuming digits.
            v = int_max;
        } else {
            v = 10 * v + digit;
        }
        i++;
    }
    if (i > 0) {
        *val = v;
        return &str[i];
    }
    return NULL;
}
652 
parse_opengl_version(const char * str,int * major,int * minor)653 WEAK const char *parse_opengl_version(const char *str, int *major, int *minor) {
654     str = parse_int(str, major);
655     if (str == NULL || *str != '.') {
656         return NULL;
657     }
658     return parse_int(str + 1, minor);
659 }
660 
661 // Initialize the OpenGL-specific parts of the runtime.
halide_opengl_init(void * user_context)662 WEAK int halide_opengl_init(void *user_context) {
663     if (global_state.initialized) {
664         return 0;
665     }
666 
667 #ifdef DEBUG_RUNTIME
668     halide_start_clock(user_context);
669 #endif
670 
671     global_state.init();
672 
673     // Make a context if there isn't one
674     if (halide_opengl_create_context(user_context)) {
675         error(user_context) << "Failed to make OpenGL context";
676         return -1;
677     }
678 
679     // Initialize pointers to core OpenGL functions.
680 #define GLFUNC(TYPE, VAR)                                                              \
681     if (load_gl_func(user_context, "gl" #VAR, (void **)&global_state.VAR, true) < 0) { \
682         return -1;                                                                     \
683     }
684     USED_GL_FUNCTIONS;
685 #undef GLFUNC
686 
687     const char *version = (const char *)global_state.GetString(GL_VERSION);
688     const char *gles_version = match_prefix(version, "OpenGL ES ");
689     int major, minor;
690     if (gles_version && parse_opengl_version(gles_version, &major, &minor)) {
691         global_state.profile = OpenGLES;
692         global_state.major_version = major;
693         global_state.minor_version = minor;
694     } else if (parse_opengl_version(version, &major, &minor)) {
695         global_state.profile = OpenGL;
696         global_state.major_version = major;
697         global_state.minor_version = minor;
698     } else {
699         global_state.profile = OpenGL;
700         global_state.major_version = 2;
701         global_state.minor_version = 0;
702     }
703     init_extensions(user_context);
704     debug(user_context)
705         << "Halide running on OpenGL " << ((global_state.profile == OpenGL) ? "" : "ES ") << major << "." << minor << "\n"
706         << "  vertex_array_objects: " << (global_state.have_vertex_array_objects ? "yes\n" : "no\n")
707         << "  texture_rg: " << (global_state.have_texture_rg ? "yes\n" : "no\n")
708         << "  have_texture_rgb8_rgba8: " << (global_state.have_texture_rgb8_rgba8 ? "yes\n" : "no\n")
709         << "  texture_float: " << (global_state.have_texture_float ? "yes\n" : "no\n");
710 
711     // Initialize framebuffer.
712     global_state.GenFramebuffers(1, &global_state.framebuffer_id);
713     if (global_state.CheckAndReportError(user_context, "halide_opengl_init GenFramebuffers")) {
714         return 1;
715     }
716 
717     // Initialize vertex and element buffers.
718     GLuint buf[2];
719     global_state.GenBuffers(2, buf);
720     global_state.BindBuffer(GL_ARRAY_BUFFER, buf[0]);
721     global_state.BufferData(GL_ARRAY_BUFFER, sizeof(quad_vertices), quad_vertices, GL_STATIC_DRAW);
722     global_state.BindBuffer(GL_ARRAY_BUFFER, 0);
723     global_state.BindBuffer(GL_ELEMENT_ARRAY_BUFFER, buf[1]);
724     global_state.BufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(quad_indices), quad_indices, GL_STATIC_DRAW);
725     global_state.BindBuffer(GL_ELEMENT_ARRAY_BUFFER, 0);
726     global_state.vertex_buffer = buf[0];
727     global_state.element_buffer = buf[1];
728 
729     if (global_state.have_vertex_array_objects) {
730         global_state.GenVertexArrays(1, &global_state.vertex_array_object);
731         if (global_state.CheckAndReportError(user_context, "halide_opengl_init GenVertexArrays")) {
732             return 1;
733         }
734     }
735 
736     global_state.initialized = true;
737     return 0;
738 }
739 
740 // Release all data allocated by the runtime.
741 //
742 // The OpenGL context itself is generally managed by the host application, so
743 // we leave it untouched.
halide_opengl_device_release(void * user_context)744 WEAK int halide_opengl_device_release(void *user_context) {
745     if (!global_state.initialized) {
746         return 0;
747     }
748 
749     debug(user_context) << "halide_opengl_release\n";
750     global_state.DeleteFramebuffers(1, &global_state.framebuffer_id);
751 
752     ModuleState *mod = state_list;
753     while (mod) {
754         delete_kernel(user_context, mod->kernel);
755         mod->kernel = NULL;
756         ModuleState *next = mod->next;
757         // do not call free(mod) to avoid dangling pointers: the module state
758         // is still referenced in the code generated by Halide (see
759         // CodeGen_GPU_Host::get_module_state).
760         mod = next;
761     }
762 
763     global_state.DeleteBuffers(1, &global_state.vertex_buffer);
764     global_state.DeleteBuffers(1, &global_state.element_buffer);
765     if (global_state.have_vertex_array_objects) {
766         global_state.DeleteVertexArrays(1, &global_state.vertex_array_object);
767     }
768 
769     global_state = GlobalState();
770 
771     return 0;
772 }
773 
774 // Determine OpenGL texture format and channel type for a given halide_buffer_t.
get_texture_format(void * user_context,halide_buffer_t * buf,GLint * internal_format,GLint * format,GLint * type)775 WEAK bool get_texture_format(void *user_context, halide_buffer_t *buf,
776                              GLint *internal_format, GLint *format, GLint *type) {
777     if (buf->type == halide_type_of<uint8_t>()) {
778         *type = GL_UNSIGNED_BYTE;
779     } else if (buf->type == halide_type_of<uint16_t>()) {
780         *type = GL_UNSIGNED_SHORT;
781     } else if (buf->type == halide_type_of<float>()) {
782         *type = GL_FLOAT;
783     } else {
784         error(user_context) << "OpenGL: Only uint8, uint16, and float textures are supported.";
785         return false;
786     }
787 
788     const int channels = (buf->dimensions > 2) ? buf->dim[2].extent : 0;
789 
790     // GL_LUMINANCE and GL_LUMINANCE_ALPHA aren't color-renderable in ES2, period,
791     // thus can't be read back via ReadPixels, thus are nearly useless to us.
792     // GL_RED and GL_RG are technically optional in ES2 (required in ES3),
793     // but as a practical matter, they are supported on pretty much every recent device
794     // (iOS: everything >= iPhone 4s; Android: everything >= 4.3 plus various older devices).
795     // This is definitely suboptimal; the only real alternative would be to implement
796     // these as GL_RGB or GL_RGBA, ignoring the extra channels.
797     if (channels <= 2 && !global_state.have_texture_rg) {
798         error(user_context) << "OpenGL: 1 and 2 channel textures are not supported for this version of OpenGL.";
799         return false;
800     }
801 
802     // Common formats supported by both GLES 2.0 and GL 2.1 are selected below
803     //
804     switch (channels) {
805     case 0:
806     case 1:
807         *format = GL_RED;
808         break;
809     case 2:
810         *format = GL_RG;
811         break;
812     case 3:
813         *format = GL_RGB;
814         break;
815     case 4:
816         *format = GL_RGBA;
817         break;
818     default:
819         error(user_context) << "OpenGL: Invalid number of color channels: " << channels;
820         return false;
821     }
822 
823     switch (global_state.profile) {
824     case OpenGLES:
825         // For OpenGL ES, the texture format has to match the pixel format
826         // since there no conversion is performed during texture transfers.
827         // See OES_texture_float.
828         *internal_format = *format;
829         break;
830     case OpenGL:
831         // For desktop OpenGL, the internal format specifiers include the
832         // precise data type, see ARB_texture_float.
833         if (*type == GL_FLOAT) {
834             switch (*format) {
835             case GL_RED:
836             case GL_RG:
837             case GL_RGB:
838             case GL_RGBA:
839                 *internal_format = GL_RGBA32F;
840                 break;
841             default:
842                 error(user_context) << "OpenGL: Cannot select internal format for format " << *format;
843                 return false;
844             }
845         } else {
846             *internal_format = *format;
847         }
848         break;
849     }
850 
851     return true;
852 }
853 
854 // This function returns the width, height and number of color channels that the
855 // texture for the specified halide_buffer_t will contain. It provides a single place
856 // to implement the logic snapping zero sized dimensions to one element.
get_texture_dimensions(void * user_context,halide_buffer_t * buf,GLint * width,GLint * height,GLint * channels)857 WEAK bool get_texture_dimensions(void *user_context, halide_buffer_t *buf, GLint *width,
858                                  GLint *height, GLint *channels) {
859     if (buf->dimensions > 3) {
860         error(user_context) << "The GL backend supports buffers of at most 3 dimensions\n";
861         return false;
862     }
863 
864     *width = buf->dim[0].extent;
865     if (*width == 0) {
866         error(user_context) << "Invalid dim[0].extent: " << *width << "\n";
867         return false;
868     }
869 
870     // GLES 2.0 supports GL_TEXTURE_2D (plus cube map), but not 1d or 3d. If we
871     // end up with a buffer that has a zero extent, set the corresponding size
872     // to one.
873     *height = (buf->dimensions > 1) ? buf->dim[1].extent : 1;
874     *channels = (buf->dimensions > 2) ? buf->dim[2].extent : 1;
875 
876     return true;
877 }
878 
879 // Allocate a new texture matching the dimension and color format of the
880 // specified buffer.
halide_opengl_device_malloc(void * user_context,halide_buffer_t * buf)881 WEAK int halide_opengl_device_malloc(void *user_context, halide_buffer_t *buf) {
882     if (int error = halide_opengl_init(user_context)) {
883         return error;
884     }
885 
886     if (!buf) {
887         error(user_context) << "Invalid buffer";
888         return 1;
889     }
890 
891     // If the texture was already created by the host application, check that
892     // it has the correct format. Otherwise, allocate and set up an
893     // appropriate texture.
894     GLuint tex = 0;
895     bool halide_allocated = false;
896 
897     if (buf->device) {
898 #ifdef HAVE_GLES3
899         // Look up the width and the height from the existing texture. Note that
900         // glGetTexLevelParameteriv does not support GL_TEXTURE_WIDTH or
901         // GL_TEXTURE_HEIGHT in GLES 2.0
902         GLint width, height;
903         global_state.BindTexture(GL_TEXTURE_2D, tex);
904         global_state.GetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_WIDTH, &width);
905         global_state.GetTexLevelParameteriv(GL_TEXTURE_2D, 0, GL_TEXTURE_HEIGHT, &height);
906         if (global_state.CheckAndReportError(user_context, "halide_opengl_device_malloc binding texture (GLES3)")) {
907             return 1;
908         }
909         if (width < buf->dim[0].extent || height < buf->dim[1].extent) {
910             error(user_context)
911                 << "Existing texture is smaller than buffer. "
912                 << "Texture size: " << width << "x" << height
913                 << ", buffer size: " << buf->dim[0].extent << "x" << buf->dim[1].extent;
914             return 1;
915         }
916 #endif
917         uint64_t handle = buf->device;
918         tex = (handle == HALIDE_OPENGL_RENDER_TARGET) ? 0 : (GLuint)handle;
919     } else {
920         if (buf->dimensions > 3) {
921             error(user_context) << "high-dimensional textures are not supported";
922             return 1;
923         }
924 
925         // Generate texture ID
926         global_state.GenTextures(1, &tex);
927         if (global_state.CheckAndReportError(user_context, "halide_opengl_device_malloc GenTextures")) {
928             global_state.DeleteTextures(1, &tex);
929             return 1;
930         }
931 
932         // Set parameters for this texture: no interpolation and clamp to edges.
933         global_state.BindTexture(GL_TEXTURE_2D, tex);
934         global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
935         global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
936         global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
937         global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
938         if (global_state.CheckAndReportError(user_context, "halide_opengl_device_malloc binding texture")) {
939             global_state.DeleteTextures(1, &tex);
940             return 1;
941         }
942 
943         // Create empty texture here and fill it with glTexSubImage2D later.
944         GLint internal_format, format, type;
945         if (!get_texture_format(user_context, buf, &internal_format, &format, &type)) {
946             error(user_context) << "Invalid texture format";
947             global_state.DeleteTextures(1, &tex);
948             return 1;
949         }
950 
951         GLint width, height, channels;
952         if (!get_texture_dimensions(user_context, buf, &width, &height, &channels)) {
953             error(user_context) << "Invalid texture dimensions";
954             return 1;
955         }
956 
957         global_state.TexImage2D(GL_TEXTURE_2D, 0, internal_format, width, height, 0, format, type, NULL);
958         if (global_state.CheckAndReportError(user_context, "halide_opengl_device_malloc TexImage2D")) {
959             global_state.DeleteTextures(1, &tex);
960             return 1;
961         }
962 
963         buf->device = tex;
964         buf->device_interface = &opengl_device_interface;
965         buf->device_interface->impl->use_module();
966         halide_allocated = true;
967         debug(user_context) << "Allocated texture " << tex
968                             << " of size " << width << " x " << height << "\n";
969 
970         global_state.BindTexture(GL_TEXTURE_2D, 0);
971     }
972 
973     return 0;
974 }
975 
976 // Delete all texture information associated with a buffer.
halide_opengl_device_free(void * user_context,halide_buffer_t * buf)977 WEAK int halide_opengl_device_free(void *user_context, halide_buffer_t *buf) {
978     if (!global_state.initialized) {
979         error(user_context) << "OpenGL runtime not initialized in call to halide_opengl_device_free.";
980         return 1;
981     }
982 
983     if (buf->device == 0) {
984         return 0;
985     }
986 
987     uint64_t handle = buf->device;
988     GLuint tex = (handle == HALIDE_OPENGL_RENDER_TARGET) ? 0 : (GLuint)handle;
989 
990     int result = 0;
991     debug(user_context) << "halide_opengl_device_free: Deleting texture " << tex << "\n";
992     global_state.DeleteTextures(1, &tex);
993     if (global_state.CheckAndReportError(user_context, "halide_opengl_device_free DeleteTextures")) {
994         result = 1;
995         // do not return: we want to zero out the interface and
996         // device fields even if we can't delete the texture.
997     }
998     buf->device = 0;
999     buf->device_interface->impl->release_module();
1000     buf->device_interface = NULL;
1001 
1002     return result;
1003 }
1004 
1005 // Can't use std::min, std::max in Halide runtime.
1006 template<typename T>
std_min(T a,T b)1007 ALWAYS_INLINE T std_min(T a, T b) {
1008     return (a < b) ? a : b;
1009 }
1010 template<typename T>
std_max(T a,T b)1011 ALWAYS_INLINE T std_max(T a, T b) {
1012     return (a > b) ? a : b;
1013 }
1014 
1015 // This method copies image data from the layout specified by the strides of the
1016 // halide_buffer_t to the packed interleaved format needed by GL. It is assumed that
1017 // src and dst have the same number of channels.
1018 template<class T>
halide_to_interleaved(const halide_buffer_t * src_buf,T * dst)1019 ALWAYS_INLINE void halide_to_interleaved(const halide_buffer_t *src_buf, T *dst) {
1020     const T *src = reinterpret_cast<const T *>(src_buf->host);
1021     int width = (src_buf->dimensions > 0) ? src_buf->dim[0].extent : 1;
1022     int height = (src_buf->dimensions > 1) ? src_buf->dim[1].extent : 1;
1023     int channels = (src_buf->dimensions > 2) ? src_buf->dim[2].extent : 1;
1024     int x_stride = (src_buf->dimensions > 0) ? src_buf->dim[0].stride : 0;
1025     int y_stride = (src_buf->dimensions > 1) ? src_buf->dim[1].stride : 0;
1026     int c_stride = (src_buf->dimensions > 2) ? src_buf->dim[2].stride : 0;
1027     for (int y = 0; y < height; y++) {
1028         int dstidx = y * width * channels;
1029         for (int x = 0; x < width; x++) {
1030             int srcidx = y * y_stride + x * x_stride;
1031             for (int c = 0; c < channels; c++) {
1032                 dst[dstidx] = src[srcidx];
1033                 srcidx += c_stride;
1034                 dstidx += 1;
1035             }
1036         }
1037     }
1038 }
1039 
1040 // This method copies image data from the packed interleaved format needed by GL
1041 // to the arbitrary strided layout specified by the halide_buffer_t. If src has fewer
1042 // channels than dst, the excess in dst will be left untouched; if src has
1043 // more channels than dst, the excess will be ignored.
1044 template<class T>
interleaved_to_halide(void * user_context,const T * src,int src_channels,halide_buffer_t * dst_buf)1045 ALWAYS_INLINE void interleaved_to_halide(void *user_context, const T *src, int src_channels, halide_buffer_t *dst_buf) {
1046     T *dst = reinterpret_cast<T *>(dst_buf->host);
1047     int width = (dst_buf->dimensions > 0) ? dst_buf->dim[0].extent : 1;
1048     int height = (dst_buf->dimensions > 1) ? dst_buf->dim[1].extent : 1;
1049     int dst_channels = (dst_buf->dimensions > 2) ? dst_buf->dim[2].extent : 1;
1050     int x_stride = (dst_buf->dimensions > 0) ? dst_buf->dim[0].stride : 0;
1051     int y_stride = (dst_buf->dimensions > 1) ? dst_buf->dim[1].stride : 0;
1052     int c_stride = (dst_buf->dimensions > 2) ? dst_buf->dim[2].stride : 0;
1053     int src_skip = std_max(0, src_channels - dst_channels);
1054     int channels = std_min<int>(src_channels, dst_channels);
1055 
1056     for (int y = 0; y < height; y++) {
1057         int srcidx = y * width * src_channels;
1058         for (int x = 0; x < width; x++) {
1059             int dstidx = y * y_stride + x * x_stride;
1060             for (int c = 0; c < channels; c++) {
1061                 dst[dstidx] = src[srcidx];
1062                 srcidx += 1;
1063                 dstidx += c_stride;
1064             }
1065             srcidx += src_skip;
1066         }
1067     }
1068 }
1069 
1070 // Copy image data from host memory to texture.
halide_opengl_copy_to_device(void * user_context,halide_buffer_t * buf)1071 WEAK int halide_opengl_copy_to_device(void *user_context, halide_buffer_t *buf) {
1072     if (!global_state.initialized) {
1073         error(user_context) << "OpenGL runtime not initialized (halide_opengl_copy_to_device).";
1074         return 1;
1075     }
1076 
1077     GLStateSaver state_saver;
1078 
1079     int err = halide_opengl_device_malloc(user_context, buf);
1080     if (err) {
1081         return err;
1082     }
1083 
1084     if (!buf->host || !buf->device) {
1085         debug_buffer(user_context, buf);
1086         error(user_context) << "Invalid copy_to_device operation: host or device NULL";
1087         return 1;
1088     }
1089 
1090     uint64_t handle = buf->device;
1091     if (handle == HALIDE_OPENGL_RENDER_TARGET) {
1092         // TODO: this isn't correct; we want to ensure we copy to the current render_target.
1093         debug(user_context) << "halide_opengl_copy_to_device: called for HALIDE_OPENGL_RENDER_TARGET\n";
1094         return 0;
1095     }
1096     GLuint tex = (GLuint)handle;
1097     debug(user_context) << "halide_opengl_copy_to_device: " << tex << "\n";
1098 
1099     global_state.BindTexture(GL_TEXTURE_2D, tex);
1100     if (global_state.CheckAndReportError(user_context, "halide_opengl_copy_to_device BindTexture")) {
1101         return 1;
1102     }
1103     GLint internal_format, format, type;
1104     if (!get_texture_format(user_context, buf, &internal_format, &format, &type)) {
1105         error(user_context) << "Invalid texture format";
1106         return 1;
1107     }
1108 
1109     GLint width, height, buffer_channels;
1110     if (!get_texture_dimensions(user_context, buf, &width, &height, &buffer_channels)) {
1111         error(user_context) << "Invalid texture dimensions";
1112         return 1;
1113     }
1114 
1115     // To use TexSubImage2D directly, the colors must be stored interleaved
1116     // and rows must be stored consecutively.
1117     // (Single-channel buffers are "interleaved" for our purposes here.)
1118     bool is_interleaved = (buffer_channels == 1) || (buf->dim[2].stride == 1 && buf->dim[0].stride == buf->dim[2].extent);
1119     bool is_packed = (buf->dim[1].stride == buf->dim[0].extent * buf->dim[0].stride);
1120     if (is_interleaved && is_packed) {
1121         global_state.PixelStorei(GL_UNPACK_ALIGNMENT, 1);
1122         global_state.TexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, type, buf->host);
1123         if (global_state.CheckAndReportError(user_context, "halide_opengl_copy_to_device TexSubImage2D(1)")) {
1124             return 1;
1125         }
1126     } else {
1127         debug(user_context)
1128             << "Warning: In copy_to_device, host buffer is not interleaved. Doing slow interleave.\n";
1129 
1130         size_t texture_size = width * height * buffer_channels * buf->type.bytes();
1131         HalideMalloc tmp(user_context, texture_size);
1132         if (!tmp.ptr) {
1133             error(user_context) << "halide_malloc failed inside copy_to_device";
1134             return -1;
1135         }
1136 
1137         switch (type) {
1138         case GL_UNSIGNED_BYTE:
1139             halide_to_interleaved<uint8_t>(buf, (uint8_t *)tmp.ptr);
1140             break;
1141         case GL_UNSIGNED_SHORT:
1142             halide_to_interleaved<uint16_t>(buf, (uint16_t *)tmp.ptr);
1143             break;
1144         case GL_FLOAT:
1145             halide_to_interleaved<float>(buf, (float *)tmp.ptr);
1146             break;
1147         }
1148 
1149         global_state.PixelStorei(GL_UNPACK_ALIGNMENT, 1);
1150         global_state.TexSubImage2D(GL_TEXTURE_2D, 0, 0, 0, width, height, format, type, tmp.ptr);
1151         if (global_state.CheckAndReportError(user_context, "halide_opengl_copy_to_device TexSubImage2D(2)")) {
1152             return 1;
1153         }
1154     }
1155 
1156     return 0;
1157 }
1158 
1159 // Copy image data from texture back to host memory.
halide_opengl_copy_to_host(void * user_context,halide_buffer_t * buf)1160 WEAK int halide_opengl_copy_to_host(void *user_context, halide_buffer_t *buf) {
1161     if (!global_state.initialized) {
1162         error(user_context) << "OpenGL runtime not initialized (halide_opengl_copy_to_host).";
1163         return 1;
1164     }
1165 
1166     GLStateSaver state_saver;
1167 
1168     if (!buf->host || !buf->device) {
1169         debug_buffer(user_context, buf);
1170         error(user_context) << "Invalid copy_to_host operation: host or dev NULL";
1171         return 1;
1172     }
1173 
1174     GLint internal_format, format, type;
1175     if (!get_texture_format(user_context, buf, &internal_format, &format, &type)) {
1176         error(user_context) << "Invalid texture format";
1177         return 1;
1178     }
1179 
1180     GLint width, height, buffer_channels;
1181     if (!get_texture_dimensions(user_context, buf, &width, &height, &buffer_channels)) {
1182         error(user_context) << "Invalid texture dimensions";
1183         return 1;
1184     }
1185     GLint texture_channels = buffer_channels;
1186 
1187     uint64_t handle = buf->device;
1188     if (handle != HALIDE_OPENGL_RENDER_TARGET) {
1189         GLuint tex = (GLuint)handle;
1190         debug(user_context) << "halide_copy_to_host: texture " << tex << "\n";
1191         global_state.BindFramebuffer(GL_FRAMEBUFFER, global_state.framebuffer_id);
1192         if (global_state.CheckAndReportError(user_context, "copy_to_host BindFramebuffer")) {
1193             return 1;
1194         }
1195         global_state.FramebufferTexture2D(GL_FRAMEBUFFER, GL_COLOR_ATTACHMENT0, GL_TEXTURE_2D, tex, 0);
1196         if (global_state.CheckAndReportError(user_context, "copy_to_host FramebufferTexture2D")) {
1197             return 1;
1198         }
1199     } else {
1200         debug(user_context) << "halide_copy_to_host: HALIDE_OPENGL_RENDER_TARGET\n";
1201     }
1202 
1203     // Check that framebuffer is set up correctly
1204     GLenum status = global_state.CheckFramebufferStatus(GL_FRAMEBUFFER);
1205     if (status != GL_FRAMEBUFFER_COMPLETE) {
1206         error(user_context)
1207             << "Setting up GL framebuffer " << global_state.framebuffer_id << " failed " << status;
1208         return 1;
1209     }
1210 
1211     // The only format/type pairs guaranteed to be readable in GLES2 are GL_RGBA+GL_UNSIGNED_BYTE,
1212     // plus one other implementation-dependent pair specified here. Spoiler alert:
1213     // some ES2 implementations return that very same pair here (i.e., they don't support
1214     // any other formats); in that case, we need to read as RGBA and manually convert to
1215     // what we need (usually GL_RGB).
1216     // NOTE: this requires the currently-bound Framebuffer is correct.
1217     // TODO: short and float will require even more effort on top of this.
1218     if (global_state.profile == OpenGLES && format == GL_RGB) {
1219         GLint extra_format, extra_type;
1220         global_state.GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_TYPE, &extra_type);
1221         if (type != GL_UNSIGNED_BYTE && type != extra_type) {
1222             error(user_context) << "ReadPixels does not support our type; we don't handle this yet.\n";
1223             return 1;
1224         }
1225         global_state.GetIntegerv(GL_IMPLEMENTATION_COLOR_READ_FORMAT, &extra_format);
1226         if (format != GL_RGBA && format != extra_format) {
1227             debug(user_context) << "ReadPixels does not support our format; falling back to GL_RGBA\n";
1228             format = GL_RGBA;
1229             texture_channels = 4;
1230         }
1231     }
1232 
1233     // To download the texture directly, the colors must be stored interleaved
1234     // and rows must be stored consecutively.
1235     // (Single-channel buffers are "interleaved" for our purposes here.)
1236     bool is_interleaved = (buffer_channels == 1) || (buf->dim[2].stride == 1 && buf->dim[0].stride == buf->dim[2].extent);
1237     bool is_packed = (buf->dim[1].stride == buf->dim[0].extent * buf->dim[0].stride);
1238     if (is_interleaved && is_packed && texture_channels == buffer_channels) {
1239         global_state.PixelStorei(GL_PACK_ALIGNMENT, 1);
1240 #ifdef DEBUG_RUNTIME
1241         int64_t t1 = halide_current_time_ns(user_context);
1242 #endif
1243         global_state.ReadPixels(0, 0, buf->dim[0].extent, buf->dim[1].extent, format, type, buf->host);
1244 #ifdef DEBUG_RUNTIME
1245         int64_t t2 = halide_current_time_ns(user_context);
1246 #endif
1247         if (global_state.CheckAndReportError(user_context, "copy_to_host ReadPixels (1)")) {
1248             return 1;
1249         }
1250 #ifdef DEBUG_RUNTIME
1251         debug(user_context) << "ReadPixels(1) time: " << (t2 - t1) / 1e3 << "usec\n";
1252 #endif
1253     } else {
1254         debug(user_context)
1255             << "Warning: In copy_to_host, host buffer is not interleaved, or not a native format. Doing slow deinterleave.\n";
1256 
1257         size_t texture_size = width * height * texture_channels * buf->type.bytes();
1258         HalideMalloc tmp(user_context, texture_size);
1259         if (!tmp.ptr) {
1260             error(user_context) << "halide_malloc failed inside copy_to_host";
1261             return -1;
1262         }
1263 
1264         global_state.PixelStorei(GL_PACK_ALIGNMENT, 1);
1265 #ifdef DEBUG_RUNTIME
1266         int64_t t1 = halide_current_time_ns(user_context);
1267 #endif
1268         global_state.ReadPixels(0, 0, buf->dim[0].extent, buf->dim[1].extent, format, type, tmp.ptr);
1269 #ifdef DEBUG_RUNTIME
1270         int64_t t2 = halide_current_time_ns(user_context);
1271         debug(user_context) << "ReadPixels(2) time: " << (t2 - t1) / 1e3 << "usec\n";
1272 #endif
1273         if (global_state.CheckAndReportError(user_context, "copy_to_host ReadPixels (2)")) {
1274             return 1;
1275         }
1276 
1277         // Premature optimization warning: interleaved_to_halide() could definitely
1278         // be optimized, but ReadPixels() typically takes ~2-10x as long (especially on
1279         // mobile devices), so the returns will be modest.
1280 #ifdef DEBUG_RUNTIME
1281         int64_t t3 = halide_current_time_ns(user_context);
1282 #endif
1283         switch (type) {
1284         case GL_UNSIGNED_BYTE:
1285             interleaved_to_halide<uint8_t>(user_context, (uint8_t *)tmp.ptr, texture_channels, buf);
1286             break;
1287         case GL_UNSIGNED_SHORT:
1288             interleaved_to_halide<uint16_t>(user_context, (uint16_t *)tmp.ptr, texture_channels, buf);
1289             break;
1290         case GL_FLOAT:
1291             interleaved_to_halide<float>(user_context, (float *)tmp.ptr, texture_channels, buf);
1292             break;
1293         }
1294 #ifdef DEBUG_RUNTIME
1295         int64_t t4 = halide_current_time_ns(user_context);
1296         debug(user_context) << "deinterleave time: " << (t4 - t3) / 1e3 << "usec\n";
1297 #endif
1298     }
1299 
1300     return 0;
1301 }
1302 
1303 }  // namespace OpenGL
1304 }  // namespace Internal
1305 }  // namespace Runtime
1306 }  // namespace Halide
1307 
1308 using namespace Halide::Runtime::Internal::OpenGL;
1309 
1310 // Find the correct module for the called function
1311 // TODO: This currently takes O(# of GLSL'd stages) and can
1312 // be optimized
find_module(const char * stage_name)1313 WEAK ModuleState *find_module(const char *stage_name) {
1314     ModuleState *state_ptr = state_list;
1315 
1316     while (state_ptr != NULL) {
1317         KernelInfo *kernel = state_ptr->kernel;
1318         if (kernel && strcmp(stage_name, kernel->name) == 0) {
1319             return state_ptr;
1320         }
1321         state_ptr = state_ptr->next;
1322     }
1323 
1324     return NULL;
1325 }
1326 
1327 //  Create wrappers that satisfy old naming conventions
1328 
1329 extern "C" {
1330 
halide_opengl_run(void * user_context,void * state_ptr,const char * entry_name,int blocksX,int blocksY,int blocksZ,int threadsX,int threadsY,int threadsZ,int shared_mem_bytes,size_t arg_sizes[],void * args[],int8_t is_buffer[],int num_padded_attributes,float * vertex_buffer,int num_coords_dim0,int num_coords_dim1)1331 WEAK int halide_opengl_run(void *user_context,
1332                            void *state_ptr,
1333                            const char *entry_name,
1334                            int blocksX, int blocksY, int blocksZ,
1335                            int threadsX, int threadsY, int threadsZ,
1336                            int shared_mem_bytes,
1337                            size_t arg_sizes[], void *args[], int8_t is_buffer[],
1338                            int num_padded_attributes,
1339                            float *vertex_buffer,
1340                            int num_coords_dim0,
1341                            int num_coords_dim1) {
1342     if (!global_state.initialized) {
1343         error(user_context) << "OpenGL runtime not initialized (halide_opengl_run).";
1344         return 1;
1345     }
1346 
1347     GLStateSaver state_saver;
1348 
1349     // Find the right module
1350     ModuleState *mod = find_module(entry_name);
1351     if (!mod) {
1352         error(user_context) << "Internal error: module state for stage " << entry_name << " not found\n";
1353         return 1;
1354     }
1355 
1356     KernelInfo *kernel = mod->kernel;
1357 
1358     global_state.UseProgram(kernel->program_id);
1359     if (global_state.CheckAndReportError(user_context, "halide_opengl_run UseProgram")) {
1360         return 1;
1361     }
1362 
1363     // TODO(abstephensg) it would be great to codegen these vec4 uniform buffers
1364     // directly, instead of passing an array of arguments and then copying them
1365     // out at runtime.
1366 
1367     // Determine the number of float and int uniform parameters. This code
1368     // follows the argument packing convention in CodeGen_GPU_Host and
1369     // CodeGen_OpenGL_Dev
1370     int num_uniform_floats = 0;
1371     int num_uniform_ints = 0;
1372 
1373     Argument *kernel_arg = kernel->arguments;
1374     for (int i = 0; args[i]; i++, kernel_arg = kernel_arg->next) {
1375 
1376         // Check for a mismatch between the number of arguments declared in the
1377         // fragment shader source header and the number passed to this function
1378         if (!kernel_arg) {
1379             error(user_context)
1380                 << "Too many arguments passed to halide_opengl_run\n"
1381                 << "Argument " << i << ": size=" << i << " value=" << args[i];
1382             return 1;
1383         }
1384 
1385         // Count the number of float and int uniform parameters.
1386         if (kernel_arg->kind == Argument::Uniform) {
1387             switch (kernel_arg->type) {
1388             case Argument::Float:
1389             // Integer parameters less than 32 bits wide are passed as
1390             // normalized float values
1391             case Argument::Int8:
1392             case Argument::UInt8:
1393             case Argument::Int16:
1394             case Argument::UInt16:
1395                 ++num_uniform_floats;
1396                 break;
1397             case Argument::Bool:
1398             case Argument::Int32:
1399             case Argument::UInt32:
1400                 ++num_uniform_ints;
1401                 break;
1402             default:
1403                 error(user_context) << "GLSL: Encountered invalid kernel argument type";
1404                 return 1;
1405             }
1406         }
1407     }
1408 
1409     // Pad up to a multiple of four
1410     int num_padded_uniform_floats = (num_uniform_floats + 0x3) & ~0x3;
1411     int num_padded_uniform_ints = (num_uniform_ints + 0x3) & ~0x3;
1412 
1413     // Allocate storage for the packed arguments
1414     float uniform_float[num_padded_uniform_floats];
1415     int uniform_int[num_padded_uniform_ints];
1416 
1417     bool bind_render_targets = true;
1418 
1419     // Copy input arguments to corresponding GLSL uniforms.
1420     GLint num_active_textures = 0;
1421     int uniform_float_idx = 0;
1422     int uniform_int_idx = 0;
1423 
1424     kernel_arg = kernel->arguments;
1425     for (int i = 0; args[i]; i++, kernel_arg = kernel_arg->next) {
1426 
1427         if (kernel_arg->kind == Argument::Outbuf) {
1428             halide_assert(user_context, is_buffer[i] && "OpenGL Outbuf argument is not a buffer.");
1429             // Check if the output buffer will be bound by the client instead of
1430             // the Halide runtime
1431             uint64_t handle = ((halide_buffer_t *)args[i])->device;
1432             if (!handle) {
1433                 error(user_context) << "GLSL: Encountered invalid NULL dev pointer";
1434                 return 1;
1435             }
1436             if (handle == HALIDE_OPENGL_RENDER_TARGET) {
1437                 bind_render_targets = false;
1438             }
1439             // Outbuf textures are handled explicitly below
1440             continue;
1441         } else if (kernel_arg->kind == Argument::Inbuf) {
1442             halide_assert(user_context, is_buffer[i] && "OpenGL Inbuf argument is not a buffer.")
1443                 GLint loc =
1444                     global_state.GetUniformLocation(kernel->program_id, kernel_arg->name);
1445             if (global_state.CheckAndReportError(user_context, "halide_opengl_run GetUniformLocation(InBuf)")) {
1446                 return 1;
1447             }
1448             if (loc == -1) {
1449                 error(user_context) << "No sampler defined for input texture.";
1450                 return 1;
1451             }
1452             uint64_t handle = ((halide_buffer_t *)args[i])->device;
1453             if (!handle) {
1454                 error(user_context) << "GLSL: Encountered invalid NULL dev pointer";
1455                 return 1;
1456             }
1457             global_state.ActiveTexture(GL_TEXTURE0 + num_active_textures);
1458             global_state.BindTexture(GL_TEXTURE_2D, handle == HALIDE_OPENGL_RENDER_TARGET ? 0 : (GLuint)handle);
1459             global_state.Uniform1iv(loc, 1, &num_active_textures);
1460 
1461             // Textures not created by the Halide runtime might not have
1462             // parameters set, or might have had parameters set differently
1463             global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
1464             global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
1465             global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
1466             global_state.TexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
1467 
1468             num_active_textures++;
1469             // TODO: check maximum number of active textures
1470         } else if (kernel_arg->kind == Argument::Uniform) {
1471             // Copy the uniform parameter into the packed scalar list
1472             // corresponding to its type.
1473 
1474             // Note: small integers are represented as floats in GLSL.
1475             switch (kernel_arg->type) {
1476             case Argument::Float:
1477                 uniform_float[uniform_float_idx++] = *(float *)args[i];
1478                 break;
1479             case Argument::Bool:
1480                 uniform_int[uniform_int_idx++] = *((bool *)args[i]) ? 1 : 0;
1481                 break;
1482             case Argument::Int8:
1483                 uniform_float[uniform_float_idx++] = *((int8_t *)args[i]);
1484                 break;
1485             case Argument::UInt8:
1486                 uniform_float[uniform_float_idx++] = *((uint8_t *)args[i]);
1487                 break;
1488             case Argument::Int16: {
1489                 uniform_float[uniform_float_idx++] = *((int16_t *)args[i]);
1490                 break;
1491             }
1492             case Argument::UInt16: {
1493                 uniform_float[uniform_float_idx++] = *((uint16_t *)args[i]);
1494                 break;
1495             }
1496             case Argument::Int32: {
1497                 uniform_int[uniform_int_idx++] = *((int32_t *)args[i]);
1498                 break;
1499             }
1500             case Argument::UInt32: {
1501                 uint32_t value = *((uint32_t *)args[i]);
1502                 if (value > 0x7fffffff) {
1503                     error(user_context)
1504                         << "OpenGL: argument '" << kernel_arg->name << "' is too large for GLint";
1505                     return -1;
1506                 }
1507                 uniform_int[uniform_int_idx++] = static_cast<GLint>(value);
1508                 break;
1509             }
1510             case Argument::Void:
1511                 error(user_context) << "OpenGL: Encountered invalid kernel argument type";
1512                 return 1;
1513             }
1514         }
1515     }
1516 
1517     if (kernel_arg) {
1518         error(user_context) << "Too few arguments passed to halide_opengl_run";
1519         return 1;
1520     }
1521 
1522     // Set the packed uniform int parameters
1523     for (int idx = 0; idx != num_padded_uniform_ints; idx += 4) {
1524 
1525         // Produce the uniform parameter name without using the std library.
1526         Printer<StringStreamPrinter, 16> name(user_context);
1527         name << "_uniformi" << (idx / 4);
1528 
1529         GLint loc = global_state.GetUniformLocation(kernel->program_id, name.str());
1530         if (global_state.CheckAndReportError(user_context, "halide_opengl_run GetUniformLocation")) {
1531             return 1;
1532         }
1533         if (loc == -1) {
1534             // Argument was probably optimized away by GLSL compiler.
1535             continue;
1536         }
1537 
1538         global_state.Uniform4iv(loc, 1, &uniform_int[idx]);
1539     }
1540 
1541     // Set the packed uniform float parameters
1542     for (int idx = 0; idx != num_padded_uniform_floats; idx += 4) {
1543 
1544         // Produce the uniform parameter name without using the std library.
1545         Printer<StringStreamPrinter, 16> name(user_context);
1546         name << "_uniformf" << (idx / 4);
1547 
1548         GLint loc = global_state.GetUniformLocation(kernel->program_id, name.str());
1549         if (global_state.CheckAndReportError(user_context, "halide_opengl_run GetUniformLocation(2)")) {
1550             return 1;
1551         }
1552         if (loc == -1) {
1553             // Argument was probably optimized away by GLSL compiler.
1554             continue;
1555         }
1556 
1557         global_state.Uniform4fv(loc, 1, &uniform_float[idx]);
1558     }
1559 
1560     // Prepare framebuffer for rendering to output textures.
1561     GLint output_min[2] = {0, 0};
1562     GLint output_extent[2] = {0, 0};
1563 
1564     if (bind_render_targets) {
1565         global_state.BindFramebuffer(GL_FRAMEBUFFER, global_state.framebuffer_id);
1566     }
1567 
1568     global_state.Disable(GL_CULL_FACE);
1569     global_state.Disable(GL_DEPTH_TEST);
1570 
1571     GLint num_output_textures = 0;
1572     kernel_arg = kernel->arguments;
1573     for (int i = 0; args[i]; i++, kernel_arg = kernel_arg->next) {
1574         if (kernel_arg->kind != Argument::Outbuf) continue;
1575 
1576         halide_assert(user_context, is_buffer[i] && "OpenGL Outbuf argument is not a buffer.")
1577 
1578             // TODO: GL_MAX_COLOR_ATTACHMENTS
1579             if (num_output_textures >= 1) {
1580             error(user_context)
1581                 << "OpenGL ES 2.0 only supports one single output texture";
1582             return 1;
1583         }
1584 
1585         halide_buffer_t *buf = (halide_buffer_t *)args[i];
1586         halide_assert(user_context, buf->dimensions >= 2);
1587         uint64_t handle = buf->device;
1588         if (!handle) {
1589             error(user_context) << "GLSL: Encountered invalid NULL dev pointer";
1590             return 1;
1591         }
1592         GLuint tex = (handle == HALIDE_OPENGL_RENDER_TARGET) ? 0 : (GLuint)handle;
1593 
1594         // Check to see if the object name is actually a FBO
1595         if (bind_render_targets) {
1596             debug(user_context)
1597                 << "Output texture " << num_output_textures << ": " << tex << "\n";
1598             global_state.FramebufferTexture2D(GL_FRAMEBUFFER,
1599                                               GL_COLOR_ATTACHMENT0 + num_output_textures,
1600                                               GL_TEXTURE_2D, tex, 0);
1601             if (global_state.CheckAndReportError(user_context, "halide_opengl_run FramebufferTexture2D")) {
1602                 return 1;
1603             }
1604         }
1605 
1606         output_min[0] = buf->dim[0].min;
1607         output_min[1] = buf->dim[1].min;
1608         output_extent[0] = buf->dim[0].extent;
1609         output_extent[1] = buf->dim[1].extent;
1610         num_output_textures++;
1611     }
1612     // TODO: GL_MAX_DRAW_BUFFERS
1613     if (num_output_textures == 0) {
1614         error(user_context) << "halide_opengl_run: kernel has no output\n";
1615         // TODO: cleanup
1616         return 1;
1617     } else if (num_output_textures > 1) {
1618         if (global_state.DrawBuffers) {
1619             HalideMalloc draw_buffers_tmp(user_context, num_output_textures * sizeof(GLenum));
1620             if (!draw_buffers_tmp.ptr) {
1621                 error(user_context) << "halide_malloc";
1622                 return 1;
1623             }
1624             GLenum *draw_buffers = (GLenum *)draw_buffers_tmp.ptr;
1625             for (int i = 0; i < num_output_textures; i++) {
1626                 draw_buffers[i] = GL_COLOR_ATTACHMENT0 + i;
1627             }
1628             global_state.DrawBuffers(num_output_textures, draw_buffers);
1629             if (global_state.CheckAndReportError(user_context, "halide_opengl_run DrawBuffers")) {
1630                 return 1;
1631             }
1632         } else {
1633             error(user_context) << "halide_opengl_run: kernel has more than one output and DrawBuffers is not available (earlier than GL ES 3.0?).\n";
1634             // TODO: cleanup
1635             return 1;
1636         }
1637     }
1638 
1639     if (bind_render_targets) {
1640         // Check that framebuffer is set up correctly
1641         GLenum status = global_state.CheckFramebufferStatus(GL_FRAMEBUFFER);
1642         if (global_state.CheckAndReportError(user_context, "halide_opengl_run CheckFramebufferStatus")) {
1643             return 1;
1644         }
1645         if (status != GL_FRAMEBUFFER_COMPLETE) {
1646             error(user_context)
1647                 << "Setting up GL framebuffer " << global_state.framebuffer_id
1648                 << " failed (" << status << ")";
1649             // TODO: cleanup
1650             return 1;
1651         }
1652     }
1653 
1654     // Set vertex attributes
1655     GLint loc = global_state.GetUniformLocation(kernel->program_id, "output_extent");
1656     global_state.Uniform2iv(loc, 1, output_extent);
1657     if (global_state.CheckAndReportError(user_context, "halide_opengl_run Uniform2iv(output_extent)")) {
1658         return 1;
1659     }
1660     loc = global_state.GetUniformLocation(kernel->program_id, "output_min");
1661     global_state.Uniform2iv(loc, 1, output_min);
1662     if (global_state.CheckAndReportError(user_context, "halide_opengl_run Uniform2iv(output_min)")) {
1663         return 1;
1664     }
1665 
1666 #if 0  // DEBUG_RUNTIME
1667     debug(user_context) << "output_extent: " << output_extent[0] << "," << output_extent[1] << "\n";
1668     debug(user_context) << "output_min: " << output_min[0] << "," << output_min[1] << "\n";
1669 #endif
1670 
1671     // TODO(abestephensg): Sort coordinate dimensions when the linear solver is integrated
1672     // Sort the coordinates
1673 
1674     // Construct an element buffer using the sorted vertex order.
1675     // Note that this is "width" and "height" of the vertices, not the output image.
1676     int width = num_coords_dim0;
1677     int height = num_coords_dim1;
1678 
1679     int vertex_buffer_size = width * height * num_padded_attributes;
1680 
1681     int element_buffer_size = (width - 1) * (height - 1) * 6;
1682     int element_buffer[element_buffer_size];
1683 
1684     int idx = 0;
1685     for (int h = 0; h != (height - 1); ++h) {
1686         for (int w = 0; w != (width - 1); ++w) {
1687 
1688             // TODO(abestephensg): Use sorted coordinates when integrated
1689             int v = w + h * width;
1690             element_buffer[idx++] = v;
1691             element_buffer[idx++] = v + 1;
1692             element_buffer[idx++] = v + width + 1;
1693 
1694             element_buffer[idx++] = v + width + 1;
1695             element_buffer[idx++] = v + width;
1696             element_buffer[idx++] = v;
1697         }
1698     }
1699 
1700 #if 0  // DEBUG_RUNTIME
1701     debug(user_context) << "Vertex buffer:";
1702     for (int i=0;i!=vertex_buffer_size;++i) {
1703         if (!(i%num_padded_attributes)) {
1704           debug(user_context) << "\n";
1705         }
1706         debug(user_context) << vertex_buffer[i] << " ";
1707     }
1708     debug(user_context) << "\n";
1709     debug(user_context) << "\n";
1710 
1711     debug(user_context) << "Element buffer:";
1712     for (int i=0;i!=element_buffer_size;++i) {
1713         if (!(i%3)) {
1714             debug(user_context) << "\n";
1715         }
1716         debug(user_context) << element_buffer[i] << " ";
1717     }
1718     debug(user_context) << "\n";
1719 #endif
1720 
1721     // Setup viewport
1722     global_state.Viewport(0, 0, output_extent[0], output_extent[1]);
1723 
1724     // Setup the vertex and element buffers
1725     GLuint vertex_array_object = 0;
1726     if (global_state.have_vertex_array_objects) {
1727         global_state.GenVertexArrays(1, &vertex_array_object);
1728         global_state.BindVertexArray(vertex_array_object);
1729     }
1730 
1731     GLuint vertex_buffer_id;
1732     global_state.GenBuffers(1, &vertex_buffer_id);
1733     global_state.BindBuffer(GL_ARRAY_BUFFER, vertex_buffer_id);
1734     global_state.BufferData(GL_ARRAY_BUFFER, sizeof(float) * vertex_buffer_size, vertex_buffer, GL_STATIC_DRAW);
1735     if (global_state.CheckAndReportError(user_context, "halide_opengl_run vertex BufferData et al")) {
1736         return 1;
1737     }
1738 
1739     GLuint element_buffer_id;
1740     global_state.GenBuffers(1, &element_buffer_id);
1741     global_state.BindBuffer(GL_ELEMENT_ARRAY_BUFFER, element_buffer_id);
1742     global_state.BufferData(GL_ELEMENT_ARRAY_BUFFER, sizeof(float) * element_buffer_size, element_buffer, GL_STATIC_DRAW);
1743     if (global_state.CheckAndReportError(user_context, "halide_opengl_run element BufferData et al")) {
1744         return 1;
1745     }
1746 
1747     // The num_padded_attributes argument is the number of vertex attributes,
1748     // including the spatial x and y coordinates, padded up to a multiple of
1749     // four so that the attributes may be packed into vec4 slots.
1750     int num_packed_attributes = num_padded_attributes / 4;
1751 
1752     // Set up the per vertex attributes
1753     GLint attrib_ids[num_packed_attributes];
1754 
1755     for (int i = 0; i != num_packed_attributes; i++) {
1756 
1757         // The attribute names can synthesized by the runtime based on the
1758         // number of packed varying attributes
1759         Printer<StringStreamPrinter> attribute_name(user_context);
1760         attribute_name << "_varyingf" << i << "_attrib";
1761 
1762         // TODO(abstephensg): Switch to glBindAttribLocation
1763         GLint attrib_id = global_state.GetAttribLocation(kernel->program_id, attribute_name.buf);
1764         attrib_ids[i] = attrib_id;
1765 
1766         // Check to see if the varying attribute was simplified out of the
1767         // program by the GLSL compiler.
1768         if (attrib_id == -1) {
1769             continue;
1770         }
1771 
1772         global_state.VertexAttribPointer(attrib_id, 4, GL_FLOAT, GL_FALSE /* Normalized */, sizeof(GLfloat) * num_padded_attributes, (void *)(i * sizeof(GLfloat) * 4));
1773         if (global_state.CheckAndReportError(user_context, "halide_opengl_run VertexAttribPointer et al")) {
1774             return 1;
1775         }
1776 
1777         global_state.EnableVertexAttribArray(attrib_id);
1778         if (global_state.CheckAndReportError(user_context, "halide_opengl_run EnableVertexAttribArray et al")) {
1779             return 1;
1780         }
1781     }
1782 
1783     // Draw the scene
1784     global_state.DrawElements(GL_TRIANGLES, element_buffer_size, GL_UNSIGNED_INT, NULL);
1785     if (global_state.CheckAndReportError(user_context, "halide_opengl_run DrawElements et al")) {
1786         return 1;
1787     }
1788 
1789     // Cleanup
1790     if (global_state.have_vertex_array_objects) {
1791         global_state.DeleteVertexArrays(1, &vertex_array_object);
1792     }
1793 
1794     global_state.DeleteBuffers(1, &vertex_buffer_id);
1795     global_state.DeleteBuffers(1, &element_buffer_id);
1796 
1797     return 0;
1798 }
1799 
halide_opengl_device_sync(void * user_context,struct halide_buffer_t *)1800 WEAK int halide_opengl_device_sync(void *user_context, struct halide_buffer_t *) {
1801     if (!global_state.initialized) {
1802         error(user_context) << "OpenGL runtime not initialized (halide_opengl_device_sync).";
1803         return 1;
1804     }
1805 #ifdef DEBUG_RUNTIME
1806     int64_t t0 = halide_current_time_ns(user_context);
1807 #endif
1808     global_state.Finish();
1809 #ifdef DEBUG_RUNTIME
1810     int64_t t1 = halide_current_time_ns(user_context);
1811     debug(user_context) << "halide_opengl_device_sync: took " << (t1 - t0) / 1e3 << "usec\n";
1812 #endif
1813     return 0;
1814 }
1815 
1816 // Called at the beginning of a code block generated by Halide. This function
1817 // is responsible for setting up the OpenGL environment and compiling the GLSL
1818 // code into a fragment shader.
halide_opengl_initialize_kernels(void * user_context,void ** state_ptr,const char * src,int size)1819 WEAK int halide_opengl_initialize_kernels(void *user_context, void **state_ptr,
1820                                           const char *src, int size) {
1821     debug(user_context) << "In initialize_kernels\n";
1822 
1823     if (int error = halide_opengl_init(user_context)) {
1824         return error;
1825     }
1826 
1827     const char *this_kernel = src;
1828 
1829     ModuleState **state = (ModuleState **)state_ptr;
1830     ModuleState *module = *state;
1831 
1832     while (this_kernel) {
1833         // Find the start of the next kernel
1834         const char *next_kernel = strstr(this_kernel + 1, kernel_marker);
1835 
1836         // Use that to compute the length of this kernel
1837         int len = 0;
1838         if (!next_kernel) {
1839             len = strlen(this_kernel);
1840         } else {
1841             len = next_kernel - this_kernel;
1842         }
1843 
1844         // Construct a new ModuleState and add it to the global list
1845         module = (ModuleState *)malloc(sizeof(ModuleState));
1846         module->kernel = NULL;
1847         module->next = state_list;
1848         state_list = module;
1849         *state = module;
1850 
1851         KernelInfo *kernel = module->kernel;
1852         if (!kernel) {
1853             kernel = create_kernel(user_context, this_kernel, len);
1854             if (!kernel) {
1855                 error(user_context) << "Invalid kernel: " << this_kernel;
1856                 return -1;
1857             }
1858             module->kernel = kernel;
1859         }
1860 
1861         // Create the vertex shader. The runtime will output boilerplate for the
1862         // vertex shader based on a fixed program plus arguments obtained from
1863         // the comment header passed in the fragment shader. Since there are a
1864         // relatively small number of vertices (i.e. usually only four), per-vertex
1865         // expressions interpolated by varying attributes are evaluated
1866         // by host code on the CPU and passed to the GPU as values in the
1867         // vertex buffer.
1868         enum { PrinterLength = 1024 * 4 };
1869         Printer<StringStreamPrinter, PrinterLength> vertex_src(user_context);
1870 
1871         // Count the number of varying attributes, this is 2 for the spatial
1872         // x and y coordinates, plus the number of scalar varying attribute
1873         // expressions pulled out of the fragment shader.
1874         int num_varying_float = 2;
1875 
1876         for (Argument *arg = kernel->arguments; arg; arg = arg->next) {
1877             if (arg->kind == Argument::Varying)
1878                 ++num_varying_float;
1879         }
1880 
1881         int num_packed_varying_float = ((num_varying_float + 3) & ~0x3) / 4;
1882 
1883         for (int i = 0; i != num_packed_varying_float; ++i) {
1884             vertex_src << "attribute vec4 _varyingf" << i << "_attrib;\n";
1885             vertex_src << "varying   vec4 _varyingf" << i << ";\n";
1886         }
1887 
1888         vertex_src << "uniform ivec2 output_min;\n"
1889                    << "uniform ivec2 output_extent;\n"
1890                    << "void main() {\n"
1891 
1892                    // Host codegen always passes the spatial vertex coordinates
1893                    // in the first two elements of the _varyingf0_attrib
1894                    << "    vec2 position = vec2(_varyingf0_attrib[0], _varyingf0_attrib[1]);\n"
1895                    << "    gl_Position = vec4(position, 0.0, 1.0);\n"
1896                    << "    vec2 texcoord = 0.5 * position + 0.5;\n"
1897                    << "    vec2 pixcoord = texcoord * vec2(output_extent.xy) + vec2(output_min.xy);\n";
1898 
1899         // Copy through all of the varying attributes
1900         for (int i = 0; i != num_packed_varying_float; ++i) {
1901             vertex_src << "    _varyingf" << i << " = _varyingf" << i << "_attrib;\n";
1902         }
1903 
1904         vertex_src << "    _varyingf0.xy = pixcoord;\n";
1905 
1906         vertex_src << "}\n";
1907 
1908         // Check to see if there was sufficient storage for the vertex program.
1909         if (vertex_src.size() >= PrinterLength) {
1910             error(user_context) << "Vertex shader source truncated";
1911             return 1;
1912         }
1913 
1914         // Initialize vertex shader.
1915         GLuint vertex_shader_id = make_shader(user_context,
1916                                               GL_VERTEX_SHADER, vertex_src.buf, NULL);
1917         if (vertex_shader_id == 0) {
1918             halide_error(user_context, "Failed to create vertex shader");
1919             return 1;
1920         }
1921 
1922         // Create the fragment shader
1923         GLuint fragment_shader_id = make_shader(user_context, GL_FRAGMENT_SHADER,
1924                                                 kernel->source, NULL);
1925         // Link GLSL program
1926         GLuint program = global_state.CreateProgram();
1927         global_state.AttachShader(program, vertex_shader_id);
1928         global_state.AttachShader(program, fragment_shader_id);
1929         global_state.LinkProgram(program);
1930 
1931         // Release the individual shaders
1932         global_state.DeleteShader(vertex_shader_id);
1933         global_state.DeleteShader(fragment_shader_id);
1934 
1935         GLint status;
1936         global_state.GetProgramiv(program, GL_LINK_STATUS, &status);
1937         if (!status) {
1938             GLint log_len;
1939             global_state.GetProgramiv(program, GL_INFO_LOG_LENGTH, &log_len);
1940             HalideMalloc log_tmp(user_context, log_len);
1941             if (log_tmp.ptr) {
1942                 char *log = (char *)log_tmp.ptr;
1943                 global_state.GetProgramInfoLog(program, log_len, NULL, log);
1944                 debug(user_context) << "Could not link GLSL program:\n"
1945                                     << log << "\n";
1946             }
1947             global_state.DeleteProgram(program);
1948             return -1;
1949         }
1950         kernel->program_id = program;
1951 
1952         this_kernel = next_kernel;
1953     }
1954     return 0;
1955 }
1956 
halide_opengl_device_and_host_malloc(void * user_context,struct halide_buffer_t * buf)1957 WEAK int halide_opengl_device_and_host_malloc(void *user_context, struct halide_buffer_t *buf) {
1958     return halide_default_device_and_host_malloc(user_context, buf, &opengl_device_interface);
1959 }
1960 
halide_opengl_device_and_host_free(void * user_context,struct halide_buffer_t * buf)1961 WEAK int halide_opengl_device_and_host_free(void *user_context, struct halide_buffer_t *buf) {
1962     return halide_default_device_and_host_free(user_context, buf, &opengl_device_interface);
1963 }
1964 
halide_opengl_device_interface()1965 WEAK const halide_device_interface_t *halide_opengl_device_interface() {
1966     return &opengl_device_interface;
1967 }
1968 
halide_opengl_context_lost(void * user_context)1969 WEAK void halide_opengl_context_lost(void *user_context) {
1970     if (!global_state.initialized) return;
1971 
1972     debug(user_context) << "halide_opengl_context_lost\n";
1973     for (ModuleState *mod = state_list; mod; mod = mod->next) {
1974         // Reset program handle to force recompilation.
1975         mod->kernel->program_id = 0;
1976     }
1977 
1978     global_state.init();
1979     return;
1980 }
1981 
halide_opengl_wrap_texture(void * user_context,halide_buffer_t * buf,uint64_t texture_id)1982 WEAK int halide_opengl_wrap_texture(void *user_context, halide_buffer_t *buf, uint64_t texture_id) {
1983     if (!global_state.initialized) {
1984         if (int error = halide_opengl_init(user_context)) {
1985             return error;
1986         }
1987     }
1988     if (texture_id == 0) {
1989         error(user_context) << "Texture " << texture_id << " is not a valid texture name.";
1990         return -3;
1991     }
1992     halide_assert(user_context, buf->device == 0);
1993     if (buf->device != 0) {
1994         return -2;
1995     }
1996     buf->device = texture_id;
1997     buf->device_interface = &opengl_device_interface;
1998     buf->device_interface->impl->use_module();
1999     return 0;
2000 }
2001 
halide_opengl_wrap_render_target(void * user_context,halide_buffer_t * buf)2002 WEAK int halide_opengl_wrap_render_target(void *user_context, halide_buffer_t *buf) {
2003     if (!global_state.initialized) {
2004         if (int error = halide_opengl_init(user_context)) {
2005             return error;
2006         }
2007     }
2008     halide_assert(user_context, buf->device == 0);
2009     if (buf->device != 0) {
2010         return -2;
2011     }
2012     buf->device = HALIDE_OPENGL_RENDER_TARGET;
2013     buf->device_interface = &opengl_device_interface;
2014     buf->device_interface->impl->use_module();
2015     return 0;
2016 }
2017 
halide_opengl_detach_texture(void * user_context,halide_buffer_t * buf)2018 WEAK int halide_opengl_detach_texture(void *user_context, halide_buffer_t *buf) {
2019     if (buf->device == 0) {
2020         return 0;
2021     }
2022 
2023     halide_assert(user_context, buf->device_interface == &opengl_device_interface);
2024     buf->device = 0;
2025     buf->device_interface->impl->release_module();
2026     buf->device_interface = NULL;
2027     return 0;
2028 }
2029 
halide_opengl_get_texture(void * user_context,halide_buffer_t * buf)2030 WEAK uintptr_t halide_opengl_get_texture(void *user_context, halide_buffer_t *buf) {
2031     if (buf->device == 0) {
2032         return 0;
2033     }
2034     halide_assert(user_context, buf->device_interface == &opengl_device_interface);
2035     uint64_t handle = buf->device;
2036     // client_bound always return 0 here.
2037     return handle == HALIDE_OPENGL_RENDER_TARGET ? 0 : (uintptr_t)handle;
2038 }
2039 
2040 namespace {
halide_opengl_cleanup()2041 WEAK __attribute__((destructor)) void halide_opengl_cleanup() {
2042     halide_opengl_device_release(NULL);
2043 }
2044 }  // namespace
2045 
2046 }  // extern "C"
2047 
namespace Halide {
namespace Runtime {
namespace Internal {
namespace OpenGL {

// Backend-specific entry points of the OpenGL device interface.
//
// This is a positional aggregate initializer: the order of the entries must
// match the member order of halide_device_interface_impl_t, which is declared
// outside this part of the file. Do not reorder entries without checking that
// declaration.
WEAK halide_device_interface_impl_t opengl_device_interface_impl = {
    // Module reference counting hooks (invoked elsewhere in this file as
    // impl->use_module() / impl->release_module()).
    halide_use_jit_module,
    halide_release_jit_module,
    // OpenGL-specific implementations.
    halide_opengl_device_malloc,
    halide_opengl_device_free,
    halide_opengl_device_sync,
    halide_opengl_device_release,
    halide_opengl_copy_to_host,
    halide_opengl_copy_to_device,
    halide_opengl_device_and_host_malloc,
    halide_opengl_device_and_host_free,
    // The halide_default_* runtime implementations are used for these
    // entries instead of OpenGL-specific ones.
    halide_default_buffer_copy,
    halide_default_device_crop,
    halide_default_device_slice,
    halide_default_device_release_crop,
    // Texture wrap/detach entry points defined earlier in this file.
    halide_opengl_wrap_texture,
    halide_opengl_detach_texture};

// The device interface table handed out by halide_opengl_device_interface()
// and stored in buffers by the wrap functions in this file.
//
// Also a positional aggregate initializer: the entries are the generic
// halide_* runtime functions, followed by a NULL entry and a pointer to the
// backend implementation table above. The order must match the member order
// of halide_device_interface_t (declared outside this part of the file).
WEAK halide_device_interface_t opengl_device_interface = {
    halide_device_malloc,
    halide_device_free,
    halide_device_sync,
    halide_device_release,
    halide_copy_to_host,
    halide_copy_to_device,
    halide_device_and_host_malloc,
    halide_device_and_host_free,
    halide_buffer_copy,
    halide_device_crop,
    halide_device_slice,
    halide_device_release_crop,
    halide_device_wrap_native,
    halide_device_detach_native,
    // NOTE(review): this slot is left unset for OpenGL; presumably it is the
    // compute-capability query -- confirm against halide_device_interface_t.
    NULL,
    // Backend implementation table (accessed as ->impl elsewhere in this file).
    &opengl_device_interface_impl};

}  // namespace OpenGL
}  // namespace Internal
}  // namespace Runtime
}  // namespace Halide
2093