1 // Copyright 2008-present Contributors to the OpenImageIO project.
2 // SPDX-License-Identifier: BSD-3-Clause
3 // https://github.com/OpenImageIO/oiio/blob/master/LICENSE.md
4 
5 
6 #include <cmath>
7 #include <cstring>
8 #include <list>
9 #include <random>
10 #include <sstream>
11 #include <string>
12 
13 #include <OpenImageIO/dassert.h>
14 #include <OpenImageIO/filter.h>
15 #include <OpenImageIO/fmath.h>
16 #include <OpenImageIO/imagebuf.h>
17 #include <OpenImageIO/imagebufalgo.h>
18 #include <OpenImageIO/imagecache.h>
19 #include <OpenImageIO/imageio.h>
20 #include <OpenImageIO/optparser.h>
21 #include <OpenImageIO/simd.h>
22 #include <OpenImageIO/strutil.h>
23 #include <OpenImageIO/sysutil.h>
24 #include <OpenImageIO/texture.h>
25 #include <OpenImageIO/thread.h>
26 #include <OpenImageIO/typedesc.h>
27 #include <OpenImageIO/ustring.h>
28 #include <OpenImageIO/varyingref.h>
29 
30 #include "imagecache_pvt.h"
31 #include "texture_pvt.h"
32 
33 #define TEX_FAST_MATH 1
34 
35 
36 OIIO_NAMESPACE_BEGIN
37 using namespace pvt;
38 using namespace simd;
39 
40 namespace {  // anonymous
41 
42 // We would like shared_texturesys to be a shared_ptr so that it is
43 // automatically deleted when the app exists, but because it contains a
44 // reference to an ImageCache, we get into the "destruction order fiasco."
45 // The only easy way to fix this is to make shared_texturesys be an ordinary
46 // pointer and just let it leak (who cares? the app is done, and it only
47 // contains a few hundred bytes).
48 static TextureSystemImpl* shared_texturesys = NULL;
49 static spin_mutex shared_texturesys_mutex;
50 static bool do_unit_test_texture    = false;
51 static float unit_test_texture_blur = 0.0f;
52 
53 static EightBitConverter<float> uchar2float;
54 static vfloat4 u8scale(1.0f / 255.0f);
55 static vfloat4 u16scale(1.0f / 65535.0f);
56 
57 OIIO_FORCEINLINE vfloat4
uchar2float4(const unsigned char * c)58 uchar2float4(const unsigned char* c)
59 {
60     return vfloat4(c) * u8scale;
61 }
62 
63 
64 OIIO_FORCEINLINE vfloat4
ushort2float4(const unsigned short * s)65 ushort2float4(const unsigned short* s)
66 {
67     return vfloat4(s) * u16scale;
68 }
69 
70 
71 OIIO_FORCEINLINE vfloat4
half2float4(const half * h)72 half2float4(const half* h)
73 {
74     return vfloat4(h);
75 }
76 
77 
78 static const OIIO_SIMD4_ALIGN vbool4 channel_masks[5] = {
79     vbool4(false, false, false, false), vbool4(true, false, false, false),
80     vbool4(true, true, false, false),   vbool4(true, true, true, false),
81     vbool4(true, true, true, true),
82 };
83 
84 }  // end anonymous namespace
85 
86 
87 TextureSystem*
create(bool shared,ImageCache * imagecache)88 TextureSystem::create(bool shared, ImageCache* imagecache)
89 {
90     if (shared) {
91         // They requested a shared texture system.  If a shared one already
92         // exists, just return it, otherwise record the new texture system
93         // as the shared one.
94         spin_lock guard(shared_texturesys_mutex);
95         if (!shared_texturesys)
96             shared_texturesys = new TextureSystemImpl(ImageCache::create(true));
97 #if 0
98         std::cerr << " shared TextureSystem is "
99                   << (void *)shared_texturesys << "\n";
100 #endif
101         return shared_texturesys;
102     }
103 
104     // Doesn't need a shared cache
105     bool own_ic = false;  // presume the caller owns the IC it passes
106     if (!imagecache) {    // If no IC supplied, make one and we own it
107         imagecache = ImageCache::create(false);
108         own_ic     = true;
109     }
110     TextureSystemImpl* ts  = new TextureSystemImpl(imagecache);
111     ts->m_imagecache_owner = own_ic;
112 #if 0
113     std::cerr << "creating new ImageCache " << (void *)ts << "\n";
114 #endif
115     return ts;
116 }
117 
118 
119 
120 void
destroy(TextureSystem * x,bool teardown_imagecache)121 TextureSystem::destroy(TextureSystem* x, bool teardown_imagecache)
122 {
123     // std::cerr << "Destroying TS " << (void *)x << "\n";
124     if (!x)
125         return;
126     TextureSystemImpl* impl = (TextureSystemImpl*)x;
127     if (teardown_imagecache) {
128         if (impl->m_imagecache_owner)
129             ImageCache::destroy(impl->m_imagecache, true);
130         impl->m_imagecache = nullptr;
131     }
132 
133     spin_lock guard(shared_texturesys_mutex);
134     if (impl == shared_texturesys) {
135         // This is the shared TS, so don't really delete it.
136     } else {
137         // Not a shared cache, we are the only owner, so truly destroy it.
138         delete x;
139     }
140 }
141 
142 
143 
144 namespace pvt {  // namespace pvt
145 
146 
147 
148 // Wrap functions wrap 'coord' around 'width', and return true if the
149 // result is a valid pixel coordinate, false if black should be used
150 // instead.
151 
152 
153 bool
wrap_periodic_sharedborder(int & coord,int origin,int width)154 TextureSystemImpl::wrap_periodic_sharedborder(int& coord, int origin, int width)
155 {
156     // Like periodic, but knowing that the first column and last are
157     // actually the same position, so we essentially skip the last
158     // column in the next cycle.
159     width = std::max(width, 2);  // avoid %0 for width=1
160     coord -= origin;
161     coord = safe_mod(coord, width - 1);
162     if (coord < 0)  // Fix negative values
163         coord += width;
164     coord += origin;
165     return true;
166 }
167 
168 
169 const TextureSystemImpl::wrap_impl TextureSystemImpl::wrap_functions[] = {
170     // Must be in same order as Wrap enum
171     wrap_black,
172     wrap_black,
173     wrap_clamp,
174     wrap_periodic,
175     wrap_mirror,
176     wrap_periodic_pow2,
177     wrap_periodic_sharedborder
178 };
179 
180 
181 
182 simd::vbool4
wrap_black_simd(simd::vint4 & coord_,const simd::vint4 & origin,const simd::vint4 & width)183 wrap_black_simd(simd::vint4& coord_, const simd::vint4& origin,
184                 const simd::vint4& width)
185 {
186     simd::vint4 coord(coord_);
187     return (coord >= origin) & (coord < (width + origin));
188 }
189 
190 
191 simd::vbool4
wrap_clamp_simd(simd::vint4 & coord_,const simd::vint4 & origin,const simd::vint4 & width)192 wrap_clamp_simd(simd::vint4& coord_, const simd::vint4& origin,
193                 const simd::vint4& width)
194 {
195     simd::vint4 coord(coord_);
196     coord = simd::blend(coord, origin, coord < origin);
197     coord = simd::blend(coord, (origin + width - 1), coord >= (origin + width));
198     coord_ = coord;
199     return simd::vbool4::True();
200 }
201 
202 
203 simd::vbool4
wrap_periodic_simd(simd::vint4 & coord_,const simd::vint4 & origin,const simd::vint4 & width)204 wrap_periodic_simd(simd::vint4& coord_, const simd::vint4& origin,
205                    const simd::vint4& width)
206 {
207     simd::vint4 coord(coord_);
208     coord  = coord - origin;
209     coord  = coord % width;
210     coord  = simd::blend(coord, coord + width, coord < 0);
211     coord  = coord + origin;
212     coord_ = coord;
213     return simd::vbool4::True();
214 }
215 
216 
217 simd::vbool4
wrap_periodic_pow2_simd(simd::vint4 & coord_,const simd::vint4 & origin,const simd::vint4 & width)218 wrap_periodic_pow2_simd(simd::vint4& coord_, const simd::vint4& origin,
219                         const simd::vint4& width)
220 {
221     simd::vint4 coord(coord_);
222     // OIIO_DASSERT (ispow2(width));
223     coord = coord - origin;
224     coord = coord
225             & (width - 1);  // Shortcut periodic if we're sure it's a pow of 2
226     coord  = coord + origin;
227     coord_ = coord;
228     return simd::vbool4::True();
229 }
230 
231 
232 simd::vbool4
wrap_mirror_simd(simd::vint4 & coord_,const simd::vint4 & origin,const simd::vint4 & width)233 wrap_mirror_simd(simd::vint4& coord_, const simd::vint4& origin,
234                  const simd::vint4& width)
235 {
236     simd::vint4 coord(coord_);
237     coord            = coord - origin;
238     coord            = simd::blend(coord, -1 - coord, coord < 0);
239     simd::vint4 iter = coord / width;  // Which iteration of the pattern?
240     coord -= iter * width;
241     // Odd iterations -- flip the sense
242     coord = blend(coord, (width - 1) - coord, (iter & 1) != 0);
243     // OIIO_DASSERT_MSG (coord >= 0 && coord < width,
244     //              "width=%d, origin=%d, result=%d", width, origin, coord);
245     coord += origin;
246     coord_ = coord;
247     return simd::vbool4::True();
248 }
249 
250 
251 simd::vbool4
wrap_periodic_sharedborder_simd(simd::vint4 & coord_,const simd::vint4 & origin,const simd::vint4 & width)252 wrap_periodic_sharedborder_simd(simd::vint4& coord_, const simd::vint4& origin,
253                                 const simd::vint4& width)
254 {
255     // Like periodic, but knowing that the first column and last are
256     // actually the same position, so we essentially skip the last
257     // column in the next cycle.
258     simd::vint4 coord(coord_);
259     coord = coord - origin;
260     coord = safe_mod(coord, (width - 1));
261     coord += blend(simd::vint4(origin), width + origin,
262                    coord < 0);  // Fix negative values
263     coord_ = coord;
264     return true;
265 }
266 
267 
268 
269 typedef simd::vbool4 (*wrap_impl_simd)(simd::vint4& coord,
270                                        const simd::vint4& origin,
271                                        const simd::vint4& width);
272 
273 
274 const wrap_impl_simd wrap_functions_simd[] = {
275     // Must be in same order as Wrap enum
276     wrap_black_simd,
277     wrap_black_simd,
278     wrap_clamp_simd,
279     wrap_periodic_simd,
280     wrap_mirror_simd,
281     wrap_periodic_pow2_simd,
282     wrap_periodic_sharedborder_simd
283 };
284 
285 
286 
287 const char*
texture_format_name(TexFormat f)288 texture_format_name(TexFormat f)
289 {
290     static const char* texture_format_names[] = {
291         // MUST match the order of TexFormat
292         "unknown",
293         "Plain Texture",
294         "Volume Texture",
295         "Shadow",
296         "CubeFace Shadow",
297         "Volume Shadow",
298         "LatLong Environment",
299         "CubeFace Environment",
300         ""
301     };
302     return texture_format_names[(int)f];
303 }
304 
305 
306 
307 const char*
texture_type_name(TexFormat f)308 texture_type_name(TexFormat f)
309 {
310     static const char* texture_type_names[] = {
311         // MUST match the order of TexFormat
312         "unknown", "Plain Texture", "Volume Texture", "Shadow", "Shadow",
313         "Shadow",  "Environment",   "Environment",    ""
314     };
315     return texture_type_names[(int)f];
316 }
317 
318 
319 
TextureSystemImpl(ImageCache * imagecache)320 TextureSystemImpl::TextureSystemImpl(ImageCache* imagecache)
321     : hq_filter(NULL)
322 {
323     m_imagecache = (ImageCacheImpl*)imagecache;
324     init();
325 }
326 
327 
328 
329 void
init()330 TextureSystemImpl::init()
331 {
332     m_Mw2c.makeIdentity();
333     m_gray_to_rgb       = false;
334     m_flip_t            = false;
335     m_max_tile_channels = 6;
336     delete hq_filter;
337     hq_filter    = Filter1D::create("b-spline", 4);
338     m_statslevel = 0;
339 
340     // Allow environment variable to override default options
341     const char* options = getenv("OPENIMAGEIO_TEXTURE_OPTIONS");
342     if (options)
343         attribute("options", options);
344 
345     if (do_unit_test_texture)
346         unit_test_texture();
347 }
348 
349 
350 
~TextureSystemImpl()351 TextureSystemImpl::~TextureSystemImpl()
352 {
353     printstats();
354     delete hq_filter;
355 }
356 
357 
358 
359 std::string
getstats(int level,bool icstats) const360 TextureSystemImpl::getstats(int level, bool icstats) const
361 {
362     // Merge all the threads
363     ImageCacheStatistics stats;
364     m_imagecache->mergestats(stats);
365 
366     std::ostringstream out;
367     out.imbue(std::locale::classic());  // Force "C" locale with '.' decimal
368     bool anytexture = (stats.texture_queries + stats.texture3d_queries
369                        + stats.shadow_queries + stats.environment_queries
370                        + stats.imageinfo_queries);
371     if (level > 0 && anytexture) {
372         out << "OpenImageIO Texture statistics\n";
373 
374         std::string opt;
375 #define BOOLOPT(name) \
376     if (m_##name)     \
377     opt += #name " "
378 #define INTOPT(name) opt += Strutil::sprintf(#name "=%d ", m_##name)
379 #define STROPT(name)     \
380     if (m_##name.size()) \
381     opt += Strutil::sprintf(#name "=\"%s\" ", m_##name)
382         INTOPT(gray_to_rgb);
383         INTOPT(flip_t);
384         INTOPT(max_tile_channels);
385 #undef BOOLOPT
386 #undef INTOPT
387 #undef STROPT
388         out << "  Options:  " << Strutil::wordwrap(opt, 75, 12) << "\n";
389 
390         out << "  Queries/batches : \n";
391         out << "    texture     :  " << stats.texture_queries << " queries in "
392             << stats.texture_batches << " batches\n";
393         out << "    texture 3d  :  " << stats.texture3d_queries
394             << " queries in " << stats.texture3d_batches << " batches\n";
395         out << "    shadow      :  " << stats.shadow_queries << " queries in "
396             << stats.shadow_batches << " batches\n";
397         out << "    environment :  " << stats.environment_queries
398             << " queries in " << stats.environment_batches << " batches\n";
399         out << "    gettextureinfo :  " << stats.imageinfo_queries
400             << " queries\n";
401         out << "  Interpolations :\n";
402         out << "    closest  : " << stats.closest_interps << "\n";
403         out << "    bilinear : " << stats.bilinear_interps << "\n";
404         out << "    bicubic  : " << stats.cubic_interps << "\n";
405         if (stats.aniso_queries)
406             out << Strutil::sprintf("  Average anisotropic probes : %.3g\n",
407                                     (double)stats.aniso_probes
408                                         / (double)stats.aniso_queries);
409         else
410             out << Strutil::sprintf("  Average anisotropic probes : 0\n");
411         out << Strutil::sprintf("  Max anisotropy in the wild : %.3g\n",
412                                 stats.max_aniso);
413         if (icstats)
414             out << "\n";
415     }
416     if (icstats)
417         out << m_imagecache->getstats(level);
418     return out.str();
419 }
420 
421 
422 
423 void
printstats() const424 TextureSystemImpl::printstats() const
425 {
426     if (m_statslevel == 0)
427         return;
428     std::cout << getstats(m_statslevel, false) << "\n\n";
429 }
430 
431 
432 
433 void
reset_stats()434 TextureSystemImpl::reset_stats()
435 {
436     OIIO_DASSERT(m_imagecache);
437     m_imagecache->reset_stats();
438 }
439 
440 
441 
442 bool
attribute(string_view name,TypeDesc type,const void * val)443 TextureSystemImpl::attribute(string_view name, TypeDesc type, const void* val)
444 {
445     if (name == "options" && type == TypeDesc::STRING) {
446         return optparser(*this, *(const char**)val);
447     }
448     if (name == "worldtocommon"
449         && (type == TypeMatrix || type == TypeDesc(TypeDesc::FLOAT, 16))) {
450         m_Mw2c = *(const Imath::M44f*)val;
451         m_Mc2w = m_Mw2c.inverse();
452         return true;
453     }
454     if (name == "commontoworld"
455         && (type == TypeMatrix || type == TypeDesc(TypeDesc::FLOAT, 16))) {
456         m_Mc2w = *(const Imath::M44f*)val;
457         m_Mw2c = m_Mc2w.inverse();
458         return true;
459     }
460     if ((name == "gray_to_rgb" || name == "grey_to_rgb") && (type == TypeInt)) {
461         m_gray_to_rgb = *(const int*)val;
462         return true;
463     }
464     if (name == "flip_t" && type == TypeInt) {
465         m_flip_t = *(const int*)val;
466         return true;
467     }
468     if (name == "m_max_tile_channels" && type == TypeInt) {
469         m_max_tile_channels = *(const int*)val;
470         return true;
471     }
472     if (name == "statistics:level" && type == TypeInt) {
473         m_statslevel = *(const int*)val;
474         // DO NOT RETURN! pass the same message to the image cache
475     }
476     if (name == "unit_test" && type == TypeInt) {
477         do_unit_test_texture = *(const int*)val;
478         return true;
479     }
480     if (name == "unit_test_blur" && type == TypeFloat) {
481         unit_test_texture_blur = *(const float*)val;
482         return true;
483     }
484 
485     // Maybe it's meant for the cache?
486     return m_imagecache->attribute(name, type, val);
487 }
488 
489 
490 
491 bool
getattribute(string_view name,TypeDesc type,void * val) const492 TextureSystemImpl::getattribute(string_view name, TypeDesc type,
493                                 void* val) const
494 {
495     if (name == "worldtocommon"
496         && (type == TypeMatrix || type == TypeDesc(TypeDesc::FLOAT, 16))) {
497         *(Imath::M44f*)val = m_Mw2c;
498         return true;
499     }
500     if (name == "commontoworld"
501         && (type == TypeMatrix || type == TypeDesc(TypeDesc::FLOAT, 16))) {
502         *(Imath::M44f*)val = m_Mc2w;
503         return true;
504     }
505     if ((name == "gray_to_rgb" || name == "grey_to_rgb") && (type == TypeInt)) {
506         *(int*)val = m_gray_to_rgb;
507         return true;
508     }
509     if (name == "flip_t" && type == TypeInt) {
510         *(int*)val = m_flip_t;
511         return true;
512     }
513     if (name == "m_max_tile_channels" && type == TypeInt) {
514         *(int*)val = m_max_tile_channels;
515         return true;
516     }
517 
518     // If not one of these, maybe it's an attribute meant for the image cache?
519     return m_imagecache->getattribute(name, type, val);
520 
521     return false;
522 }
523 
524 
525 
526 std::string
resolve_filename(const std::string & filename) const527 TextureSystemImpl::resolve_filename(const std::string& filename) const
528 {
529     return m_imagecache->resolve_filename(filename);
530 }
531 
532 
533 
534 bool
get_texture_info(ustring filename,int subimage,ustring dataname,TypeDesc datatype,void * data)535 TextureSystemImpl::get_texture_info(ustring filename, int subimage,
536                                     ustring dataname, TypeDesc datatype,
537                                     void* data)
538 {
539     bool ok = m_imagecache->get_image_info(filename, subimage, 0, dataname,
540                                            datatype, data);
541     if (!ok) {
542         std::string err = m_imagecache->geterror();
543         if (!err.empty())
544             error("{}", err);
545     }
546     return ok;
547 }
548 
549 
550 
551 bool
get_texture_info(TextureHandle * texture_handle,Perthread * thread_info,int subimage,ustring dataname,TypeDesc datatype,void * data)552 TextureSystemImpl::get_texture_info(TextureHandle* texture_handle,
553                                     Perthread* thread_info, int subimage,
554                                     ustring dataname, TypeDesc datatype,
555                                     void* data)
556 {
557     bool ok
558         = m_imagecache->get_image_info((ImageCache::ImageHandle*)texture_handle,
559                                        (ImageCache::Perthread*)thread_info,
560                                        subimage, 0, dataname, datatype, data);
561     if (!ok) {
562         std::string err = m_imagecache->geterror();
563         if (!err.empty())
564             error("{}", err);
565     }
566     return ok;
567 }
568 
569 
570 
571 bool
get_imagespec(ustring filename,int subimage,ImageSpec & spec)572 TextureSystemImpl::get_imagespec(ustring filename, int subimage,
573                                  ImageSpec& spec)
574 {
575     bool ok = m_imagecache->get_imagespec(filename, spec, subimage);
576     if (!ok) {
577         std::string err = m_imagecache->geterror();
578         if (!err.empty())
579             error("{}", err);
580     }
581     return ok;
582 }
583 
584 
585 
586 bool
get_imagespec(TextureHandle * texture_handle,Perthread * thread_info,int subimage,ImageSpec & spec)587 TextureSystemImpl::get_imagespec(TextureHandle* texture_handle,
588                                  Perthread* thread_info, int subimage,
589                                  ImageSpec& spec)
590 {
591     bool ok
592         = m_imagecache->get_imagespec((ImageCache::ImageHandle*)texture_handle,
593                                       (ImageCache::Perthread*)thread_info, spec,
594                                       subimage);
595     if (!ok) {
596         std::string err = m_imagecache->geterror();
597         if (!err.empty())
598             error("{}", err);
599     }
600     return ok;
601 }
602 
603 
604 
605 const ImageSpec*
imagespec(ustring filename,int subimage)606 TextureSystemImpl::imagespec(ustring filename, int subimage)
607 {
608     const ImageSpec* spec = m_imagecache->imagespec(filename, subimage);
609     if (!spec)
610         error("{}", m_imagecache->geterror());
611     return spec;
612 }
613 
614 
615 
616 const ImageSpec*
imagespec(TextureHandle * texture_handle,Perthread * thread_info,int subimage)617 TextureSystemImpl::imagespec(TextureHandle* texture_handle,
618                              Perthread* thread_info, int subimage)
619 {
620     const ImageSpec* spec
621         = m_imagecache->imagespec((ImageCache::ImageHandle*)texture_handle,
622                                   (ImageCache::Perthread*)thread_info,
623                                   subimage);
624     if (!spec) {
625         std::string err = m_imagecache->geterror();
626         if (!err.empty())
627             error("{}", err);
628     }
629     return spec;
630 }
631 
632 
633 
634 bool
get_texels(ustring filename,TextureOpt & options,int miplevel,int xbegin,int xend,int ybegin,int yend,int zbegin,int zend,int chbegin,int chend,TypeDesc format,void * result)635 TextureSystemImpl::get_texels(ustring filename, TextureOpt& options,
636                               int miplevel, int xbegin, int xend, int ybegin,
637                               int yend, int zbegin, int zend, int chbegin,
638                               int chend, TypeDesc format, void* result)
639 {
640     PerThreadInfo* thread_info = m_imagecache->get_perthread_info();
641     TextureFile* texfile       = find_texturefile(filename, thread_info);
642     if (!texfile) {
643         error("Texture file \"{}\" not found", filename);
644         return false;
645     }
646     return get_texels((TextureHandle*)texfile, (Perthread*)thread_info, options,
647                       miplevel, xbegin, xend, ybegin, yend, zbegin, zend,
648                       chbegin, chend, format, result);
649 }
650 
651 
652 
653 bool
get_texels(TextureHandle * texture_handle_,Perthread * thread_info_,TextureOpt & options,int miplevel,int xbegin,int xend,int ybegin,int yend,int zbegin,int zend,int chbegin,int chend,TypeDesc format,void * result)654 TextureSystemImpl::get_texels(TextureHandle* texture_handle_,
655                               Perthread* thread_info_, TextureOpt& options,
656                               int miplevel, int xbegin, int xend, int ybegin,
657                               int yend, int zbegin, int zend, int chbegin,
658                               int chend, TypeDesc format, void* result)
659 {
660     PerThreadInfo* thread_info = m_imagecache->get_perthread_info(
661         (PerThreadInfo*)thread_info_);
662     TextureFile* texfile = verify_texturefile((TextureFile*)texture_handle_,
663                                               thread_info);
664     if (!texfile) {
665         error("Invalid texture handle NULL");
666         return false;
667     }
668 
669     if (texfile->broken()) {
670         if (texfile->errors_should_issue())
671             error("Invalid texture file \"{}\"", texfile->filename());
672         return false;
673     }
674     int subimage = options.subimage;
675     if (subimage < 0 || subimage >= texfile->subimages()) {
676         error("get_texel asked for nonexistent subimage {} of \"{}\"", subimage,
677               texfile->filename());
678         return false;
679     }
680     if (miplevel < 0 || miplevel >= texfile->miplevels(subimage)) {
681         if (texfile->errors_should_issue())
682             error("get_texel asked for nonexistent MIP level {} of \"{}\"",
683                   miplevel, texfile->filename());
684         return false;
685     }
686     const ImageSpec& spec(texfile->spec(subimage, miplevel));
687 
688     // FIXME -- this could be WAY more efficient than starting from
689     // scratch for each pixel within the rectangle.  Instead, we should
690     // grab a whole tile at a time and memcpy it rapidly.  But no point
691     // doing anything more complicated (not to mention bug-prone) until
692     // somebody reports this routine as being a bottleneck.
693     int nchannels      = chend - chbegin;
694     int actualchannels = Imath::clamp(spec.nchannels - chbegin, 0, nchannels);
695     int tile_chbegin = 0, tile_chend = spec.nchannels;
696     if (spec.nchannels > m_max_tile_channels) {
697         // For files with many channels, narrow the range we cache
698         tile_chbegin = chbegin;
699         tile_chend   = chbegin + actualchannels;
700     }
701     TileID tileid(*texfile, subimage, miplevel, 0, 0, 0, tile_chbegin,
702                   tile_chend);
703     size_t formatchannelsize = format.size();
704     size_t formatpixelsize   = nchannels * formatchannelsize;
705     size_t scanlinesize      = (xend - xbegin) * formatpixelsize;
706     size_t zplanesize        = (yend - ybegin) * scanlinesize;
707     imagesize_t npixelsread  = 0;
708     bool ok                  = true;
709     for (int z = zbegin; z < zend; ++z) {
710         if (z < spec.z || z >= (spec.z + std::max(spec.depth, 1))) {
711             // nonexistent planes
712             memset(result, 0, zplanesize);
713             result = (void*)((char*)result + zplanesize);
714             continue;
715         }
716         tileid.z(z - ((z - spec.z) % std::max(1, spec.tile_depth)));
717         for (int y = ybegin; y < yend; ++y) {
718             if (y < spec.y || y >= (spec.y + spec.height)) {
719                 // nonexistent scanlines
720                 memset(result, 0, scanlinesize);
721                 result = (void*)((char*)result + scanlinesize);
722                 continue;
723             }
724             tileid.y(y - ((y - spec.y) % spec.tile_height));
725             for (int x = xbegin; x < xend; ++x, ++npixelsread) {
726                 if (x < spec.x || x >= (spec.x + spec.width)) {
727                     // nonexistent columns
728                     memset(result, 0, formatpixelsize);
729                     result = (void*)((char*)result + formatpixelsize);
730                     continue;
731                 }
732                 tileid.x(x - ((x - spec.x) % spec.tile_width));
733                 ok &= find_tile(tileid, thread_info, npixelsread == 0);
734                 TileRef& tile(thread_info->tile);
735                 const char* data;
736                 if (tile
737                     && (data = (const char*)tile->data(x, y, z, chbegin))) {
738                     convert_types(texfile->datatype(subimage), data, format,
739                                   result, actualchannels);
740                     for (int c = actualchannels; c < nchannels; ++c)
741                         convert_types(TypeDesc::FLOAT, &options.fill, format,
742                                       (char*)result + c * formatchannelsize, 1);
743                 } else {
744                     memset(result, 0, formatpixelsize);
745                 }
746                 result = (void*)((char*)result + formatpixelsize);
747             }
748         }
749     }
750     if (!ok) {
751         std::string err = m_imagecache->geterror();
752         if (!err.empty())
753             error("{}", err);
754     }
755     return ok;
756 }
757 
758 
759 
760 std::string
geterror() const761 TextureSystemImpl::geterror() const
762 {
763     std::string e;
764     std::string* errptr = m_errormessage.get();
765     if (errptr) {
766         e = *errptr;
767         errptr->clear();
768     }
769     return e;
770 }
771 
772 
773 
774 void
append_error(const std::string & message) const775 TextureSystemImpl::append_error(const std::string& message) const
776 {
777     std::string* errptr = m_errormessage.get();
778     if (!errptr) {
779         errptr = new std::string;
780         m_errormessage.reset(errptr);
781     }
782     OIIO_DASSERT(
783         errptr->size() < 1024 * 1024 * 16
784         && "Accumulated error messages > 16MB. Try checking return codes!");
785     if (errptr->size())
786         *errptr += '\n';
787     *errptr += message;
788 }
789 
790 
791 
792 // Implementation of invalidate -- just invalidate the image cache.
793 void
invalidate(ustring filename,bool force)794 TextureSystemImpl::invalidate(ustring filename, bool force)
795 {
796     m_imagecache->invalidate(filename, force);
797 }
798 
799 
800 
801 // Implementation of invalidate -- just invalidate the image cache.
802 void
invalidate_all(bool force)803 TextureSystemImpl::invalidate_all(bool force)
804 {
805     m_imagecache->invalidate_all(force);
806 }
807 
808 
809 
810 void
close(ustring filename)811 TextureSystemImpl::close(ustring filename)
812 {
813     m_imagecache->close(filename);
814 }
815 
816 
817 
818 void
close_all()819 TextureSystemImpl::close_all()
820 {
821     m_imagecache->close_all();
822 }
823 
824 
825 
826 bool
missing_texture(TextureOpt & options,int nchannels,float * result,float * dresultds,float * dresultdt,float * dresultdr)827 TextureSystemImpl::missing_texture(TextureOpt& options, int nchannels,
828                                    float* result, float* dresultds,
829                                    float* dresultdt, float* dresultdr)
830 {
831     for (int c = 0; c < nchannels; ++c) {
832         if (options.missingcolor)
833             result[c] = options.missingcolor[c];
834         else
835             result[c] = options.fill;
836         if (dresultds)
837             dresultds[c] = 0;
838         if (dresultdt)
839             dresultdt[c] = 0;
840         if (dresultdr)
841             dresultdr[c] = 0;
842     }
843     if (options.missingcolor) {
844         // don't treat it as an error if missingcolor was supplied
845         (void)geterror();  // eat the error
846         return true;
847     } else {
848         return false;
849     }
850 }
851 
852 
853 
854 void
fill_gray_channels(const ImageSpec & spec,int nchannels,float * result,float * dresultds,float * dresultdt,float * dresultdr)855 TextureSystemImpl::fill_gray_channels(const ImageSpec& spec, int nchannels,
856                                       float* result, float* dresultds,
857                                       float* dresultdt, float* dresultdr)
858 {
859     int specchans = spec.nchannels;
860     if (specchans == 1 && nchannels >= 3) {
861         // Asked for RGB or RGBA, texture was just R...
862         // copy the one channel to G and B
863         *(simd::vfloat4*)result = simd::shuffle<0, 0, 0, 3>(
864             *(simd::vfloat4*)result);
865         if (dresultds) {
866             *(simd::vfloat4*)dresultds = simd::shuffle<0, 0, 0, 3>(
867                 *(simd::vfloat4*)dresultds);
868             *(simd::vfloat4*)dresultdt = simd::shuffle<0, 0, 0, 3>(
869                 *(simd::vfloat4*)dresultdt);
870             if (dresultdr)
871                 *(simd::vfloat4*)dresultdr = simd::shuffle<0, 0, 0, 3>(
872                     *(simd::vfloat4*)dresultdr);
873         }
874     } else if (specchans == 2 && nchannels == 4 && spec.alpha_channel == 1) {
875         // Asked for RGBA, texture was RA
876         // Shuffle into RRRA
877         *(simd::vfloat4*)result = simd::shuffle<0, 0, 0, 1>(
878             *(simd::vfloat4*)result);
879         if (dresultds) {
880             *(simd::vfloat4*)dresultds = simd::shuffle<0, 0, 0, 1>(
881                 *(simd::vfloat4*)dresultds);
882             *(simd::vfloat4*)dresultdt = simd::shuffle<0, 0, 0, 1>(
883                 *(simd::vfloat4*)dresultdt);
884             if (dresultdr)
885                 *(simd::vfloat4*)dresultdr = simd::shuffle<0, 0, 0, 1>(
886                     *(simd::vfloat4*)dresultdr);
887         }
888     }
889 }
890 
891 
892 
893 bool
texture(ustring filename,TextureOptions & options,Runflag * runflags,int beginactive,int endactive,VaryingRef<float> s,VaryingRef<float> t,VaryingRef<float> dsdx,VaryingRef<float> dtdx,VaryingRef<float> dsdy,VaryingRef<float> dtdy,int nchannels,float * result,float * dresultds,float * dresultdt)894 TextureSystemImpl::texture(ustring filename, TextureOptions& options,
895                            Runflag* runflags, int beginactive, int endactive,
896                            VaryingRef<float> s, VaryingRef<float> t,
897                            VaryingRef<float> dsdx, VaryingRef<float> dtdx,
898                            VaryingRef<float> dsdy, VaryingRef<float> dtdy,
899                            int nchannels, float* result, float* dresultds,
900                            float* dresultdt)
901 {
902     Perthread* thread_info        = get_perthread_info();
903     TextureHandle* texture_handle = get_texture_handle(filename, thread_info);
904     return texture(texture_handle, thread_info, options, runflags, beginactive,
905                    endactive, s, t, dsdx, dtdx, dsdy, dtdy, nchannels, result,
906                    dresultds, dresultdt);
907 }
908 
909 
910 
911 bool
texture(TextureHandle * texture_handle,Perthread * thread_info,TextureOptions & options,Runflag * runflags,int beginactive,int endactive,VaryingRef<float> s,VaryingRef<float> t,VaryingRef<float> dsdx,VaryingRef<float> dtdx,VaryingRef<float> dsdy,VaryingRef<float> dtdy,int nchannels,float * result,float * dresultds,float * dresultdt)912 TextureSystemImpl::texture(TextureHandle* texture_handle,
913                            Perthread* thread_info, TextureOptions& options,
914                            Runflag* runflags, int beginactive, int endactive,
915                            VaryingRef<float> s, VaryingRef<float> t,
916                            VaryingRef<float> dsdx, VaryingRef<float> dtdx,
917                            VaryingRef<float> dsdy, VaryingRef<float> dtdy,
918                            int nchannels, float* result, float* dresultds,
919                            float* dresultdt)
920 {
921     if (!texture_handle)
922         return false;
923     bool ok = true;
924     result += beginactive * nchannels;
925     if (dresultds) {
926         dresultds += beginactive * nchannels;
927         dresultdt += beginactive * nchannels;
928     }
929     for (int i = beginactive; i < endactive; ++i) {
930         if (runflags[i]) {
931             TextureOpt opt(options, i);
932             ok &= texture(texture_handle, thread_info, opt, s[i], t[i], dsdx[i],
933                           dtdx[i], dsdy[i], dtdy[i], nchannels, result,
934                           dresultds, dresultdt);
935         }
936         result += nchannels;
937         if (dresultds) {
938             dresultds += nchannels;
939             dresultdt += nchannels;
940         }
941     }
942     return ok;
943 }
944 
945 
946 
947 bool
texture(ustring filename,TextureOpt & options,float s,float t,float dsdx,float dtdx,float dsdy,float dtdy,int nchannels,float * result,float * dresultds,float * dresultdt)948 TextureSystemImpl::texture(ustring filename, TextureOpt& options, float s,
949                            float t, float dsdx, float dtdx, float dsdy,
950                            float dtdy, int nchannels, float* result,
951                            float* dresultds, float* dresultdt)
952 {
953     PerThreadInfo* thread_info = m_imagecache->get_perthread_info();
954     TextureFile* texturefile   = find_texturefile(filename, thread_info);
955     return texture((TextureHandle*)texturefile, (Perthread*)thread_info,
956                    options, s, t, dsdx, dtdx, dsdy, dtdy, nchannels, result,
957                    dresultds, dresultdt);
958 }
959 
960 
961 
962 bool
texture(TextureHandle * texture_handle_,Perthread * thread_info_,TextureOpt & options,float s,float t,float dsdx,float dtdx,float dsdy,float dtdy,int nchannels,float * result,float * dresultds,float * dresultdt)963 TextureSystemImpl::texture(TextureHandle* texture_handle_,
964                            Perthread* thread_info_, TextureOpt& options,
965                            float s, float t, float dsdx, float dtdx, float dsdy,
966                            float dtdy, int nchannels, float* result,
967                            float* dresultds, float* dresultdt)
968 {
969     // Handle >4 channel lookups by recursion.
970     if (nchannels > 4) {
971         int save_firstchannel = options.firstchannel;
972         while (nchannels) {
973             int n   = std::min(nchannels, 4);
974             bool ok = texture(texture_handle_, thread_info_, options, s, t,
975                               dsdx, dtdx, dsdy, dtdy, n /* chans */, result,
976                               dresultds, dresultdt);
977             if (!ok)
978                 return false;
979             result += n;
980             if (dresultds)
981                 dresultds += n;
982             if (dresultdt)
983                 dresultdt += n;
984             options.firstchannel += n;
985             nchannels -= n;
986         }
987         options.firstchannel = save_firstchannel;  // restore what we changed
988         return true;
989     }
990 
991     static const texture_lookup_prototype lookup_functions[] = {
992         // Must be in the same order as Mipmode enum
993         &TextureSystemImpl::texture_lookup,
994         &TextureSystemImpl::texture_lookup_nomip,
995         &TextureSystemImpl::texture_lookup_trilinear_mipmap,
996         &TextureSystemImpl::texture_lookup_trilinear_mipmap,
997         &TextureSystemImpl::texture_lookup
998     };
999     texture_lookup_prototype lookup = lookup_functions[(int)options.mipmode];
1000 
1001     PerThreadInfo* thread_info = m_imagecache->get_perthread_info(
1002         (PerThreadInfo*)thread_info_);
1003     TextureFile* texturefile = (TextureFile*)texture_handle_;
1004     if (texturefile->is_udim())
1005         texturefile = m_imagecache->resolve_udim(texturefile, thread_info, s,
1006                                                  t);
1007 
1008     texturefile = verify_texturefile(texturefile, thread_info);
1009 
1010     ImageCacheStatistics& stats(thread_info->m_stats);
1011     ++stats.texture_batches;
1012     ++stats.texture_queries;
1013 
1014     if (!texturefile || texturefile->broken())
1015         return missing_texture(options, nchannels, result, dresultds,
1016                                dresultdt);
1017 
1018     if (!options.subimagename.empty()) {
1019         // If subimage was specified by name, figure out its index.
1020         int s = m_imagecache->subimage_from_name(texturefile,
1021                                                  options.subimagename);
1022         if (s < 0) {
1023             error("Unknown subimage \"{}\" in texture \"{}\"",
1024                   options.subimagename, texturefile->filename());
1025             return missing_texture(options, nchannels, result, dresultds,
1026                                    dresultdt);
1027         }
1028         options.subimage = s;
1029         options.subimagename.clear();
1030     }
1031 
1032     const ImageCacheFile::SubimageInfo& subinfo(
1033         texturefile->subimageinfo(options.subimage));
1034     const ImageSpec& spec(texturefile->spec(options.subimage, 0));
1035 
1036     int actualchannels = Imath::clamp(spec.nchannels - options.firstchannel, 0,
1037                                       nchannels);
1038 
1039     // Figure out the wrap functions
1040     if (options.swrap == TextureOpt::WrapDefault)
1041         options.swrap = (TextureOpt::Wrap)texturefile->swrap();
1042     if (options.swrap == TextureOpt::WrapPeriodic && ispow2(spec.width))
1043         options.swrap = TextureOpt::WrapPeriodicPow2;
1044     if (options.twrap == TextureOpt::WrapDefault)
1045         options.twrap = (TextureOpt::Wrap)texturefile->twrap();
1046     if (options.twrap == TextureOpt::WrapPeriodic && ispow2(spec.height))
1047         options.twrap = TextureOpt::WrapPeriodicPow2;
1048 
1049     if (subinfo.is_constant_image && options.swrap != TextureOpt::WrapBlack
1050         && options.twrap != TextureOpt::WrapBlack) {
1051         // Lookup of constant color texture, non-black wrap -- skip all the
1052         // hard stuff.
1053         for (int c = 0; c < actualchannels; ++c)
1054             result[c] = subinfo.average_color[c + options.firstchannel];
1055         for (int c = actualchannels; c < nchannels; ++c)
1056             result[c] = options.fill;
1057         if (dresultds) {
1058             // Derivs are always 0 from a constant texture lookup
1059             for (int c = 0; c < nchannels; ++c) {
1060                 dresultds[c] = 0.0f;
1061                 dresultdt[c] = 0.0f;
1062             }
1063         }
1064         if (actualchannels < nchannels && options.firstchannel == 0
1065             && m_gray_to_rgb)
1066             fill_gray_channels(spec, nchannels, result, dresultds, dresultdt);
1067         return true;
1068     }
1069 
1070     if (m_flip_t) {
1071         t = 1.0f - t;
1072         dtdx *= -1.0f;
1073         dtdy *= -1.0f;
1074     }
1075 
1076     if (!subinfo.full_pixel_range) {  // remap st for overscan or crop
1077         s = s * subinfo.sscale + subinfo.soffset;
1078         dsdx *= subinfo.sscale;
1079         dsdy *= subinfo.sscale;
1080         t = t * subinfo.tscale + subinfo.toffset;
1081         dtdx *= subinfo.tscale;
1082         dtdy *= subinfo.tscale;
1083     }
1084 
1085     bool ok;
1086     // Everything from the lookup function on down will assume that there
1087     // is space for a vfloat4 in all of the result locations, so if that's
1088     // not the case (or it's not properly aligned), make a local copy and
1089     // then copy back when we're done.
1090     // FIXME -- is this necessary at all? Can we eliminate the conditional
1091     // and the duplicated code by always doing the simd copy thing? Come
1092     // back here and time whether for 4-channel textures it really matters.
1093     bool simd_copy = (nchannels != 4 || ((size_t)result & 0x0f)
1094                       || ((size_t)dresultds & 0x0f) || /* FIXME -- necessary? */
1095                       ((size_t)dresultdt & 0x0f));
1096     if (simd_copy) {
1097         simd::vfloat4 result_simd, dresultds_simd, dresultdt_simd;
1098         float* OIIO_RESTRICT saved_dresultds = dresultds;
1099         float* OIIO_RESTRICT saved_dresultdt = dresultdt;
1100         if (saved_dresultds) {
1101             dresultds = (float*)&dresultds_simd;
1102             dresultdt = (float*)&dresultdt_simd;
1103         }
1104         ok = (this->*lookup)(*texturefile, thread_info, options, nchannels,
1105                              actualchannels, s, t, dsdx, dtdx, dsdy, dtdy,
1106                              (float*)&result_simd, dresultds, dresultdt);
1107         if (actualchannels < nchannels && options.firstchannel == 0
1108             && m_gray_to_rgb)
1109             fill_gray_channels(spec, nchannels, (float*)&result_simd, dresultds,
1110                                dresultdt);
1111         result_simd.store(result, nchannels);
1112         if (saved_dresultds) {
1113             if (m_flip_t)
1114                 dresultdt_simd = -dresultdt_simd;
1115             dresultds_simd.store(saved_dresultds, nchannels);
1116             dresultdt_simd.store(saved_dresultdt, nchannels);
1117         }
1118     } else {
1119         // All provided output slots are aligned 4-floats, use them directly
1120         ok = (this->*lookup)(*texturefile, thread_info, options, nchannels,
1121                              actualchannels, s, t, dsdx, dtdx, dsdy, dtdy,
1122                              result, dresultds, dresultdt);
1123         if (actualchannels < nchannels && options.firstchannel == 0
1124             && m_gray_to_rgb)
1125             fill_gray_channels(spec, nchannels, result, dresultds, dresultdt);
1126         if (m_flip_t && dresultdt)
1127             *(vfloat4*)dresultdt = -(*(vfloat4*)dresultdt);
1128     }
1129 
1130     return ok;
1131 }
1132 
1133 
1134 
1135 bool
texture(ustring filename,TextureOptBatch & options,Tex::RunMask mask,const float * s,const float * t,const float * dsdx,const float * dtdx,const float * dsdy,const float * dtdy,int nchannels,float * result,float * dresultds,float * dresultdt)1136 TextureSystemImpl::texture(ustring filename, TextureOptBatch& options,
1137                            Tex::RunMask mask, const float* s, const float* t,
1138                            const float* dsdx, const float* dtdx,
1139                            const float* dsdy, const float* dtdy, int nchannels,
1140                            float* result, float* dresultds, float* dresultdt)
1141 {
1142     Perthread* thread_info        = get_perthread_info();
1143     TextureHandle* texture_handle = get_texture_handle(filename, thread_info);
1144     return texture(texture_handle, thread_info, options, mask, s, t, dsdx, dtdx,
1145                    dsdy, dtdy, nchannels, result, dresultds, dresultdt);
1146 }
1147 
1148 
1149 bool
texture(TextureHandle * texture_handle,Perthread * thread_info,TextureOptBatch & options,Tex::RunMask mask,const float * s,const float * t,const float * dsdx,const float * dtdx,const float * dsdy,const float * dtdy,int nchannels,float * result,float * dresultds,float * dresultdt)1150 TextureSystemImpl::texture(TextureHandle* texture_handle,
1151                            Perthread* thread_info, TextureOptBatch& options,
1152                            Tex::RunMask mask, const float* s, const float* t,
1153                            const float* dsdx, const float* dtdx,
1154                            const float* dsdy, const float* dtdy, int nchannels,
1155                            float* result, float* dresultds, float* dresultdt)
1156 {
1157     // (FIXME) CHEAT! Texture points individually
1158     TextureOpt opt;
1159     opt.firstchannel        = options.firstchannel;
1160     opt.subimage            = options.subimage;
1161     opt.subimagename        = options.subimagename;
1162     opt.swrap               = (TextureOpt::Wrap)options.swrap;
1163     opt.twrap               = (TextureOpt::Wrap)options.twrap;
1164     opt.mipmode             = (TextureOpt::MipMode)options.mipmode;
1165     opt.interpmode          = (TextureOpt::InterpMode)options.interpmode;
1166     opt.anisotropic         = options.anisotropic;
1167     opt.conservative_filter = options.conservative_filter;
1168     opt.fill                = options.fill;
1169     opt.missingcolor        = options.missingcolor;
1170     // rwrap not needed for 2D texture
1171 
1172     bool ok          = true;
1173     Tex::RunMask bit = 1;
1174     for (int i = 0; i < Tex::BatchWidth; ++i, bit <<= 1) {
1175         float r[4], drds[4], drdt[4];  // temp result
1176         if (mask & bit) {
1177             opt.sblur  = options.sblur[i];
1178             opt.tblur  = options.tblur[i];
1179             opt.swidth = options.swidth[i];
1180             opt.twidth = options.twidth[i];
1181             // rblur, rwidth not needed for 2D texture
1182             if (dresultds) {
1183                 ok &= texture(texture_handle, thread_info, opt, s[i], t[i],
1184                               dsdx[i], dtdx[i], dsdy[i], dtdy[i], nchannels, r,
1185                               drds, drdt);
1186                 for (int c = 0; c < nchannels; ++c) {
1187                     result[c * Tex::BatchWidth + i]    = r[c];
1188                     dresultds[c * Tex::BatchWidth + i] = drds[c];
1189                     dresultdt[c * Tex::BatchWidth + i] = drdt[c];
1190                 }
1191             } else {
1192                 ok &= texture(texture_handle, thread_info, opt, s[i], t[i],
1193                               dsdx[i], dtdx[i], dsdy[i], dtdy[i], nchannels, r);
1194                 for (int c = 0; c < nchannels; ++c) {
1195                     result[c * Tex::BatchWidth + i] = r[c];
1196                 }
1197             }
1198         }
1199     }
1200     return ok;
1201 }
1202 
1203 
1204 
1205 bool
texture_lookup_nomip(TextureFile & texturefile,PerThreadInfo * thread_info,TextureOpt & options,int nchannels_result,int actualchannels,float s,float t,float,float,float,float,float * result,float * dresultds,float * dresultdt)1206 TextureSystemImpl::texture_lookup_nomip(
1207     TextureFile& texturefile, PerThreadInfo* thread_info, TextureOpt& options,
1208     int nchannels_result, int actualchannels, float s, float t, float /*dsdx*/,
1209     float /*dtdx*/, float /*dsdy*/, float /*dtdy*/, float* result,
1210     float* dresultds, float* dresultdt)
1211 {
1212     // Initialize results to 0.  We'll add from here on as we sample.
1213     OIIO_DASSERT((dresultds == NULL) == (dresultdt == NULL));
1214     ((simd::vfloat4*)result)->clear();
1215     if (dresultds) {
1216         ((simd::vfloat4*)dresultds)->clear();
1217         ((simd::vfloat4*)dresultdt)->clear();
1218     }
1219 
1220     static const sampler_prototype sample_functions[] = {
1221         // Must be in the same order as InterpMode enum
1222         &TextureSystemImpl::sample_closest,
1223         &TextureSystemImpl::sample_bilinear,
1224         &TextureSystemImpl::sample_bicubic,
1225         &TextureSystemImpl::sample_bilinear,
1226     };
1227     sampler_prototype sampler      = sample_functions[(int)options.interpmode];
1228     OIIO_SIMD4_ALIGN float sval[4] = { s, 0.0f, 0.0f, 0.0f };
1229     OIIO_SIMD4_ALIGN float tval[4] = { t, 0.0f, 0.0f, 0.0f };
1230     static OIIO_SIMD4_ALIGN float weight[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
1231     ImageCacheFile::SubimageInfo& subinfo(
1232         texturefile.subimageinfo(options.subimage));
1233     int min_mip_level = subinfo.min_mip_level;
1234     bool ok = (this->*sampler)(1, sval, tval, min_mip_level, texturefile,
1235                                thread_info, options, nchannels_result,
1236                                actualchannels, weight, (vfloat4*)result,
1237                                (vfloat4*)dresultds, (vfloat4*)dresultdt);
1238 
1239     // Update stats
1240     ImageCacheStatistics& stats(thread_info->m_stats);
1241     ++stats.aniso_queries;
1242     ++stats.aniso_probes;
1243     switch (options.interpmode) {
1244     case TextureOpt::InterpClosest: ++stats.closest_interps; break;
1245     case TextureOpt::InterpBilinear: ++stats.bilinear_interps; break;
1246     case TextureOpt::InterpBicubic: ++stats.cubic_interps; break;
1247     case TextureOpt::InterpSmartBicubic: ++stats.bilinear_interps; break;
1248     }
1249     return ok;
1250 }
1251 
1252 
1253 
1254 // Scale the derivs as dictated by 'width', and also make sure
1255 // they are all some minimum value to make the subsequent math clean.
1256 inline void
adjust_width(float & dsdx,float & dtdx,float & dsdy,float & dtdy,float swidth,float twidth)1257 adjust_width(float& dsdx, float& dtdx, float& dsdy, float& dtdy, float swidth,
1258              float twidth /*, float sblur=0, float tblur=0*/)
1259 {
1260     // Trust user not to use nonsensical width<0
1261     dsdx *= swidth;
1262     dtdx *= twidth;
1263     dsdy *= swidth;
1264     dtdy *= twidth;
1265 #if 0
1266     // You might think that blur should be added to the original derivs and
1267     // then just carry on. And sometimes it looks fine, but for some
1268     // situations the results are absurdly wrong, as revealed by the unit
1269     // test visualizations. I'm leaving this code here but disabled as a
1270     // reminder to myself not to try it again. It's wrong.
1271     if (sblur+tblur != 0.0f /* avoid the work when blur is zero */) {
1272         dsdx += std::copysign (sblur, dsdx);
1273         dsdy += std::copysign (sblur, dsdy);
1274         dtdx += std::copysign (tblur, dtdx);
1275         dtdy += std::copysign (tblur, dtdy);
1276     }
1277 #endif
1278 
1279     // Clamp degenerate derivatives so they don't cause mathematical problems
1280     static const float eps = 1.0e-8f, eps2 = eps * eps;
1281     float dxlen2 = dsdx * dsdx + dtdx * dtdx;
1282     float dylen2 = dsdy * dsdy + dtdy * dtdy;
1283     if (dxlen2 < eps2) {  // Tiny dx
1284         if (dylen2 < eps2) {
1285             // Tiny dx and dy: Essentially point sampling.  Substitute a
1286             // tiny but finite filter.
1287             dsdx   = eps;
1288             dsdy   = 0;
1289             dtdx   = 0;
1290             dtdy   = eps;
1291             dxlen2 = dylen2 = eps2;
1292         } else {
1293             // Tiny dx, sane dy -- pick a small dx orthogonal to dy, but
1294             // of length eps.
1295             float scale = eps / sqrtf(dylen2);
1296             dsdx        = dtdy * scale;
1297             dtdx        = -dsdy * scale;
1298             dxlen2      = eps2;
1299         }
1300     } else if (dylen2 < eps2) {
1301         // Tiny dy, sane dx -- pick a small dy orthogonal to dx, but of
1302         // length eps.
1303         float scale = eps / sqrtf(dxlen2);
1304         dsdy        = -dtdx * scale;
1305         dtdy        = dsdx * scale;
1306         dylen2      = eps2;
1307     }
1308 }
1309 
1310 
1311 
1312 // Adjust the ellipse major and minor axes based on the blur, if nonzero.
1313 // Trust user not to use nonsensical blur<0
1314 //
1315 // FIXME: This is not "correct," but it's probably good enough. There is
1316 // probably a more principled way to deal with blur (including anisotropic)
1317 // by figuring out how to transform/scale the ellipse or consider the blur
1318 // to be like a convolution of gaussians. Some day, somebody ought to come
1319 // back to this and solve it better.
1320 inline void
adjust_blur(float & majorlength,float & minorlength,float & theta,float sblur,float tblur)1321 adjust_blur(float& majorlength, float& minorlength, float& theta, float sblur,
1322             float tblur)
1323 {
1324     if (sblur + tblur != 0.0f /* avoid the work when blur is zero */) {
1325         // Carefully add blur to the right derivative components in the
1326         // right proportions -- merely adding the same amount of blur
1327         // to all four derivatives blurs too much at some angles.
1328         OIIO_DASSERT(majorlength > 0.0f && minorlength > 0.0f);
1329         float sintheta, costheta;
1330 #ifdef TEX_FAST_MATH
1331         fast_sincos(theta, &sintheta, &costheta);
1332 #else
1333         sincos(theta, &sintheta, &costheta);
1334 #endif
1335         sintheta = fabsf(sintheta);
1336         costheta = fabsf(costheta);
1337         majorlength += sblur * costheta + tblur * sintheta;
1338         minorlength += sblur * sintheta + tblur * costheta;
1339 #if 1
1340         if (minorlength > majorlength) {
1341             // Wildly uneven sblur and tblur values might swap which axis is
1342             // thicker. For example, if the major axis is vertical (thin
1343             // ellipse) but you have so much horizontal blur that it turns
1344             // into a wide ellipse. My hacky solution is to notice when this
1345             // happens and just swap the major and minor axes. I'm actually
1346             // not convinced this is the best solution -- in the unit test
1347             // visualizations, it looks great (and better than not doing it)
1348             // for most ordinary situations, but when the derivs indicate
1349             // extreme diagonal sheer (like when the dx and dy are almost
1350             // parallel, rather than the expected almost perpendicular),
1351             // it's less than fully satisfactory. But I don't know a better
1352             // solution. And, I dunno, maybe it's even right; it's very hard
1353             // to reason about what the right thing to do is for that case,
1354             // for very stretched blur.
1355             std::swap(minorlength, majorlength);
1356             theta += M_PI_2;
1357         }
1358 #endif
1359     }
1360 }
1361 
1362 
1363 
1364 // For the given texture file, options, and ellipse major and minor
1365 // lengths and aspect ratio, compute the two MIPmap levels and
1366 // respective weights to use for a texture lookup.  The general strategy
1367 // is that we choose the MIPmap level so that the minor axis length is
1368 // pixel-sized (and then we will sample several times along the major
1369 // axis in order to handle anisotropy), but we make adjustments in
1370 // corner cases where the ideal sampling is too high or too low resolution
1371 // given the MIPmap levels we have available.
1372 inline void
compute_miplevels(TextureSystemImpl::TextureFile & texturefile,TextureOpt & options,float majorlength,float minorlength,float & aspect,int * miplevel,float * levelweight)1373 compute_miplevels(TextureSystemImpl::TextureFile& texturefile,
1374                   TextureOpt& options, float majorlength, float minorlength,
1375                   float& aspect, int* miplevel, float* levelweight)
1376 {
1377     ImageCacheFile::SubimageInfo& subinfo(
1378         texturefile.subimageinfo(options.subimage));
1379     float levelblend  = 0.0f;
1380     int nmiplevels    = (int)subinfo.levels.size();
1381     int min_mip_level = subinfo.min_mip_level;
1382     for (int m = min_mip_level; m < nmiplevels; ++m) {
1383         // Compute the filter size (minor axis) in raster space at this
1384         // MIP level.  We use the smaller of the two texture resolutions,
1385         // which is better than just using one, but a more principled
1386         // approach is desired but remains elusive.  FIXME.
1387         float filtwidth_ras = minorlength
1388                               * std::min(subinfo.spec(m).width,
1389                                          subinfo.spec(m).height);
1390         // Once the filter width is smaller than one texel at this level,
1391         // we've gone too far, so we know that we want to interpolate the
1392         // previous level and the current level.  Note that filtwidth_ras
1393         // is expected to be >= 0.5, or would have stopped one level ago.
1394         if (filtwidth_ras <= 1.0f) {
1395             miplevel[0] = m - 1;
1396             miplevel[1] = m;
1397             levelblend  = Imath::clamp(2.0f * filtwidth_ras - 1.0f, 0.0f, 1.0f);
1398             break;
1399         }
1400     }
1401 
1402     if (miplevel[1] < 0) {
1403         // We'd like to blur even more, but make due with the coarsest
1404         // MIP level.
1405         miplevel[0] = nmiplevels - 1;
1406         miplevel[1] = miplevel[0];
1407         levelblend  = 0;
1408     } else if (miplevel[0] < min_mip_level) {
1409         // We wish we had even more resolution than the finest MIP level,
1410         // but tough for us.
1411         miplevel[0] = min_mip_level;
1412         miplevel[1] = min_mip_level;
1413         levelblend  = 0;
1414         // It's possible that minorlength is degenerate, giving an aspect
1415         // ratio that implies a huge nsamples, which is pointless if those
1416         // samples are too close.  So if minorlength is less than 1/2 texel
1417         // at the finest resolution, clamp it and recalculate aspect.
1418         int r = std::max(subinfo.spec(0).full_width,
1419                          subinfo.spec(0).full_height);
1420         if (minorlength * r < 0.5f) {
1421             aspect = Imath::clamp(majorlength * r * 2.0f, 1.0f,
1422                                   float(options.anisotropic));
1423         }
1424     }
1425     if (options.mipmode == TextureOpt::MipModeOneLevel) {
1426         miplevel[0] = miplevel[1];
1427         levelblend  = 0;
1428     }
1429     levelweight[0] = 1.0f - levelblend;
1430     levelweight[1] = levelblend;
1431 }
1432 
1433 
1434 
1435 bool
texture_lookup_trilinear_mipmap(TextureFile & texturefile,PerThreadInfo * thread_info,TextureOpt & options,int nchannels_result,int actualchannels,float s,float t,float dsdx,float dtdx,float dsdy,float dtdy,float * result,float * dresultds,float * dresultdt)1436 TextureSystemImpl::texture_lookup_trilinear_mipmap(
1437     TextureFile& texturefile, PerThreadInfo* thread_info, TextureOpt& options,
1438     int nchannels_result, int actualchannels, float s, float t, float dsdx,
1439     float dtdx, float dsdy, float dtdy, float* result, float* dresultds,
1440     float* dresultdt)
1441 {
1442     // Initialize results to 0.  We'll add from here on as we sample.
1443     OIIO_DASSERT((dresultds == NULL) == (dresultdt == NULL));
1444     ((simd::vfloat4*)result)->clear();
1445     if (dresultds) {
1446         ((simd::vfloat4*)dresultds)->clear();
1447         ((simd::vfloat4*)dresultdt)->clear();
1448     }
1449 
1450     adjust_width(dsdx, dtdx, dsdy, dtdy, options.swidth, options.twidth);
1451 
1452     // Determine the MIP-map level(s) we need: we will blend
1453     //    data(miplevel[0]) * (1-levelblend) + data(miplevel[1]) * levelblend
1454     int miplevel[2]      = { -1, -1 };
1455     float levelweight[2] = { 0, 0 };
1456     float sfilt          = std::max(fabsf(dsdx), fabsf(dsdy));
1457     float tfilt          = std::max(fabsf(dtdx), fabsf(dtdy));
1458     float filtwidth      = options.conservative_filter ? std::max(sfilt, tfilt)
1459                                                        : std::min(sfilt, tfilt);
1460     // account for blur
1461     filtwidth += std::max(options.sblur, options.tblur);
1462     float aspect = 1.0f;
1463     compute_miplevels(texturefile, options, filtwidth, filtwidth, aspect,
1464                       miplevel, levelweight);
1465 
1466     static const sampler_prototype sample_functions[] = {
1467         // Must be in the same order as InterpMode enum
1468         &TextureSystemImpl::sample_closest,
1469         &TextureSystemImpl::sample_bilinear,
1470         &TextureSystemImpl::sample_bicubic,
1471         &TextureSystemImpl::sample_bilinear,
1472     };
1473     sampler_prototype sampler = sample_functions[(int)options.interpmode];
1474 
1475     // FIXME -- support for smart cubic?
1476 
1477     OIIO_SIMD4_ALIGN float sval[4]   = { s, 0.0f, 0.0f, 0.0f };
1478     OIIO_SIMD4_ALIGN float tval[4]   = { t, 0.0f, 0.0f, 0.0f };
1479     OIIO_SIMD4_ALIGN float weight[4] = { 1.0f, 0.0f, 0.0f, 0.0f };
1480     bool ok                          = true;
1481     int npointson                    = 0;
1482     vfloat4 r_sum, drds_sum, drdt_sum;
1483     r_sum.clear();
1484     if (dresultds) {
1485         drds_sum.clear();
1486         drdt_sum.clear();
1487     }
1488     for (int level = 0; level < 2; ++level) {
1489         if (!levelweight[level])  // No contribution from this level, skip it
1490             continue;
1491         vfloat4 r, drds, drdt;
1492         ok &= (this->*sampler)(1, sval, tval, miplevel[level], texturefile,
1493                                thread_info, options, nchannels_result,
1494                                actualchannels, weight, &r,
1495                                dresultds ? &drds : NULL,
1496                                dresultds ? &drdt : NULL);
1497         ++npointson;
1498         vfloat4 lw = levelweight[level];
1499         r_sum += lw * r;
1500         if (dresultds) {
1501             drds_sum += lw * drds;
1502             drdt_sum += lw * drdt;
1503         }
1504     }
1505 
1506     *(simd::vfloat4*)(result) = r_sum;
1507     if (dresultds) {
1508         *(simd::vfloat4*)(dresultds) = drds_sum;
1509         *(simd::vfloat4*)(dresultdt) = drdt_sum;
1510     }
1511 
1512     // Update stats
1513     ImageCacheStatistics& stats(thread_info->m_stats);
1514     stats.aniso_queries += npointson;
1515     stats.aniso_probes += npointson;
1516     switch (options.interpmode) {
1517     case TextureOpt::InterpClosest: stats.closest_interps += npointson; break;
1518     case TextureOpt::InterpBilinear: stats.bilinear_interps += npointson; break;
1519     case TextureOpt::InterpBicubic: stats.cubic_interps += npointson; break;
1520     case TextureOpt::InterpSmartBicubic:
1521         stats.bilinear_interps += npointson;
1522         break;
1523     }
1524     return ok;
1525 }
1526 
1527 
1528 
1529 // Given pixel derivatives, calculate major and minor axis lengths and
1530 // major axis orientation angle of the ellipse.  See Greene's EWA paper
1531 // or Mavridis (2011).  If ABCF is non-NULL, it should point to space
1532 // for 4 floats, which will be used to store the ellipse parameters A,
1533 // B, C, F.
1534 inline void
ellipse_axes(float dsdx,float dtdx,float dsdy,float dtdy,float & majorlength,float & minorlength,float & theta,float * ABCF=NULL)1535 ellipse_axes(float dsdx, float dtdx, float dsdy, float dtdy, float& majorlength,
1536              float& minorlength, float& theta, float* ABCF = NULL)
1537 {
1538     float dsdx2 = dsdx * dsdx;
1539     float dtdx2 = dtdx * dtdx;
1540     float dsdy2 = dsdy * dsdy;
1541     float dtdy2 = dtdy * dtdy;
1542     double A    = dtdx2 + dtdy2;
1543     double B    = -2.0 * (dsdx * dtdx + dsdy * dtdy);
1544     double C    = dsdx2 + dsdy2;
1545     double root = hypot(A - C, B);  // equivalent: sqrt (A*A - 2AC + C*C + B*B)
1546     double Aprime = (A + C - root) * 0.5;
1547     double Cprime = (A + C + root) * 0.5;
1548 #if 0
1549     double F = A*C - B*B*0.25;
1550     majorlength = std::min(safe_sqrt (float(F / Aprime)), 1000.0f);
1551     minorlength = std::min(safe_sqrt (float(F / Cprime)), 1000.0f);
1552 #else
1553     // Wolfram says that this is equivalent to:
1554     majorlength = std::min(safe_sqrt(float(Cprime)), 1000.0f);
1555     minorlength = std::min(safe_sqrt(float(Aprime)), 1000.0f);
1556 #endif
1557 #ifdef TEX_FAST_MATH
1558     theta = fast_atan2(B, A - C) * 0.5f + float(M_PI_2);
1559 #else
1560     theta = atan2(B, A - C) * 0.5 + M_PI_2;
1561 #endif
1562     if (ABCF) {
1563         // Optionally store the ellipse equation parameters, the ellipse
1564         // is given by: A*u^2 + B*u*v + C*v^2 < 1
1565         double F    = A * C - B * B * 0.25;
1566         double Finv = 1.0f / F;
1567         ABCF[0]     = A * Finv;
1568         ABCF[1]     = B * Finv;
1569         ABCF[2]     = C * Finv;
1570         ABCF[3]     = F;
1571     }
1572 
1573     // N.B. If the derivs passed in are the full pixel-to-pixel
1574     // derivatives, then majorlength/minorlength are the (diameter) axes
1575     // of the ellipse; if the derivs are the "to edge of pixel" 1/2
1576     // length derivs, then majorlength/minorlength are the major and
1577     // minor radii of the ellipse.  We do the former!  So it's important
1578     // to consider that factor of 2 in compute_ellipse_sampling.
1579 }
1580 
1581 
1582 
1583 // Given the aspect ratio, major axis orientation angle, and axis lengths,
1584 // calculate the smajor & tmajor values that give the orientation of the
1585 // line on which samples should be distributed.  If there are n samples,
1586 // they should be positioned as:
1587 //     p_i = 2*(i+0.5)/n - 1.0;
1588 //     sample_i = (s + p_i*smajor, t + p_i*tmajor)
1589 // If a weights ptr is supplied, it will be filled in [0..nsamples-1] with
1590 // normalized weights for each sample.
1591 inline int
compute_ellipse_sampling(float aspect,float theta,float majorlength,float minorlength,float & smajor,float & tmajor,float & invsamples,float * weights=NULL)1592 compute_ellipse_sampling(float aspect, float theta, float majorlength,
1593                          float minorlength, float& smajor, float& tmajor,
1594                          float& invsamples, float* weights = NULL)
1595 {
1596     // Compute the sin and cos of the sampling direction, given major
1597     // axis angle
1598     sincos(theta, &tmajor, &smajor);
1599     float L = 2.0f * (majorlength - minorlength);
1600     smajor *= L;
1601     tmajor *= L;
1602 #if 1
1603     // This is the theoretically correct number of samples.
1604     int nsamples = std::max(1, int(2.0f * aspect - 1.0f));
1605 #else
1606     int nsamples = std::max(1, (int)ceilf(aspect - 0.3f));
1607     // This approach does fewer samples for high aspect ratios, but I see
1608     // artifacts.
1609 #endif
1610     invsamples = 1.0f / nsamples;
1611     if (weights) {
1612         if (nsamples == 1) {
1613             weights[0] = 1.0f;
1614         } else if (nsamples == 2) {
1615             weights[0] = 0.5f;
1616             weights[1] = 0.5f;
1617         } else {
1618             float scale = majorlength / L;  // 1/(L/major)
1619             for (int i = 0, e = (nsamples + 1) / 2; i < e; ++i) {
1620                 float x = (2.0f * (i + 0.5f) * invsamples - 1.0f) * scale;
1621 #ifdef TEX_FAST_MATH
1622                 float w = fast_exp(-2.0f * x * x);
1623 #else
1624                 float w = expf(-2.0f * x * x);
1625 #endif
1626                 weights[nsamples - i - 1] = weights[i] = w;
1627             }
1628             float sumw = 0.0f;
1629             for (int i = 0; i < nsamples; ++i)
1630                 sumw += weights[i];
1631             for (int i = 0; i < nsamples; ++i)
1632                 weights[i] /= sumw;
1633         }
1634     }
1635     return nsamples;
1636 }
1637 
1638 
1639 
1640 bool
texture_lookup(TextureFile & texturefile,PerThreadInfo * thread_info,TextureOpt & options,int nchannels_result,int actualchannels,float s,float t,float dsdx,float dtdx,float dsdy,float dtdy,float * result,float * dresultds,float * dresultdt)1641 TextureSystemImpl::texture_lookup(TextureFile& texturefile,
1642                                   PerThreadInfo* thread_info,
1643                                   TextureOpt& options, int nchannels_result,
1644                                   int actualchannels, float s, float t,
1645                                   float dsdx, float dtdx, float dsdy,
1646                                   float dtdy, float* result, float* dresultds,
1647                                   float* dresultdt)
1648 {
1649     OIIO_DASSERT((dresultds == NULL) == (dresultdt == NULL));
1650 
1651     // Compute the natural resolution we want for the bare derivs, this
1652     // will be the threshold for knowing we're maxifying (and therefore
1653     // wanting cubic interpolation).
1654     float sfilt_noblur = std::max(std::max(fabsf(dsdx), fabsf(dsdy)), 1e-8f);
1655     float tfilt_noblur = std::max(std::max(fabsf(dtdx), fabsf(dtdy)), 1e-8f);
1656     int naturalsres    = (int)(1.0f / sfilt_noblur);
1657     int naturaltres    = (int)(1.0f / tfilt_noblur);
1658 
1659     // Scale by 'width'
1660     adjust_width(dsdx, dtdx, dsdy, dtdy, options.swidth, options.twidth);
1661 
1662     // Determine the MIP-map level(s) we need: we will blend
1663     //    data(miplevel[0]) * (1-levelblend) + data(miplevel[1]) * levelblend
1664     float smajor, tmajor;
1665     float majorlength, minorlength;
1666     float theta;
1667 
1668     // Do a bit more math and get the exact ellipse axis lengths, and
1669     // therefore a more accurate aspect ratio as well.  Looks much MUCH
1670     // better, but for scenes with lots of grazing angles, it can greatly
1671     // increase the average anisotropy, therefore the number of bilinear
1672     // or bicubic texture probes, and therefore runtime!
1673     ellipse_axes(dsdx, dtdx, dsdy, dtdy, majorlength, minorlength, theta);
1674 
1675     adjust_blur(majorlength, minorlength, theta, options.sblur, options.tblur);
1676 
1677     float aspect, trueaspect;
1678     aspect = anisotropic_aspect(majorlength, minorlength, options, trueaspect);
1679 
1680     int miplevel[2]      = { -1, -1 };
1681     float levelweight[2] = { 0, 0 };
1682     compute_miplevels(texturefile, options, majorlength, minorlength, aspect,
1683                       miplevel, levelweight);
1684 
1685     float* lineweight
1686         = OIIO_ALLOCA(float,
1687                       round_to_multiple_of_pow2(2 * options.anisotropic, 4));
1688     float invsamples;
1689     int nsamples = compute_ellipse_sampling(aspect, theta, majorlength,
1690                                             minorlength, smajor, tmajor,
1691                                             invsamples, lineweight);
1692     // All the computations were done assuming full diametric axes of
1693     // the ellipse, but our derivatives are pixel-to-pixel, yielding
1694     // semi-major and semi-minor lengths, so we need to scale everything
1695     // by 1/2.
1696     smajor *= 0.5f;
1697     tmajor *= 0.5f;
1698 
1699     bool ok           = true;
1700     int npointson     = 0;
1701     int closestprobes = 0, bilinearprobes = 0, bicubicprobes = 0;
1702     int nsamples_padded = round_to_multiple_of_pow2(nsamples, 4);
1703     float* sval         = OIIO_ALLOCA(float, nsamples_padded);
1704     float* tval         = OIIO_ALLOCA(float, nsamples_padded);
1705 
1706     // Compute the s and t positions of the samples along the major axis.
1707 #if OIIO_SIMD
1708     // Do the computations in batches of 4, with SIMD ops.
1709     static OIIO_SIMD4_ALIGN float iota_start[4] = { 0.5f, 1.5f, 2.5f, 3.5f };
1710     vfloat4 iota                                = *(const vfloat4*)iota_start;
1711     for (int sample = 0; sample < nsamples; sample += 4) {
1712         vfloat4 pos = 2.0f * (iota * invsamples - 0.5f);
1713         vfloat4 ss  = s + pos * smajor;
1714         vfloat4 tt  = t + pos * tmajor;
1715         ss.store(sval + sample);
1716         tt.store(tval + sample);
1717         iota += 4.0f;
1718     }
1719 #else
1720     // Non-SIMD, reference code
1721     for (int sample = 0; sample < nsamples; ++sample) {
1722         float pos = 2.0f * ((sample + 0.5f) * invsamples - 0.5f);
1723         sval[sample] = s + pos * smajor;
1724         tval[sample] = t + pos * tmajor;
1725     }
1726 #endif
1727 
1728     vfloat4 r_sum, drds_sum, drdt_sum;
1729     r_sum.clear();
1730     if (dresultds) {
1731         drds_sum.clear();
1732         drdt_sum.clear();
1733     }
1734     for (int level = 0; level < 2; ++level) {
1735         if (!levelweight[level])  // No contribution from this level, skip it
1736             continue;
1737         ++npointson;
1738         vfloat4 r, drds, drdt;
1739         int lev = miplevel[level];
1740         switch (options.interpmode) {
1741         case TextureOpt::InterpClosest:
1742             ok &= sample_closest(nsamples, sval, tval, lev, texturefile,
1743                                  thread_info, options, nchannels_result,
1744                                  actualchannels, lineweight, &r, NULL, NULL);
1745             ++closestprobes;
1746             break;
1747         case TextureOpt::InterpBilinear:
1748             ok &= sample_bilinear(nsamples, sval, tval, lev, texturefile,
1749                                   thread_info, options, nchannels_result,
1750                                   actualchannels, lineweight, &r,
1751                                   dresultds ? &drds : NULL,
1752                                   dresultds ? &drdt : NULL);
1753             ++bilinearprobes;
1754             break;
1755         case TextureOpt::InterpBicubic:
1756             ok &= sample_bicubic(nsamples, sval, tval, lev, texturefile,
1757                                  thread_info, options, nchannels_result,
1758                                  actualchannels, lineweight, &r,
1759                                  dresultds ? &drds : NULL,
1760                                  dresultds ? &drdt : NULL);
1761             ++bicubicprobes;
1762             break;
1763         case TextureOpt::InterpSmartBicubic:
1764             if (lev == 0
1765                 || (texturefile.spec(options.subimage, lev).width
1766                     < naturalsres / 2)
1767                 || (texturefile.spec(options.subimage, lev).height
1768                     < naturaltres / 2)) {
1769                 ok &= sample_bicubic(nsamples, sval, tval, lev, texturefile,
1770                                      thread_info, options, nchannels_result,
1771                                      actualchannels, lineweight, &r,
1772                                      dresultds ? &drds : NULL,
1773                                      dresultds ? &drdt : NULL);
1774                 ++bicubicprobes;
1775             } else {
1776                 ok &= sample_bilinear(nsamples, sval, tval, lev, texturefile,
1777                                       thread_info, options, nchannels_result,
1778                                       actualchannels, lineweight, &r,
1779                                       dresultds ? &drds : NULL,
1780                                       dresultds ? &drdt : NULL);
1781                 ++bilinearprobes;
1782             }
1783             break;
1784         }
1785 
1786         vfloat4 lw = levelweight[level];
1787         r_sum += lw * r;
1788         if (dresultds) {
1789             drds_sum += lw * drds;
1790             drdt_sum += lw * drdt;
1791         }
1792     }
1793 
1794     *(simd::vfloat4*)(result) = r_sum;
1795     if (dresultds) {
1796         *(simd::vfloat4*)(dresultds) = drds_sum;
1797         *(simd::vfloat4*)(dresultdt) = drdt_sum;
1798     }
1799 
1800     // Update stats
1801     ImageCacheStatistics& stats(thread_info->m_stats);
1802     stats.aniso_queries += npointson;
1803     stats.aniso_probes += npointson * nsamples;
1804     if (trueaspect > stats.max_aniso)
1805         stats.max_aniso = trueaspect;  // FIXME?
1806     stats.closest_interps += closestprobes * nsamples;
1807     stats.bilinear_interps += bilinearprobes * nsamples;
1808     stats.cubic_interps += bicubicprobes * nsamples;
1809 
1810     return ok;
1811 }
1812 
1813 
1814 
1815 const float*
pole_color(TextureFile & texturefile,PerThreadInfo *,const ImageCacheFile::LevelInfo & levelinfo,TileRef & tile,int subimage,int,int pole)1816 TextureSystemImpl::pole_color(TextureFile& texturefile,
1817                               PerThreadInfo* /*thread_info*/,
1818                               const ImageCacheFile::LevelInfo& levelinfo,
1819                               TileRef& tile, int subimage, int /*miplevel*/,
1820                               int pole)
1821 {
1822     if (!levelinfo.onetile)
1823         return NULL;  // Only compute color for one-tile MIP levels
1824     const ImageSpec& spec(levelinfo.spec);
1825     if (!levelinfo.polecolorcomputed) {
1826         static spin_mutex mutex;  // Protect everybody's polecolor
1827         spin_lock lock(mutex);
1828         if (!levelinfo.polecolorcomputed) {
1829             OIIO_DASSERT(levelinfo.polecolor.size() == 0);
1830             levelinfo.polecolor.resize(2 * spec.nchannels);
1831             OIIO_DASSERT(tile->id().nchannels() == spec.nchannels
1832                          && "pole_color doesn't work for channel subsets");
1833             int pixelsize                = tile->pixelsize();
1834             TypeDesc::BASETYPE pixeltype = texturefile.pixeltype(subimage);
1835             // We store north and south poles adjacently in polecolor
1836             float* p    = &(levelinfo.polecolor[0]);
1837             int width   = spec.width;
1838             float scale = 1.0f / width;
1839             for (int pole = 0; pole <= 1; ++pole, p += spec.nchannels) {
1840                 int y = pole * (spec.height - 1);  // 0 or height-1
1841                 for (int c = 0; c < spec.nchannels; ++c)
1842                     p[c] = 0.0f;
1843                 const unsigned char* texel = tile->bytedata()
1844                                              + y * spec.tile_width * pixelsize;
1845                 for (int i = 0; i < width; ++i, texel += pixelsize)
1846                     for (int c = 0; c < spec.nchannels; ++c) {
1847                         if (pixeltype == TypeDesc::UINT8)
1848                             p[c] += uchar2float(texel[c]);
1849                         else if (pixeltype == TypeDesc::UINT16)
1850                             p[c] += convert_type<uint16_t, float>(
1851                                 ((const uint16_t*)texel)[c]);
1852                         else if (pixeltype == TypeDesc::HALF)
1853                             p[c] += ((const half*)texel)[c];
1854                         else {
1855                             OIIO_DASSERT(pixeltype == TypeDesc::FLOAT);
1856                             p[c] += ((const float*)texel)[c];
1857                         }
1858                     }
1859                 for (int c = 0; c < spec.nchannels; ++c)
1860                     p[c] *= scale;
1861             }
1862             levelinfo.polecolorcomputed = true;
1863         }
1864     }
1865     return &(levelinfo.polecolor[pole * spec.nchannels]);
1866 }
1867 
1868 
1869 
1870 void
fade_to_pole(float t,float * accum,float & weight,TextureFile & texturefile,PerThreadInfo * thread_info,const ImageCacheFile::LevelInfo & levelinfo,TextureOpt & options,int miplevel,int nchannels)1871 TextureSystemImpl::fade_to_pole(float t, float* accum, float& weight,
1872                                 TextureFile& texturefile,
1873                                 PerThreadInfo* thread_info,
1874                                 const ImageCacheFile::LevelInfo& levelinfo,
1875                                 TextureOpt& options, int miplevel,
1876                                 int nchannels)
1877 {
1878     // N.B. We want to fade to pole colors right at texture
1879     // boundaries t==0 and t==height, but at the very top of this
1880     // function, we subtracted another 0.5 from t, so we need to
1881     // undo that here.
1882     float pole;
1883     const float* polecolor;
1884     if (t < 1.0f) {
1885         pole      = (1.0f - t);
1886         polecolor = pole_color(texturefile, thread_info, levelinfo,
1887                                thread_info->tile, options.subimage, miplevel,
1888                                0);
1889     } else {
1890         pole      = t - floorf(t);
1891         polecolor = pole_color(texturefile, thread_info, levelinfo,
1892                                thread_info->tile, options.subimage, miplevel,
1893                                1);
1894     }
1895     pole = Imath::clamp(pole, 0.0f, 1.0f);
1896     pole *= pole;  // squaring makes more pleasing appearance
1897     polecolor += options.firstchannel;
1898     for (int c = 0; c < nchannels; ++c)
1899         accum[c] += weight * pole * polecolor[c];
1900     weight *= 1.0f - pole;
1901 }
1902 
1903 
1904 
1905 bool
sample_closest(int nsamples,const float * s_,const float * t_,int miplevel,TextureFile & texturefile,PerThreadInfo * thread_info,TextureOpt & options,int nchannels_result,int actualchannels,const float * weight_,vfloat4 * accum_,vfloat4 * daccumds_,vfloat4 * daccumdt_)1906 TextureSystemImpl::sample_closest(
1907     int nsamples, const float* s_, const float* t_, int miplevel,
1908     TextureFile& texturefile, PerThreadInfo* thread_info, TextureOpt& options,
1909     int nchannels_result, int actualchannels, const float* weight_,
1910     vfloat4* accum_, vfloat4* daccumds_, vfloat4* daccumdt_)
1911 {
1912     bool allok = true;
1913     const ImageSpec& spec(texturefile.spec(options.subimage, miplevel));
1914     const ImageCacheFile::LevelInfo& levelinfo(
1915         texturefile.levelinfo(options.subimage, miplevel));
1916     TypeDesc::BASETYPE pixeltype = texturefile.pixeltype(options.subimage);
1917     wrap_impl swrap_func         = wrap_functions[(int)options.swrap];
1918     wrap_impl twrap_func         = wrap_functions[(int)options.twrap];
1919     vfloat4 accum;
1920     accum.clear();
1921     float nonfill    = 0.0f;
1922     int firstchannel = options.firstchannel;
1923     int tile_chbegin = 0, tile_chend = spec.nchannels;
1924     if (spec.nchannels > m_max_tile_channels) {
1925         // For files with many channels, narrow the range we cache
1926         tile_chbegin = options.firstchannel;
1927         tile_chend   = options.firstchannel + actualchannels;
1928     }
1929     TileID id(texturefile, options.subimage, miplevel, 0, 0, 0, tile_chbegin,
1930               tile_chend);
1931     for (int sample = 0; sample < nsamples; ++sample) {
1932         float s = s_[sample], t = t_[sample];
1933         float weight = weight_[sample];
1934 
1935         int stex, ttex;  // Texel coordinates
1936         float sfrac, tfrac;
1937         st_to_texel(s, t, texturefile, spec, stex, ttex, sfrac, tfrac);
1938 
1939         if (sfrac > 0.5f)
1940             ++stex;
1941         if (tfrac > 0.5f)
1942             ++ttex;
1943 
1944         // Wrap
1945         bool svalid, tvalid;  // Valid texels?  false means black border
1946         svalid = swrap_func(stex, spec.x, spec.width);
1947         tvalid = twrap_func(ttex, spec.y, spec.height);
1948         if (!levelinfo.full_pixel_range) {
1949             svalid &= (stex >= spec.x
1950                        && stex < (spec.x + spec.width));  // data window
1951             tvalid &= (ttex >= spec.y && ttex < (spec.y + spec.height));
1952         }
1953         if (!(svalid & tvalid)) {
1954             // All texels we need were out of range and using 'black' wrap.
1955             nonfill += weight;
1956             continue;
1957         }
1958 
1959         int tile_s = (stex - spec.x) % spec.tile_width;
1960         int tile_t = (ttex - spec.y) % spec.tile_height;
1961         id.xy(stex - tile_s, ttex - tile_t);
1962         bool ok = find_tile(id, thread_info, sample == 0);
1963         if (!ok)
1964             error("{}", m_imagecache->geterror());
1965         TileRef& tile(thread_info->tile);
1966         if (!tile || !ok) {
1967             allok = false;
1968             continue;
1969         }
1970         int offset = id.nchannels() * (tile_t * spec.tile_width + tile_s)
1971                      + (firstchannel - id.chbegin());
1972         OIIO_DASSERT((size_t)offset < spec.nchannels * spec.tile_pixels());
1973         simd::vfloat4 texel_simd;
1974         if (pixeltype == TypeDesc::UINT8) {
1975             // special case for 8-bit tiles
1976             texel_simd = uchar2float4(tile->bytedata() + offset);
1977         } else if (pixeltype == TypeDesc::UINT16) {
1978             texel_simd = ushort2float4(tile->ushortdata() + offset);
1979         } else if (pixeltype == TypeDesc::HALF) {
1980             texel_simd = half2float4(tile->halfdata() + offset);
1981         } else {
1982             OIIO_DASSERT(pixeltype == TypeDesc::FLOAT);
1983             texel_simd.load(tile->floatdata() + offset);
1984         }
1985 
1986         accum += weight * texel_simd;
1987     }
1988     simd::vbool4 channel_mask = channel_masks[actualchannels];
1989     accum                     = blend0(accum, channel_mask);
1990     if (nonfill < 1.0f && nchannels_result > actualchannels && options.fill) {
1991         // Add the weighted fill color
1992         accum += blend0not(vfloat4((1.0f - nonfill) * options.fill),
1993                            channel_mask);
1994     }
1995 
1996     *accum_ = accum;
1997     if (daccumds_) {
1998         daccumds_->clear();  // constant interp has 0 derivatives
1999         daccumdt_->clear();
2000     }
2001     return allok;
2002 }
2003 
2004 
2005 
2006 /// Convert texture coordinates (s,t), which range on 0-1 for the "full"
2007 /// image boundary, to texel coordinates (i+ifrac,j+jfrac) where (i,j) is
2008 /// the texel to the immediate upper left of the sample position, and ifrac
2009 /// and jfrac are the fractional (0-1) portion of the way to the next texel
2010 /// to the right or down, respectively.  Do this for 4 s,t values at a time.
2011 inline void
st_to_texel_simd(const vfloat4 & s_,const vfloat4 & t_,TextureSystemImpl::TextureFile & texturefile,const ImageSpec & spec,vint4 & i,vint4 & j,vfloat4 & ifrac,vfloat4 & jfrac)2012 st_to_texel_simd(const vfloat4& s_, const vfloat4& t_,
2013                  TextureSystemImpl::TextureFile& texturefile,
2014                  const ImageSpec& spec, vint4& i, vint4& j, vfloat4& ifrac,
2015                  vfloat4& jfrac)
2016 {
2017     vfloat4 s, t;
2018     // As passed in, (s,t) map the texture to (0,1).  Remap to texel coords.
2019     // Note that we have two modes, depending on the m_sample_border.
2020     if (texturefile.sample_border() == 0) {
2021         // texel samples are at 0.5/res, 1.5/res, ..., (res-0.5)/res,
2022         s = s_ * float(spec.width) + (spec.x - 0.5f);
2023         t = t_ * float(spec.height) + (spec.y - 0.5f);
2024     } else {
2025         // first and last rows/columns are *exactly* on the boundary,
2026         // so samples are at 0, 1/(res-1), ..., 1.
2027         s = s_ * float(spec.width - 1) + float(spec.x);
2028         t = t_ * float(spec.height - 1) + float(spec.y);
2029     }
2030     ifrac = floorfrac(s, &i);
2031     jfrac = floorfrac(t, &j);
2032     // Now (i,j) are the integer coordinates of the texel to the
2033     // immediate "upper left" of the lookup point, and (ifrac,jfrac) are
2034     // the amount that the lookup point is actually offset from the
2035     // texel center (with (1,1) being all the way to the next texel down
2036     // and to the right).
2037 }
2038 
2039 
2040 
2041 bool
sample_bilinear(int nsamples,const float * s_,const float * t_,int miplevel,TextureFile & texturefile,PerThreadInfo * thread_info,TextureOpt & options,int nchannels_result,int actualchannels,const float * weight_,vfloat4 * accum_,vfloat4 * daccumds_,vfloat4 * daccumdt_)2042 TextureSystemImpl::sample_bilinear(
2043     int nsamples, const float* s_, const float* t_, int miplevel,
2044     TextureFile& texturefile, PerThreadInfo* thread_info, TextureOpt& options,
2045     int nchannels_result, int actualchannels, const float* weight_,
2046     vfloat4* accum_, vfloat4* daccumds_, vfloat4* daccumdt_)
2047 {
2048     const ImageSpec& spec(texturefile.spec(options.subimage, miplevel));
2049     const ImageCacheFile::LevelInfo& levelinfo(
2050         texturefile.levelinfo(options.subimage, miplevel));
2051     TypeDesc::BASETYPE pixeltype = texturefile.pixeltype(options.subimage);
2052     wrap_impl swrap_func         = wrap_functions[(int)options.swrap];
2053     wrap_impl twrap_func         = wrap_functions[(int)options.twrap];
2054     wrap_impl_simd wrap_func     = (swrap_func == twrap_func)
2055                                        ? wrap_functions_simd[(int)options.swrap]
2056                                        : NULL;
2057     simd::vint4 xy(spec.x, spec.y);
2058     simd::vint4 widthheight(spec.width, spec.height);
2059     simd::vint4 tilewh(spec.tile_width, spec.tile_height);
2060     simd::vint4 tilewhmask = tilewh - 1;
2061     bool use_fill      = (nchannels_result > actualchannels && options.fill);
2062     bool tilepow2      = ispow2(spec.tile_width) && ispow2(spec.tile_height);
2063     size_t channelsize = texturefile.channelsize(options.subimage);
2064     int firstchannel   = options.firstchannel;
2065     int tile_chbegin = 0, tile_chend = spec.nchannels;
2066     // need_pole: do we potentially need to fade to special pole color?
2067     // If we do, can't restrict channel range or fade_to_pole won't work.
2068     bool need_pole = (options.envlayout == LayoutLatLong && levelinfo.onetile);
2069     if (spec.nchannels > m_max_tile_channels && !need_pole) {
2070         // For files with many channels, narrow the range we cache
2071         tile_chbegin = options.firstchannel;
2072         tile_chend   = options.firstchannel + actualchannels;
2073     }
2074     TileID id(texturefile, options.subimage, miplevel, 0, 0, 0, tile_chbegin,
2075               tile_chend);
2076     float nonfill = 0.0f;  // The degree to which we DON'T need fill
2077     // N.B. What's up with "nofill"? We need to consider fill only when we
2078     // are inside the valid texture region. Outside, i.e. in the black wrap
2079     // region, black takes precedence over fill. By keeping track of when
2080     // we don't need to worry about fill, which is the comparatively rare
2081     // case, we do a lot less math and have fewer rounding errors.
2082 
2083     vfloat4 accum, daccumds, daccumdt;
2084     accum.clear();
2085     if (daccumds_) {
2086         daccumds.clear();
2087         daccumdt.clear();
2088     }
2089     vfloat4 s_simd, t_simd;
2090     vint4 sint_simd, tint_simd;
2091     vfloat4 sfrac_simd, tfrac_simd;
2092     for (int sample = 0; sample < nsamples; ++sample) {
2093         // To utilize SIMD ops in an inherently scalar loop, every fourth
2094         // step, we compute the st_to_texel for the next four samples.
2095         int sample4 = sample & 3;
2096         if (sample4 == 0) {
2097             s_simd.load(s_ + sample);
2098             t_simd.load(t_ + sample);
2099             st_to_texel_simd(s_simd, t_simd, texturefile, spec, sint_simd,
2100                              tint_simd, sfrac_simd, tfrac_simd);
2101         }
2102         int sint = sint_simd[sample4], tint = tint_simd[sample4];
2103         float sfrac = sfrac_simd[sample4], tfrac = tfrac_simd[sample4];
2104         float weight = weight_[sample];
2105 
2106         // SIMD-ize the indices. We have four texels, fit them into one SIMD
2107         // 4-vector as S0,S1,T0,T1.
2108         enum { S0 = 0, S1 = 1, T0 = 2, T1 = 3 };
2109 
2110         simd::vint4 sttex(sint, sint + 1, tint,
2111                           tint + 1);  // Texel coords: s0,s1,t0,t1
2112         simd::vbool4 stvalid;
2113         if (wrap_func) {
2114             // Both directions use the same wrap function, call in parallel.
2115             stvalid = wrap_func(sttex, xy, widthheight);
2116         } else {
2117             stvalid.load(swrap_func(sttex[S0], spec.x, spec.width),
2118                          swrap_func(sttex[S1], spec.x, spec.width),
2119                          twrap_func(sttex[T0], spec.y, spec.height),
2120                          twrap_func(sttex[T1], spec.y, spec.height));
2121         }
2122 
2123         // Account for crop windows
2124         if (!levelinfo.full_pixel_range) {
2125             stvalid &= (sttex >= xy) & (sttex < (xy + widthheight));
2126         }
2127         if (none(stvalid)) {
2128             nonfill += weight;
2129             continue;  // All texels we need were out of range and using 'black' wrap
2130         }
2131 
2132         simd::vfloat4 texel_simd[2][2];
2133         simd::vint4 tile_st = (simd::vint4(simd::shuffle<S0, S0, T0, T0>(sttex))
2134                                - xy);
2135         if (tilepow2)
2136             tile_st &= tilewhmask;
2137         else
2138             tile_st %= tilewh;
2139         bool s_onetile = (tile_st[S0] != tilewhmask[S0])
2140                          & (sttex[S0] + 1 == sttex[S1]);
2141         bool t_onetile = (tile_st[T0] != tilewhmask[T0])
2142                          & (sttex[T0] + 1 == sttex[T1]);
2143         bool onetile = (s_onetile & t_onetile);
2144         if (onetile && all(stvalid)) {
2145             // Shortcut if all the texels we need are on the same tile
2146             id.xy(sttex[S0] - tile_st[S0], sttex[T0] - tile_st[T0]);
2147             bool ok = find_tile(id, thread_info, sample == 0);
2148             if (!ok)
2149                 error("{}", m_imagecache->geterror());
2150             TileRef& tile(thread_info->tile);
2151             if (!tile->valid())
2152                 return false;
2153             int pixelsize = tile->pixelsize();
2154             int offset    = pixelsize
2155                          * (tile_st[T0] * spec.tile_width + tile_st[S0]);
2156             const unsigned char* p = tile->bytedata() + offset
2157                                      + channelsize
2158                                            * (firstchannel - id.chbegin());
2159             if (pixeltype == TypeDesc::UINT8) {
2160                 texel_simd[0][0] = uchar2float4(p);
2161                 texel_simd[0][1] = uchar2float4(p + pixelsize);
2162                 p += pixelsize * spec.tile_width;
2163                 texel_simd[1][0] = uchar2float4(p);
2164                 texel_simd[1][1] = uchar2float4(p + pixelsize);
2165             } else if (pixeltype == TypeDesc::UINT16) {
2166                 texel_simd[0][0] = ushort2float4((uint16_t*)p);
2167                 texel_simd[0][1] = ushort2float4((uint16_t*)(p + pixelsize));
2168                 p += pixelsize * spec.tile_width;
2169                 texel_simd[1][0] = ushort2float4((uint16_t*)p);
2170                 texel_simd[1][1] = ushort2float4((uint16_t*)(p + pixelsize));
2171             } else if (pixeltype == TypeDesc::HALF) {
2172                 texel_simd[0][0] = half2float4((half*)p);
2173                 texel_simd[0][1] = half2float4((half*)(p + pixelsize));
2174                 p += pixelsize * spec.tile_width;
2175                 texel_simd[1][0] = half2float4((half*)p);
2176                 texel_simd[1][1] = half2float4((half*)(p + pixelsize));
2177             } else {
2178                 OIIO_DASSERT(pixeltype == TypeDesc::FLOAT);
2179                 texel_simd[0][0].load((const float*)p);
2180                 texel_simd[0][1].load((const float*)(p + pixelsize));
2181                 p += pixelsize * spec.tile_width;
2182                 texel_simd[1][0].load((const float*)p);
2183                 texel_simd[1][1].load((const float*)(p + pixelsize));
2184             }
2185         } else {
2186             bool noreusetile      = (options.swrap == TextureOpt::WrapMirror);
2187             simd::vint4 tile_st   = (sttex - xy) % tilewh;
2188             simd::vint4 tile_edge = sttex - tile_st;
2189             for (int j = 0; j < 2; ++j) {
2190                 if (!stvalid[T0 + j]) {
2191                     texel_simd[j][0].clear();
2192                     texel_simd[j][1].clear();
2193                     continue;
2194                 }
2195                 int tile_t = tile_st[T0 + j];
2196                 for (int i = 0; i < 2; ++i) {
2197                     if (!stvalid[S0 + i]) {
2198                         texel_simd[j][i].clear();
2199                         continue;
2200                     }
2201                     int tile_s = tile_st[S0 + i];
2202                     // Trick: we only need to find a tile if i == 0 or if we
2203                     // just crossed a tile bouncary (if tile_s == 0).
2204                     // Otherwise, we are still on the same tile as the last
2205                     // iteration, as long as we aren't using mirror wrap mode!
2206                     if (i == 0 || tile_s == 0 || noreusetile) {
2207                         id.xy(tile_edge[S0 + i], tile_edge[T0 + j]);
2208                         bool ok = find_tile(id, thread_info, sample == 0);
2209                         if (!ok)
2210                             error("{}", m_imagecache->geterror());
2211                         if (!thread_info->tile->valid()) {
2212                             return false;
2213                         }
2214                         OIIO_DASSERT(thread_info->tile->id() == id);
2215                     }
2216                     TileRef& tile(thread_info->tile);
2217                     int pixelsize = tile->pixelsize();
2218                     int offset    = pixelsize
2219                                  * (tile_t * spec.tile_width + tile_s);
2220                     offset += (firstchannel - id.chbegin()) * channelsize;
2221                     OIIO_DASSERT(offset < spec.tile_width * spec.tile_height
2222                                               * spec.tile_depth * pixelsize);
2223                     if (pixeltype == TypeDesc::UINT8)
2224                         texel_simd[j][i] = uchar2float4(
2225                             (const unsigned char*)(tile->bytedata() + offset));
2226                     else if (pixeltype == TypeDesc::UINT16)
2227                         texel_simd[j][i] = ushort2float4(
2228                             (const unsigned short*)(tile->bytedata() + offset));
2229                     else if (pixeltype == TypeDesc::HALF)
2230                         texel_simd[j][i] = half2float4(
2231                             (const half*)(tile->bytedata() + offset));
2232                     else {
2233                         OIIO_DASSERT(pixeltype == TypeDesc::FLOAT);
2234                         texel_simd[j][i].load(
2235                             (const float*)(tile->bytedata() + offset));
2236                     }
2237                 }
2238             }
2239         }
2240 
2241         // When we're on the lowest res mipmap levels, it's more pleasing if
2242         // we converge to a single pole color right at the pole.  Fade to
2243         // the average color over the texel height right next to the pole.
2244         if (need_pole) {
2245             float height = spec.height;
2246             if (texturefile.m_sample_border)
2247                 height -= 1.0f;
2248             float tt = t_[sample] * height;
2249             if (tt < 1.0f || tt > (height - 1.0f))
2250                 fade_to_pole(tt, (float*)&accum, weight, texturefile,
2251                              thread_info, levelinfo, options, miplevel,
2252                              actualchannels);
2253         }
2254 
2255         simd::vfloat4 weight_simd = weight;
2256         accum += weight_simd
2257                  * bilerp(texel_simd[0][0], texel_simd[0][1], texel_simd[1][0],
2258                           texel_simd[1][1], sfrac, tfrac);
2259         if (daccumds_) {
2260             simd::vfloat4 scalex = weight_simd * float(spec.width);
2261             simd::vfloat4 scaley = weight_simd * float(spec.height);
2262             daccumds += scalex
2263                         * lerp(texel_simd[0][1] - texel_simd[0][0],
2264                                texel_simd[1][1] - texel_simd[1][0], tfrac);
2265             daccumdt += scaley
2266                         * lerp(texel_simd[1][0] - texel_simd[0][0],
2267                                texel_simd[1][1] - texel_simd[0][1], sfrac);
2268         }
2269         if (use_fill && !all(stvalid)) {
2270             // Compute appropriate amount of "fill" color to extra channels in
2271             // non-"black"-wrapped regions.
2272             float f = bilerp(float(stvalid[S0] * stvalid[T0]),
2273                              float(stvalid[S1] * stvalid[T0]),
2274                              float(stvalid[S0] * stvalid[T1]),
2275                              float(stvalid[S1] * stvalid[T1]), sfrac, tfrac);
2276             nonfill += (1.0f - f) * weight;
2277         }
2278     }
2279 
2280     simd::vbool4 channel_mask = channel_masks[actualchannels];
2281     accum                     = blend0(accum, channel_mask);
2282     if (use_fill) {
2283         // Add the weighted fill color
2284         accum += blend0not(vfloat4((1.0f - nonfill) * options.fill),
2285                            channel_mask);
2286     }
2287 
2288     *accum_ = accum;
2289     if (daccumds_) {
2290         *daccumds_ = blend0(daccumds, channel_mask);
2291         *daccumdt_ = blend0(daccumdt, channel_mask);
2292     }
2293     return true;
2294 }
2295 
2296 
2297 namespace {
2298 
2299     // Evaluate Bspline weights for both value and derivatives (if dw is not
2300     // NULL) into w[0..3] and dw[0..3]. This is the canonical version for
2301     // reference, but we don't actually call it, instead favoring the much
2302     // harder to read SIMD versions below.
2303     template<typename T>
evalBSplineWeights_and_derivs(T * w,T fraction,T * dw=NULL)2304     inline void evalBSplineWeights_and_derivs(T* w, T fraction, T* dw = NULL)
2305     {
2306         T one_frac = 1.0 - fraction;
2307         w[0]       = T(1.0 / 6.0) * one_frac * one_frac * one_frac;
2308         w[1]       = T(2.0 / 3.0)
2309                - T(0.5) * fraction * fraction * (T(2.0) - fraction);
2310         w[2] = T(2.0 / 3.0)
2311                - T(0.5) * one_frac * one_frac * (T(2.0) - one_frac);
2312         w[3] = T(1.0 / 6.0) * fraction * fraction * fraction;
2313         if (dw) {
2314             dw[0] = T(-0.5) * one_frac * one_frac;
2315             dw[1] = T(0.5) * fraction * (T(3.0) * fraction - T(4.0));
2316             dw[2] = T(-0.5) * one_frac * (T(3.0) * one_frac - T(4.0));
2317             dw[3] = T(0.5) * fraction * fraction;
2318         }
2319     }
2320 
2321     // Evaluate the 4 Bspline weights (no derivs), returing them as a vfloat4.
2322     // The fraction also comes in as a vfloat4 (assuming the same value in all 4
2323     // slots).
evalBSplineWeights(const vfloat4 & fraction)2324     inline vfloat4 evalBSplineWeights(const vfloat4& fraction)
2325     {
2326 #if 0
2327     // Version that's easy to read and understand:
2328     float one_frac = 1.0f - fraction;
2329     vfloat4 w;
2330     w[0] = 0.0f          + (1.0f / 6.0f) * one_frac * one_frac * one_frac;
2331     w[1] = (2.0f / 3.0f) + (-0.5f)       * fraction * fraction * (2.0f - fraction);
2332     w[2] = (2.0f / 3.0f) + (-0.5f)       * one_frac * one_frac * (2.0f - one_frac);
2333     w[3] = 0.0f          + (1.0f / 6.0f) * fraction * fraction * fraction;
2334     return w;
2335 #else
2336         // Not as clear, but fastest version I've been able to achieve:
2337         OIIO_SIMD_FLOAT4_CONST4(A, 0.0f, 2.0f / 3.0f, 2.0f / 3.0f, 0.0f);
2338         OIIO_SIMD_FLOAT4_CONST4(B, 1.0f / 6.0f, -0.5f, -0.5f, 1.0f / 6.0f);
2339         OIIO_SIMD_FLOAT4_CONST4(om1m1o, 1.0f, -1.0f, -1.0f, 1.0f);
2340         OIIO_SIMD_FLOAT4_CONST4(z22z, 0.0f, 2.0f, 2.0f, 0.0f);
2341         simd::vfloat4 one_frac = vfloat4::One() - fraction;
2342         simd::vfloat4 ofof = AxBxAyBy(one_frac,
2343                                       fraction);  // 1-frac, frac, 1-frac, frac
2344         simd::vfloat4 C = (*(vfloat4*)&om1m1o) * ofof + (*(vfloat4*)&z22z);
2345         return (*(vfloat4*)&A) + (*(vfloat4*)&B) * ofof * ofof * C;
2346 #endif
2347     }
2348 
2349     // Evaluate Bspline weights for both value and derivatives (if dw is not
2350     // NULL), returning the 4 coefficients for each as vfloat4's.
evalBSplineWeights_and_derivs(simd::vfloat4 * w,float fraction,simd::vfloat4 * dw=NULL)2351     inline void evalBSplineWeights_and_derivs(simd::vfloat4* w, float fraction,
2352                                               simd::vfloat4* dw = NULL)
2353     {
2354 #if 0
2355     // Version that's easy to read and understand:
2356     float one_frac = 1.0f - fraction;
2357     (*w)[0] = 0.0f          + (1.0f / 6.0f) * one_frac * one_frac * one_frac;
2358     (*w)[1] = (2.0f / 3.0f) + (-0.5f)       * fraction * fraction * (2.0f - fraction);
2359     (*w)[2] = (2.0f / 3.0f) + (-0.5f)       * one_frac * one_frac * (2.0f - one_frac);
2360     (*w)[3] = 0.0f          + (1.0f / 6.0f) * fraction * fraction * fraction;
2361     if (dw) {
2362         (*dw)[0] = -0.5f * one_frac * (1.0f * one_frac - 0.0f);
2363         (*dw)[1] =  0.5f * fraction * (3.0f * fraction - 4.0f);
2364         (*dw)[2] = -0.5f * one_frac * (3.0f * one_frac - 4.0f);
2365         (*dw)[3] =  0.5f * fraction * (1.0f * fraction - 0.0f);
2366     }
2367 #else
2368         // Not as clear, but fastest version I've been able to achieve:
2369         OIIO_SIMD_FLOAT4_CONST4(A, 0.0f, 2.0f / 3.0f, 2.0f / 3.0f, 0.0f);
2370         OIIO_SIMD_FLOAT4_CONST4(B, 1.0f / 6.0f, -0.5f, -0.5f, 1.0f / 6.0f);
2371         float one_frac = 1.0f - fraction;
2372         simd::vfloat4 ofof(one_frac, fraction, one_frac, fraction);
2373         simd::vfloat4 C(one_frac, 2.0f - fraction, 2.0f - one_frac, fraction);
2374         *w = (*(vfloat4*)&A) + (*(vfloat4*)&B) * ofof * ofof * C;
2375         if (dw) {
2376             const simd::vfloat4 D(-0.5f, 0.5f, -0.5f, 0.5f);
2377             const simd::vfloat4 E(1.0f, 3.0f, 3.0f, 1.0f);
2378             const simd::vfloat4 F(0.0f, 4.0f, 4.0f, 0.0f);
2379             *dw = D * ofof * (E * ofof - F);
2380         }
2381 #endif
2382     }
2383 
2384 }  // anonymous namespace
2385 
2386 
2387 
2388 bool
sample_bicubic(int nsamples,const float * s_,const float * t_,int miplevel,TextureFile & texturefile,PerThreadInfo * thread_info,TextureOpt & options,int nchannels_result,int actualchannels,const float * weight_,vfloat4 * accum_,vfloat4 * daccumds_,vfloat4 * daccumdt_)2389 TextureSystemImpl::sample_bicubic(
2390     int nsamples, const float* s_, const float* t_, int miplevel,
2391     TextureFile& texturefile, PerThreadInfo* thread_info, TextureOpt& options,
2392     int nchannels_result, int actualchannels, const float* weight_,
2393     vfloat4* accum_, vfloat4* daccumds_, vfloat4* daccumdt_)
2394 {
2395     const ImageSpec& spec(texturefile.spec(options.subimage, miplevel));
2396     const ImageCacheFile::LevelInfo& levelinfo(
2397         texturefile.levelinfo(options.subimage, miplevel));
2398     TypeDesc::BASETYPE pixeltype   = texturefile.pixeltype(options.subimage);
2399     wrap_impl_simd swrap_func_simd = wrap_functions_simd[(int)options.swrap];
2400     wrap_impl_simd twrap_func_simd = wrap_functions_simd[(int)options.twrap];
2401 
2402     vint4 spec_x_simd(spec.x);
2403     vint4 spec_y_simd(spec.y);
2404     vint4 spec_width_simd(spec.width);
2405     vint4 spec_height_simd(spec.height);
2406     vint4 spec_x_plus_width_simd  = spec_x_simd + spec_width_simd;
2407     vint4 spec_y_plus_height_simd = spec_y_simd + spec_height_simd;
2408     bool use_fill      = (nchannels_result > actualchannels && options.fill);
2409     bool tilepow2      = ispow2(spec.tile_width) && ispow2(spec.tile_height);
2410     int tilewidthmask  = spec.tile_width - 1;  // e.g. 63
2411     int tileheightmask = spec.tile_height - 1;
2412     size_t channelsize = texturefile.channelsize(options.subimage);
2413     int firstchannel   = options.firstchannel;
2414     float nonfill      = 0.0f;  // The degree to which we DON'T need fill
2415     // N.B. What's up with "nofill"? We need to consider fill only when we
2416     // are inside the valid texture region. Outside, i.e. in the black wrap
2417     // region, black takes precedence over fill. By keeping track of when
2418     // we don't need to worry about fill, which is the comparatively rare
2419     // case, we do a lot less math and have fewer rounding errors.
2420 
2421     // need_pole: do we potentially need to fade to special pole color?
2422     // If we do, can't restrict channel range or fade_to_pole won't work.
2423     bool need_pole = (options.envlayout == LayoutLatLong && levelinfo.onetile);
2424     int tile_chbegin = 0, tile_chend = spec.nchannels;
2425     if (spec.nchannels > m_max_tile_channels) {
2426         // For files with many channels, narrow the range we cache
2427         tile_chbegin = options.firstchannel;
2428         tile_chend   = options.firstchannel + actualchannels;
2429     }
2430     TileID id(texturefile, options.subimage, miplevel, 0, 0, 0, tile_chbegin,
2431               tile_chend);
2432     size_t pixelsize                 = channelsize * id.nchannels();
2433     size_t firstchannel_offset_bytes = channelsize
2434                                        * (firstchannel - id.chbegin());
2435     vfloat4 accum, daccumds, daccumdt;
2436     accum.clear();
2437     if (daccumds_) {
2438         daccumds.clear();
2439         daccumdt.clear();
2440     }
2441 
2442     vfloat4 s_simd, t_simd;
2443     vint4 sint_simd, tint_simd;
2444     vfloat4 sfrac_simd, tfrac_simd;
2445     for (int sample = 0; sample < nsamples; ++sample) {
2446         // To utilize SIMD ops in an inherently scalar loop, every fourth
2447         // step, we compute the st_to_texel for the next four samples.
2448         int sample4 = sample & 3;
2449         if (sample4 == 0) {
2450             s_simd.load(s_ + sample);
2451             t_simd.load(t_ + sample);
2452             st_to_texel_simd(s_simd, t_simd, texturefile, spec, sint_simd,
2453                              tint_simd, sfrac_simd, tfrac_simd);
2454         }
2455         int sint = sint_simd[sample4], tint = tint_simd[sample4];
2456         float sfrac = sfrac_simd[sample4], tfrac = tfrac_simd[sample4];
2457         float weight = weight_[sample];
2458 
2459         // We're gathering 4x4 samples and 4x weights.  Indices: texels 0,
2460         // 1, 2, 3.  The sample lies between samples 1 and 2.
2461 
2462         static const OIIO_SIMD4_ALIGN int iota[4]   = { 0, 1, 2, 3 };
2463         static const OIIO_SIMD4_ALIGN int iota_1[4] = { -1, 0, 1, 2 };
2464         simd::vint4 stex, ttex;  // Texel coords for each row and column
2465         stex                = sint + (*(vint4*)iota_1);
2466         ttex                = tint + (*(vint4*)iota_1);
2467         simd::vbool4 svalid = swrap_func_simd(stex, spec_x_simd,
2468                                               spec_width_simd);
2469         simd::vbool4 tvalid = twrap_func_simd(ttex, spec_y_simd,
2470                                               spec_height_simd);
2471         bool allvalid       = reduce_and(svalid & tvalid);
2472         bool anyvalid       = reduce_or(svalid | tvalid);
2473         if (!levelinfo.full_pixel_range && anyvalid) {
2474             // Handle case of crop windows or overscan
2475             svalid &= (stex >= spec_x_simd) & (stex < spec_x_plus_width_simd);
2476             tvalid &= (ttex >= spec_y_simd) & (ttex < spec_y_plus_height_simd);
2477             allvalid = reduce_and(svalid & tvalid);
2478             anyvalid = reduce_or(svalid | tvalid);
2479         }
2480         if (!anyvalid) {
2481             // All texels we need were out of range and using 'black' wrap.
2482             nonfill += weight;
2483             continue;
2484         }
2485 
2486         simd::vfloat4 texel_simd[4][4];
2487         // int tile_s = (stex[0] - spec.x) % spec.tile_width;
2488         // int tile_t = (ttex[0] - spec.y) % spec.tile_height;
2489         int tile_s = (stex[0] - spec.x);
2490         int tile_t = (ttex[0] - spec.y);
2491         if (tilepow2) {
2492             tile_s &= tilewidthmask;
2493             tile_t &= tileheightmask;
2494         } else {
2495             tile_s %= spec.tile_width;
2496             tile_t %= spec.tile_height;
2497         }
2498         bool s_onetile = (tile_s <= tilewidthmask - 3);
2499         bool t_onetile = (tile_t <= tileheightmask - 3);
2500         if (s_onetile & t_onetile) {
2501             // If we thought it was one tile, realize that it isn't unless
2502             // it's ascending.
2503             s_onetile &= all(stex
2504                              == (simd::shuffle<0>(stex) + (*(vint4*)iota)));
2505             t_onetile &= all(ttex
2506                              == (simd::shuffle<0>(ttex) + (*(vint4*)iota)));
2507         }
2508         bool onetile = (s_onetile & t_onetile);
2509         if (onetile & allvalid) {
2510             // Shortcut if all the texels we need are on the same tile
2511             id.xy(stex[0] - tile_s, ttex[0] - tile_t);
2512             bool ok = find_tile(id, thread_info, sample == 0);
2513             if (!ok)
2514                 error("{}", m_imagecache->geterror());
2515             TileRef& tile(thread_info->tile);
2516             if (!tile) {
2517                 return false;
2518             }
2519             // N.B. thread_info->tile will keep holding a ref-counted pointer
2520             // to the tile for the duration that we're using the tile data.
2521             int offset = pixelsize * (tile_t * spec.tile_width + tile_s);
2522             const unsigned char* base = tile->bytedata() + offset
2523                                         + firstchannel_offset_bytes;
2524             OIIO_DASSERT(tile->data());
2525             if (pixeltype == TypeDesc::UINT8) {
2526                 for (int j = 0, j_offset = 0; j < 4;
2527                      ++j, j_offset += pixelsize * spec.tile_width)
2528                     for (int i = 0, i_offset = j_offset; i < 4;
2529                          ++i, i_offset += pixelsize)
2530                         texel_simd[j][i] = uchar2float4(base + i_offset);
2531             } else if (pixeltype == TypeDesc::UINT16) {
2532                 for (int j = 0, j_offset = 0; j < 4;
2533                      ++j, j_offset += pixelsize * spec.tile_width)
2534                     for (int i = 0, i_offset = j_offset; i < 4;
2535                          ++i, i_offset += pixelsize)
2536                         texel_simd[j][i] = ushort2float4(
2537                             (const uint16_t*)(base + i_offset));
2538             } else if (pixeltype == TypeDesc::HALF) {
2539                 for (int j = 0, j_offset = 0; j < 4;
2540                      ++j, j_offset += pixelsize * spec.tile_width)
2541                     for (int i = 0, i_offset = j_offset; i < 4;
2542                          ++i, i_offset += pixelsize)
2543                         texel_simd[j][i] = half2float4(
2544                             (const half*)(base + i_offset));
2545             } else {
2546                 for (int j = 0, j_offset = 0; j < 4;
2547                      ++j, j_offset += pixelsize * spec.tile_width)
2548                     for (int i = 0, i_offset = j_offset; i < 4;
2549                          ++i, i_offset += pixelsize)
2550                         texel_simd[j][i].load((const float*)(base + i_offset));
2551             }
2552         } else {
2553             simd::vint4 tile_s, tile_t;  // texel offset WITHIN its tile
2554             simd::vint4 tile_s_edge,
2555                 tile_t_edge;  // coordinate of the tile edge
2556             tile_s      = (stex - spec_x_simd) % spec.tile_width;
2557             tile_t      = (ttex - spec_y_simd) % spec.tile_height;
2558             tile_s_edge = stex - tile_s;
2559             tile_t_edge = ttex - tile_t;
2560             simd::vint4 column_offset_bytes = tile_s * pixelsize
2561                                               + firstchannel_offset_bytes;
2562             for (int j = 0; j < 4; ++j) {
2563                 if (!tvalid[j]) {
2564                     for (int i = 0; i < 4; ++i)
2565                         texel_simd[j][i].clear();
2566                     continue;
2567                 }
2568                 int row_offset_bytes = tile_t[j]
2569                                        * (spec.tile_width * pixelsize);
2570                 for (int i = 0; i < 4; ++i) {
2571                     if (!svalid[i]) {
2572                         texel_simd[j][i].clear();
2573                         continue;
2574                     }
2575                     // Trick: we only need to find a tile if i == 0 or if we
2576                     // just crossed a tile boundary (if tile_s[i] == 0).
2577                     // Otherwise, we are still on the same tile as the last
2578                     // iteration, as long as we aren't using mirror wrap mode!
2579                     if (i == 0 || tile_s[i] == 0
2580                         || options.swrap == TextureOpt::WrapMirror) {
2581                         id.xy(tile_s_edge[i], tile_t_edge[j]);
2582                         bool ok = find_tile(id, thread_info, sample == 0);
2583                         if (!ok)
2584                             error("{}", m_imagecache->geterror());
2585                         DASSERT(thread_info->tile->id() == id);
2586                         if (!thread_info->tile->valid())
2587                             return false;
2588                     }
2589                     TileRef& tile(thread_info->tile);
2590                     OIIO_DASSERT(tile->data());
2591                     int offset = row_offset_bytes + column_offset_bytes[i];
2592                     // const unsigned char *pixelptr = tile->bytedata() + offset[i];
2593                     if (pixeltype == TypeDesc::UINT8)
2594                         texel_simd[j][i] = uchar2float4(tile->bytedata()
2595                                                         + offset);
2596                     else if (pixeltype == TypeDesc::UINT16)
2597                         texel_simd[j][i] = ushort2float4(
2598                             (const uint16_t*)(tile->bytedata() + offset));
2599                     else if (pixeltype == TypeDesc::HALF)
2600                         texel_simd[j][i] = half2float4(
2601                             (const half*)(tile->bytedata() + offset));
2602                     else
2603                         texel_simd[j][i].load(
2604                             (const float*)(tile->bytedata() + offset));
2605                 }
2606             }
2607         }
2608 
2609         // When we're on the lowest res mipmap levels, it's more pleasing if
2610         // we converge to a single pole color right at the pole.  Fade to
2611         // the average color over the texel height right next to the pole.
2612         if (need_pole) {
2613             float height = spec.height;
2614             if (texturefile.m_sample_border)
2615                 height -= 1.0f;
2616             float tt = t_[sample] * height;
2617             if (tt < 1.0f || tt > (height - 1.0f))
2618                 fade_to_pole(tt, (float*)&accum, weight, texturefile,
2619                              thread_info, levelinfo, options, miplevel,
2620                              actualchannels);
2621         }
2622 
2623         // We use a formulation of cubic B-spline evaluation that reduces to
2624         // lerps.  It's tricky to follow, but the references are:
2625         //   * Ruijters, Daniel et al, "Efficient GPU-Based Texture
2626         //     Interpolation using Uniform B-Splines", Journal of Graphics
2627         //     Tools 13(4), pp. 61-69, 2008.
2628         //     http://jgt.akpeters.com/papers/RuijtersEtAl08/
2629         //   * Sigg, Christian and Markus Hadwiger, "Fast Third-Order Texture
2630         //     Filtering", in GPU Gems 2 (Chapter 20), Pharr and Fernando, ed.
2631         //     http://http.developer.nvidia.com/GPUGems2/gpugems2_chapter20.html
2632         // We like this formulation because it's slightly faster than any of
2633         // the other B-spline evaluation routines we tried, and also the lerp
2634         // guarantees that the filtered results will be non-negative for
2635         // non-negative texel values (which we had trouble with before due to
2636         // numerical imprecision).
2637         vfloat4 wx, dwx;
2638         vfloat4 wy, dwy;
2639         if (daccumds_) {
2640             evalBSplineWeights_and_derivs(&wx, sfrac, &dwx);
2641             evalBSplineWeights_and_derivs(&wy, tfrac, &dwy);
2642         } else {
2643             wx = evalBSplineWeights(vfloat4(sfrac));
2644             wy = evalBSplineWeights(vfloat4(tfrac));
2645 #if (defined(__i386__) && !defined(__x86_64__)) || defined(__aarch64__)
2646             // Some platforms complain here about these being uninitialized,
2647             // so initialize them. Don't waste the cycles for platforms that
2648             // don't seem to have that error. It's a false positive -- this
2649             // code really is safe without the initialization, but that
2650             // doesn't seem to get figured out on some platforms (even when
2651             // running the same gcc version).
2652             dwx = vfloat4::Zero();
2653             dwy = vfloat4::Zero();
2654 #endif
2655         }
2656 
2657         // figure out lerp weights so we can turn the filter into a sequence of lerp's
2658         // Here's the obvious scalar code:
2659         //   float g0x = wx[0] + wx[1]; float h0x = (wx[1] / g0x);
2660         //   float g1x = wx[2] + wx[3]; float h1x = (wx[3] / g1x);
2661         //   float g0y = wy[0] + wy[1]; float h0y = (wy[1] / g0y);
2662         //   float g1y = wy[2] + wy[3]; float h1y = (wy[3] / g1y);
2663         // But instead, we convolutedly (but quickly!) compute the four g
2664         // and four h values using SIMD ops:
2665         vfloat4 wx_0213        = simd::shuffle<0, 2, 1, 3>(wx);
2666         vfloat4 wx_1302        = simd::shuffle<1, 3, 0, 2>(wx);
2667         vfloat4 wx_01_23_01_23 = wx_0213
2668                                  + wx_1302;  // wx[0]+wx[1] wx[2]+wx[3] ..
2669         vfloat4 wy_0213        = simd::shuffle<0, 2, 1, 3>(wy);
2670         vfloat4 wy_1302        = simd::shuffle<1, 3, 0, 2>(wy);
2671         vfloat4 wy_01_23_01_23 = wy_0213
2672                                  + wy_1302;  // wy[0]+wy[1] wy[2]+wy[3] ..
2673         vfloat4 g = AxyBxy(
2674             wx_01_23_01_23,
2675             wy_01_23_01_23);  // wx[0]+wx[1] wx[2]+wx[3] wy[0]+wy[1] wy[2]+wy[3]
2676         vfloat4 wx13_wy13 = AxyBxy(wx_1302, wy_1302);
2677         vfloat4 h         = wx13_wy13 / g;  // [ h0x h1x h0y h1y ]
2678 
2679         simd::vfloat4 col[4];
2680         for (int j = 0; j < 4; ++j) {
2681             simd::vfloat4 lx = lerp(texel_simd[j][0], texel_simd[j][1],
2682                                     shuffle<0>(h) /*h0x*/);
2683             simd::vfloat4 rx = lerp(texel_simd[j][2], texel_simd[j][3],
2684                                     shuffle<1>(h) /*h1x*/);
2685             col[j]           = lerp(lx, rx, shuffle<1>(g) /*g1x*/);
2686         }
2687         simd::vfloat4 ly          = lerp(col[0], col[1], shuffle<2>(h) /*h0y*/);
2688         simd::vfloat4 ry          = lerp(col[2], col[3], shuffle<3>(h) /*h1y*/);
2689         simd::vfloat4 weight_simd = weight;
2690         accum += weight_simd * lerp(ly, ry, shuffle<3>(g) /*g1y*/);
2691         if (daccumds_) {
2692             simd::vfloat4 scalex = weight_simd * float(spec.width);
2693             simd::vfloat4 scaley = weight_simd * float(spec.height);
2694             daccumds += scalex
2695                         * (dwx[0]
2696                                * (wy[0] * texel_simd[0][0]
2697                                   + wy[1] * texel_simd[1][0]
2698                                   + wy[2] * texel_simd[2][0]
2699                                   + wy[3] * texel_simd[3][0])
2700                            + dwx[1]
2701                                  * (wy[0] * texel_simd[0][1]
2702                                     + wy[1] * texel_simd[1][1]
2703                                     + wy[2] * texel_simd[2][1]
2704                                     + wy[3] * texel_simd[3][1])
2705                            + dwx[2]
2706                                  * (wy[0] * texel_simd[0][2]
2707                                     + wy[1] * texel_simd[1][2]
2708                                     + wy[2] * texel_simd[2][2]
2709                                     + wy[3] * texel_simd[3][2])
2710                            + dwx[3]
2711                                  * (wy[0] * texel_simd[0][3]
2712                                     + wy[1] * texel_simd[1][3]
2713                                     + wy[2] * texel_simd[2][3]
2714                                     + wy[3] * texel_simd[3][3]));
2715             daccumdt += scaley
2716                         * (dwy[0]
2717                                * (wx[0] * texel_simd[0][0]
2718                                   + wx[1] * texel_simd[0][1]
2719                                   + wx[2] * texel_simd[0][2]
2720                                   + wx[3] * texel_simd[0][3])
2721                            + dwy[1]
2722                                  * (wx[0] * texel_simd[1][0]
2723                                     + wx[1] * texel_simd[1][1]
2724                                     + wx[2] * texel_simd[1][2]
2725                                     + wx[3] * texel_simd[1][3])
2726                            + dwy[2]
2727                                  * (wx[0] * texel_simd[2][0]
2728                                     + wx[1] * texel_simd[2][1]
2729                                     + wx[2] * texel_simd[2][2]
2730                                     + wx[3] * texel_simd[2][3])
2731                            + dwy[3]
2732                                  * (wx[0] * texel_simd[3][0]
2733                                     + wx[1] * texel_simd[3][1]
2734                                     + wx[2] * texel_simd[3][2]
2735                                     + wx[3] * texel_simd[3][3]));
2736         }
2737 
2738         // Compute appropriate amount of "fill" color to extra channels in
2739         // non-"black"-wrapped regions.
2740         if (!allvalid && use_fill) {
2741             float col[4];
2742             for (int j = 0; j < 4; ++j) {
2743                 float lx = lerp(1.0f * tvalid[j] * svalid[0],
2744                                 1.0f * tvalid[j] * svalid[1],
2745                                 extract<0>(h) /*h0x*/);
2746                 float rx = lerp(1.0f * tvalid[j] * svalid[2],
2747                                 1.0f * tvalid[j] * svalid[3],
2748                                 extract<1>(h) /*h1x*/);
2749                 col[j]   = lerp(lx, rx, extract<1>(g) /*g1x*/);
2750             }
2751             float ly = lerp(col[0], col[1], extract<2>(h) /*h0y*/);
2752             float ry = lerp(col[2], col[3], extract<3>(h) /*h1y*/);
2753             nonfill += weight * (1.0f - lerp(ly, ry, extract<3>(g) /*g1y*/));
2754         }
2755     }
2756 
2757     simd::vbool4 channel_mask = channel_masks[actualchannels];
2758     accum                     = blend0(accum, channel_mask);
2759     if (use_fill) {
2760         // Add the weighted fill color
2761         accum += blend0not(vfloat4((1.0f - nonfill) * options.fill),
2762                            channel_mask);
2763     }
2764 
2765     *accum_ = accum;
2766     if (daccumds_) {
2767         *daccumds_ = blend0(daccumds, channel_mask);
2768         *daccumdt_ = blend0(daccumdt, channel_mask);
2769     }
2770     return true;
2771 }
2772 
2773 
2774 
2775 void
visualize_ellipse(const std::string & name,float dsdx,float dtdx,float dsdy,float dtdy,float sblur,float tblur)2776 TextureSystemImpl::visualize_ellipse(const std::string& name, float dsdx,
2777                                      float dtdx, float dsdy, float dtdy,
2778                                      float sblur, float tblur)
2779 {
2780     std::cout << name << " derivs dx " << dsdx << ' ' << dtdx << ", dt " << dtdx
2781               << ' ' << dtdy << "\n";
2782     adjust_width(dsdx, dtdx, dsdy, dtdy, 1.0f, 1.0f /*, sblur, tblur */);
2783     float majorlength, minorlength, theta;
2784     float ABCF[4];
2785     ellipse_axes(dsdx, dtdx, dsdy, dtdy, majorlength, minorlength, theta, ABCF);
2786     std::cout << "  ellipse major " << majorlength << ", minor " << minorlength
2787               << ", theta " << theta << "\n";
2788     adjust_blur(majorlength, minorlength, theta, sblur, tblur);
2789     std::cout << "  post " << sblur << ' ' << tblur << " blur: major "
2790               << majorlength << ", minor " << minorlength << "\n\n";
2791 
2792     TextureOpt options;
2793     float trueaspect;
2794     float aspect      = TextureSystemImpl::anisotropic_aspect(majorlength,
2795                                                          minorlength, options,
2796                                                          trueaspect);
2797     float* lineweight = OIIO_ALLOCA(float, 2 * options.anisotropic);
2798     float smajor, tmajor, invsamples;
2799     int nsamples = compute_ellipse_sampling(aspect, theta, majorlength,
2800                                             minorlength, smajor, tmajor,
2801                                             invsamples, lineweight);
2802 
2803     // Make an ImageBuf to hold our visualization image, set it to grey
2804     float scale = 100;
2805     int w = 256, h = 256;
2806     ImageSpec spec(w, h, 3);
2807     ImageBuf ib(spec);
2808     static float dark[3]  = { 0.2f, 0.2f, 0.2f };
2809     static float white[3] = { 1, 1, 1 };
2810     static float grey[3]  = { 0.5, 0.5, 0.5 };
2811     static float red[3]   = { 1, 0, 0 };
2812     static float green[3] = { 0, 1, 0 };
2813     ImageBufAlgo::fill(ib, grey);
2814 
2815     // scan all the pixels, darken the ellipse interior (no blur considered)
2816     for (int j = 0; j < h; ++j) {
2817         float y = (j - h / 2) / scale;
2818         for (int i = 0; i < w; ++i) {
2819             float x  = (i - w / 2) / scale;
2820             float d2 = ABCF[0] * x * x + ABCF[1] * x * y + ABCF[2] * y * y;
2821             if (d2 < 1.0f)
2822                 ib.setpixel(i, h - 1 - j, dark);
2823         }
2824     }
2825 
2826     // Draw red and green axes for the dx and dy derivatives, respectively
2827     ImageBufAlgo::render_line(ib, w / 2, h / 2, w / 2 + int(dsdx * scale),
2828                               h / 2 - int(dtdx * scale), red);
2829     ImageBufAlgo::render_line(ib, w / 2, h / 2, w / 2 + int(dsdy * scale),
2830                               h / 2 - int(dtdy * scale), green);
2831 
2832     // Draw yellow and blue axes for the ellipse axes, with blur
2833     ImageBufAlgo::render_line(ib, w / 2, h / 2,
2834                               w / 2 + int(scale * majorlength * cosf(theta)),
2835                               h / 2 - int(scale * majorlength * sinf(theta)),
2836                               { 1.0f, 1.0f, 0.0f });
2837     ImageBufAlgo::render_line(ib, w / 2, h / 2,
2838                               w / 2 + int(scale * minorlength * -sinf(theta)),
2839                               h / 2 - int(scale * minorlength * cosf(theta)),
2840                               { 0.0f, 0.0f, 1.0f });
2841 
2842     float bigweight = 0;
2843     for (int i = 0; i < nsamples; ++i)
2844         bigweight = std::max(lineweight[i], bigweight);
2845 
2846     // Plop white dots at the sample positions
2847     int rad = int(scale * minorlength);
2848     for (int sample = 0; sample < nsamples; ++sample) {
2849         float pos = 1.0f * (sample + 0.5f) * invsamples - 0.5f;
2850         float x = pos * smajor, y = pos * tmajor;
2851         int xx = w / 2 + int(x * scale), yy = h / 2 - int(y * scale);
2852         int size = int(5 * lineweight[sample] / bigweight);
2853         ImageBufAlgo::render_box(ib, xx - rad, yy - rad, xx + rad, yy + rad,
2854                                  { 0.65f, 0.65f, 0.65f });
2855         ImageBufAlgo::render_box(ib, xx - size / 2, yy - size / 2,
2856                                  xx + size / 2, yy + size / 2, white, true);
2857     }
2858 
2859     ib.write(name);
2860 }
2861 
2862 
2863 
2864 void
unit_test_texture()2865 TextureSystemImpl::unit_test_texture()
2866 {
2867     // Just blur in s, it is a harder case
2868     float sblur = unit_test_texture_blur, tblur = 0.0f;
2869     float dsdx, dtdx, dsdy, dtdy;
2870 
2871     dsdx = 0.4;
2872     dtdx = 0.0;
2873     dsdy = 0.0;
2874     dtdy = 0.2;
2875     visualize_ellipse("0.tif", dsdx, dtdx, dsdy, dtdy, sblur, tblur);
2876 
2877     dsdx = 0.2;
2878     dtdx = 0.0;
2879     dsdy = 0.0;
2880     dtdy = 0.4;
2881     visualize_ellipse("1.tif", dsdx, dtdx, dsdy, dtdy, sblur, tblur);
2882 
2883     dsdx = 0.2;
2884     dtdx = 0.2;
2885     dsdy = -0.2;
2886     dtdy = 0.2;
2887     visualize_ellipse("2.tif", dsdx, dtdx, dsdy, dtdy, sblur, tblur);
2888 
2889     dsdx = 0.35;
2890     dtdx = 0.27;
2891     dsdy = 0.1;
2892     dtdy = 0.35;
2893     visualize_ellipse("3.tif", dsdx, dtdx, dsdy, dtdy, sblur, tblur);
2894 
2895     dsdx = 0.35;
2896     dtdx = 0.27;
2897     dsdy = 0.1;
2898     dtdy = -0.35;
2899     visualize_ellipse("4.tif", dsdx, dtdx, dsdy, dtdy, sblur, tblur);
2900 
2901     // Major axis starts vertical, but blur make it minor?
2902     dsdx = 0.2;
2903     dtdx = 0.0;
2904     dsdy = 0.0;
2905     dtdy = 0.3;
2906     visualize_ellipse("5.tif", dsdx, dtdx, dsdy, dtdy, 0.5, 0.0);
2907     dsdx = 0.3;
2908     dtdx = 0.0;
2909     dsdy = 0.0;
2910     dtdy = 0.2;
2911     visualize_ellipse("6.tif", dsdx, dtdx, dsdy, dtdy, 0.0, 0.5);
2912 
2913     std::mt19937 gen;
2914     std::uniform_real_distribution<float> rnd(0.0f, 1.0f);
2915     for (int i = 0; i < 100; ++i) {
2916         dsdx = 1.5f * (rnd(gen) - 0.5f);
2917         dtdx = 1.5f * (rnd(gen) - 0.5f);
2918         dsdy = 1.5f * (rnd(gen) - 0.5f);
2919         dtdy = 1.5f * (rnd(gen) - 0.5f);
2920         visualize_ellipse(Strutil::sprintf("%04d.tif", 100 + i), dsdx, dtdx,
2921                           dsdy, dtdy, sblur, tblur);
2922     }
2923 }
2924 
2925 
2926 
2927 }  // end namespace pvt
2928 
2929 OIIO_NAMESPACE_END
2930