MagickCore  7.1.0
accelerate-kernels-private.h
Go to the documentation of this file.
1 /*
2  Copyright 1999-2021 ImageMagick Studio LLC, a non-profit organization
3  dedicated to making software imaging solutions freely available.
4 
5  You may not use this file except in compliance with the License. You may
6  obtain a copy of the License at
7 
8  https://imagemagick.org/script/license.php
9 
10  Unless required by applicable law or agreed to in writing, software
11  distributed under the License is distributed on an "AS IS" BASIS,
12  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  See the License for the specific language governing permissions and
14  limitations under the License.
15 
16  MagickCore private kernels for accelerated functions.
17 */
18 
19 #ifndef MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
20 #define MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
21 
22 #if defined(__cplusplus) || defined(c_plusplus)
23 extern "C" {
24 #endif
25 
26 #if defined(MAGICKCORE_OPENCL_SUPPORT)
27 
28 /*
29  Define declarations.
30 */
31 #define OPENCL_DEFINE(VAR,...) "\n #""define " #VAR " " #__VA_ARGS__ " \n"
32 #define OPENCL_ELIF(...) "\n #""elif " #__VA_ARGS__ " \n"
33 #define OPENCL_ELSE() "\n #""else " " \n"
34 #define OPENCL_ENDIF() "\n #""endif " " \n"
35 #define OPENCL_IF(...) "\n #""if " #__VA_ARGS__ " \n"
36 #define STRINGIFY(...) #__VA_ARGS__ "\n"
37 
38 const char *accelerateKernels =
39 
40 /*
41  Define declarations.
42 */
43  OPENCL_DEFINE(SigmaUniform, (attenuate*0.015625f))
44  OPENCL_DEFINE(SigmaGaussian, (attenuate*0.015625f))
45  OPENCL_DEFINE(SigmaImpulse, (attenuate*0.1f))
46  OPENCL_DEFINE(SigmaLaplacian, (attenuate*0.0390625f))
47  OPENCL_DEFINE(SigmaMultiplicativeGaussian, (attenuate*0.5f))
48  OPENCL_DEFINE(SigmaPoisson, (attenuate*12.5f))
49  OPENCL_DEFINE(SigmaRandom, (attenuate))
50  OPENCL_DEFINE(TauGaussian, (attenuate*0.078125f))
51  OPENCL_DEFINE(MagickMax(x,y), (((x) > (y)) ? (x) : (y)))
52  OPENCL_DEFINE(MagickMin(x,y), (((x) < (y)) ? (x) : (y)))
53 
54 /*
55  Typedef declarations.
56 */
57  STRINGIFY(
58  typedef enum
59  {
61  CMYColorspace, /* negated linear RGB colorspace */
62  CMYKColorspace, /* CMY with Black separation */
63  GRAYColorspace, /* Single Channel greyscale (non-linear) image */
69  HSVColorspace, /* alias for HSB */
72  LCHColorspace, /* alias for LCHuv */
73  LCHabColorspace, /* Cylindrical (Polar) Lab */
74  LCHuvColorspace, /* Cylindrical (Polar) Luv */
81  RGBColorspace, /* Linear RGB colorspace */
82  scRGBColorspace, /* ??? */
83  sRGBColorspace, /* Default: non-linear sRGB colorspace */
86  XYZColorspace, /* IEEE Color Reference colorspace */
93  LinearGRAYColorspace /* Single Channel greyscale (linear) image */
95  )
96 
97  STRINGIFY(
98  typedef enum
99  {
173  )
174 
175  STRINGIFY(
176  typedef enum
177  {
183  } MagickFunction;
184  )
185 
186  STRINGIFY(
187  typedef enum
188  {
190  UniformNoise,
193  ImpulseNoise,
195  PoissonNoise,
197  } NoiseType;
198  )
199 
200  STRINGIFY(
201  typedef enum
202  {
214  )
215 
216  STRINGIFY(
217  typedef enum
218  {
236  )
237 
238  STRINGIFY(
239  typedef enum
240  {
241  UndefinedChannel = 0x0000,
242  RedChannel = 0x0001,
243  GrayChannel = 0x0001,
244  CyanChannel = 0x0001,
245  GreenChannel = 0x0002,
246  MagentaChannel = 0x0002,
247  BlueChannel = 0x0004,
248  YellowChannel = 0x0004,
249  BlackChannel = 0x0008,
250  AlphaChannel = 0x0010,
251  OpacityChannel = 0x0010,
252  IndexChannel = 0x0020, /* Color Index Table? */
253  ReadMaskChannel = 0x0040, /* Pixel is Not Readable? */
254  WriteMaskChannel = 0x0080, /* Pixel is Write Protected? */
255  MetaChannel = 0x0100, /* ???? */
256  CompositeChannels = 0x001F,
257  AllChannels = 0x7ffffff,
258  /*
259  Special purpose channel types.
260  FUTURE: are these needed any more - they are more like hacks
261  SyncChannels for example is NOT a real channel but a 'flag'
262  It really says -- "User has not defined channels"
263  Though it does have extra meaning in the "-auto-level" operator
264  */
265  TrueAlphaChannel = 0x0100, /* extract actual alpha channel from opacity */
266  RGBChannels = 0x0200, /* set alpha from grayscale mask in RGB */
267  GrayChannels = 0x0400,
268  SyncChannels = 0x20000, /* channels modified as a single unit */
270  } ChannelType; /* must correspond to PixelChannel */
271  )
272 
273 /*
274  Helper functions.
275 */
276 
277 OPENCL_IF((MAGICKCORE_QUANTUM_DEPTH == 8))
278 
279  STRINGIFY(
280  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
281  {
282  return((CLQuantum) value);
283  }
284  )
285 
286 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 16))
287 
288  STRINGIFY(
289  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
290  {
291  return((CLQuantum) (257.0f*value));
292  }
293  )
294 
295 OPENCL_ELIF((MAGICKCORE_QUANTUM_DEPTH == 32))
296 
297  STRINGIFY(
298  static inline CLQuantum ScaleCharToQuantum(const unsigned char value)
299  {
300  return((CLQuantum) (16843009.0*value));
301  }
302  )
303 
304 OPENCL_ENDIF()
305 
306 OPENCL_IF((MAGICKCORE_HDRI_SUPPORT == 1))
307 
308  STRINGIFY(
309  static inline CLQuantum ClampToQuantum(const float value)
310  {
311  return (CLQuantum) clamp(value, 0.0f, QuantumRange);
312  }
313  )
314 
315 OPENCL_ELSE()
316 
317  STRINGIFY(
318  static inline CLQuantum ClampToQuantum(const float value)
319  {
320  return (CLQuantum) (clamp(value, 0.0f, QuantumRange) + 0.5f);
321  }
322  )
323 
324 OPENCL_ENDIF()
325 
326  STRINGIFY(
327  static inline int ClampToCanvas(const int offset,const int range)
328  {
329  return clamp(offset, (int)0, range-1);
330  }
331  )
332 
333  STRINGIFY(
334  static inline uint ScaleQuantumToMap(CLQuantum value)
335  {
336  if (value >= (CLQuantum) MaxMap)
337  return ((uint)MaxMap);
338  else
339  return ((uint)value);
340  }
341  )
342 
343  STRINGIFY(
344  static inline float PerceptibleReciprocal(const float x)
345  {
346  float sign = x < (float) 0.0 ? (float) -1.0 : (float) 1.0;
347  return((sign*x) >= MagickEpsilon ? (float) 1.0/x : sign*((float) 1.0/MagickEpsilon));
348  }
349  )
350 
351  STRINGIFY(
352 
353  static inline unsigned int getPixelIndex(const unsigned int number_channels,
354  const unsigned int columns, const unsigned int x, const unsigned int y)
355  {
356  return (x * number_channels) + (y * columns * number_channels);
357  }
358 
359  static inline float getPixelRed(const __global CLQuantum *p) { return (float)*p; }
360  static inline float getPixelGreen(const __global CLQuantum *p) { return (float)*(p+1); }
361  static inline float getPixelBlue(const __global CLQuantum *p) { return (float)*(p+2); }
362  static inline float getPixelAlpha(const __global CLQuantum *p,const unsigned int number_channels) { return (float)*(p+number_channels-1); }
363 
364  static inline void setPixelRed(__global CLQuantum *p,const CLQuantum value) { *p=value; }
365  static inline void setPixelGreen(__global CLQuantum *p,const CLQuantum value) { *(p+1)=value; }
366  static inline void setPixelBlue(__global CLQuantum *p,const CLQuantum value) { *(p+2)=value; }
367  static inline void setPixelAlpha(__global CLQuantum *p,const unsigned int number_channels,const CLQuantum value) { *(p+number_channels-1)=value; }
368 
369  static inline CLQuantum getBlue(CLPixelType p) { return p.x; }
370  static inline void setBlue(CLPixelType* p, CLQuantum value) { (*p).x = value; }
371  static inline float getBlueF4(float4 p) { return p.x; }
372  static inline void setBlueF4(float4* p, float value) { (*p).x = value; }
373 
374  static inline CLQuantum getGreen(CLPixelType p) { return p.y; }
375  static inline void setGreen(CLPixelType* p, CLQuantum value) { (*p).y = value; }
376  static inline float getGreenF4(float4 p) { return p.y; }
377  static inline void setGreenF4(float4* p, float value) { (*p).y = value; }
378 
379  static inline CLQuantum getRed(CLPixelType p) { return p.z; }
380  static inline void setRed(CLPixelType* p, CLQuantum value) { (*p).z = value; }
381  static inline float getRedF4(float4 p) { return p.z; }
382  static inline void setRedF4(float4* p, float value) { (*p).z = value; }
383 
384  static inline CLQuantum getAlpha(CLPixelType p) { return p.w; }
385  static inline void setAlpha(CLPixelType* p, CLQuantum value) { (*p).w = value; }
386  static inline float getAlphaF4(float4 p) { return p.w; }
387  static inline void setAlphaF4(float4* p, float value) { (*p).w = value; }
388 
389  static inline void ReadChannels(const __global CLQuantum *p, const unsigned int number_channels,
390  const ChannelType channel, float *red, float *green, float *blue, float *alpha)
391  {
392  if ((channel & RedChannel) != 0)
393  *red=getPixelRed(p);
394 
395  if (number_channels > 2)
396  {
397  if ((channel & GreenChannel) != 0)
398  *green=getPixelGreen(p);
399 
400  if ((channel & BlueChannel) != 0)
401  *blue=getPixelBlue(p);
402  }
403 
404  if (((number_channels == 4) || (number_channels == 2)) &&
405  ((channel & AlphaChannel) != 0))
406  *alpha=getPixelAlpha(p,number_channels);
407  }
408 
409  static inline float4 ReadAllChannels(const __global CLQuantum *image, const unsigned int number_channels,
410  const unsigned int columns, const unsigned int x, const unsigned int y)
411  {
412  const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
413 
414  float4 pixel;
415 
416  pixel.x=getPixelRed(p);
417 
418  if (number_channels > 2)
419  {
420  pixel.y=getPixelGreen(p);
421  pixel.z=getPixelBlue(p);
422  }
423 
424  if ((number_channels == 4) || (number_channels == 2))
425  pixel.w=getPixelAlpha(p,number_channels);
426  return(pixel);
427  }
428 
429  static inline float4 ReadFloat4(const __global CLQuantum *image, const unsigned int number_channels,
430  const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel)
431  {
432  const __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
433 
434  float red = 0.0f;
435  float green = 0.0f;
436  float blue = 0.0f;
437  float alpha = 0.0f;
438 
439  ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
440  return (float4)(red, green, blue, alpha);
441  }
442 
443  static inline void WriteChannels(__global CLQuantum *p, const unsigned int number_channels,
444  const ChannelType channel, float red, float green, float blue, float alpha)
445  {
446  if ((channel & RedChannel) != 0)
447  setPixelRed(p,ClampToQuantum(red));
448 
449  if (number_channels > 2)
450  {
451  if ((channel & GreenChannel) != 0)
452  setPixelGreen(p,ClampToQuantum(green));
453 
454  if ((channel & BlueChannel) != 0)
455  setPixelBlue(p,ClampToQuantum(blue));
456  }
457 
458  if (((number_channels == 4) || (number_channels == 2)) &&
459  ((channel & AlphaChannel) != 0))
460  setPixelAlpha(p,number_channels,ClampToQuantum(alpha));
461  }
462 
463  static inline void WriteAllChannels(__global CLQuantum *image, const unsigned int number_channels,
464  const unsigned int columns, const unsigned int x, const unsigned int y, float4 pixel)
465  {
466  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
467 
468  setPixelRed(p,ClampToQuantum(pixel.x));
469 
470  if (number_channels > 2)
471  {
472  setPixelGreen(p,ClampToQuantum(pixel.y));
473  setPixelBlue(p,ClampToQuantum(pixel.z));
474  }
475 
476  if ((number_channels == 4) || (number_channels == 2))
477  setPixelAlpha(p,number_channels,ClampToQuantum(pixel.w));
478  }
479 
480  static inline void WriteFloat4(__global CLQuantum *image, const unsigned int number_channels,
481  const unsigned int columns, const unsigned int x, const unsigned int y, const ChannelType channel,
482  float4 pixel)
483  {
484  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
485  WriteChannels(p, number_channels, channel, pixel.x, pixel.y, pixel.z, pixel.w);
486  }
487 
488  static inline float GetPixelIntensity(const unsigned int colorspace,
489  const unsigned int method,float red,float green,float blue)
490  {
491  float intensity;
492 
493  if ((colorspace == GRAYColorspace) || (colorspace == LinearGRAYColorspace))
494  return red;
495 
496  switch (method)
497  {
499  {
500  intensity=(red+green+blue)/3.0;
501  break;
502  }
504  {
505  intensity=MagickMax(MagickMax(red,green),blue);
506  break;
507  }
509  {
510  intensity=(MagickMin(MagickMin(red,green),blue)+
511  MagickMax(MagickMax(red,green),blue))/2.0;
512  break;
513  }
515  {
516  intensity=(float) (((float) red*red+green*green+blue*blue)/
517  (3.0*QuantumRange));
518  break;
519  }
521  {
522  /*
523  if (image->colorspace == RGBColorspace)
524  {
525  red=EncodePixelGamma(red);
526  green=EncodePixelGamma(green);
527  blue=EncodePixelGamma(blue);
528  }
529  */
530  intensity=0.298839*red+0.586811*green+0.114350*blue;
531  break;
532  }
534  {
535  /*
536  if (image->colorspace == sRGBColorspace)
537  {
538  red=DecodePixelGamma(red);
539  green=DecodePixelGamma(green);
540  blue=DecodePixelGamma(blue);
541  }
542  */
543  intensity=0.298839*red+0.586811*green+0.114350*blue;
544  break;
545  }
547  default:
548  {
549  /*
550  if (image->colorspace == RGBColorspace)
551  {
552  red=EncodePixelGamma(red);
553  green=EncodePixelGamma(green);
554  blue=EncodePixelGamma(blue);
555  }
556  */
557  intensity=0.212656*red+0.715158*green+0.072186*blue;
558  break;
559  }
561  {
562  /*
563  if (image->colorspace == sRGBColorspace)
564  {
565  red=DecodePixelGamma(red);
566  green=DecodePixelGamma(green);
567  blue=DecodePixelGamma(blue);
568  }
569  */
570  intensity=0.212656*red+0.715158*green+0.072186*blue;
571  break;
572  }
574  {
575  intensity=(float) (sqrt((float) red*red+green*green+blue*blue)/
576  sqrt(3.0));
577  break;
578  }
579  }
580 
581  return intensity;
582  }
583 
584  static inline int mirrorBottom(int value)
585  {
586  return (value < 0) ? - (value) : value;
587  }
588 
589  static inline int mirrorTop(int value, int width)
590  {
591  return (value >= width) ? (2 * width - value - 1) : value;
592  }
593  )
594 
595 /*
596 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
597 % %
598 % %
599 % %
600 % A d d N o i s e %
601 % %
602 % %
603 % %
604 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
605 */
606 
607  STRINGIFY(
608  /*
609  Part of MWC64X by David Thomas, dt10@imperial.ac.uk
610  This is provided under BSD, full license is with the main package.
611  See http://www.doc.ic.ac.uk/~dt10/research
612  */
613 
614  // Pre: a<M, b<M
615  // Post: r=(a+b) mod M
616  ulong MWC_AddMod64(ulong a, ulong b, ulong M)
617  {
618  ulong v=a+b;
619  //if( (v>=M) || (v<a) )
620  if( (v>=M) || (convert_float(v) < convert_float(a)) ) // workaround for what appears to be an optimizer bug.
621  v=v-M;
622  return v;
623  }
624 
625  // Pre: a<M,b<M
626  // Post: r=(a*b) mod M
627  // This could be done more efficently, but it is portable, and should
628  // be easy to understand. It can be replaced with any of the better
629  // modular multiplication algorithms (for example if you know you have
630  // double precision available or something).
631  ulong MWC_MulMod64(ulong a, ulong b, ulong M)
632  {
633  ulong r=0;
634  while(a!=0){
635  if(a&1)
636  r=MWC_AddMod64(r,b,M);
637  b=MWC_AddMod64(b,b,M);
638  a=a>>1;
639  }
640  return r;
641  }
642 
643  // Pre: a<M, e>=0
644  // Post: r=(a^b) mod M
645  // This takes at most ~64^2 modular additions, so probably about 2^15 or so instructions on
646  // most architectures
647  ulong MWC_PowMod64(ulong a, ulong e, ulong M)
648  {
649  ulong sqr=a, acc=1;
650  while(e!=0){
651  if(e&1)
652  acc=MWC_MulMod64(acc,sqr,M);
653  sqr=MWC_MulMod64(sqr,sqr,M);
654  e=e>>1;
655  }
656  return acc;
657  }
658 
659  uint2 MWC_SkipImpl_Mod64(uint2 curr, ulong A, ulong M, ulong distance)
660  {
661  ulong m=MWC_PowMod64(A, distance, M);
662  ulong x=curr.x*(ulong)A+curr.y;
663  x=MWC_MulMod64(x, m, M);
664  return (uint2)((uint)(x/A), (uint)(x%A));
665  }
666 
667  uint2 MWC_SeedImpl_Mod64(ulong A, ulong M, uint vecSize, uint vecOffset, ulong streamBase, ulong streamGap)
668  {
669  // This is an arbitrary constant for starting LCG jumping from. I didn't
670  // want to start from 1, as then you end up with the two or three first values
671  // being a bit poor in ones - once you've decided that, one constant is as
672  // good as any another. There is no deep mathematical reason for it, I just
673  // generated a random number.
674  enum{ MWC_BASEID = 4077358422479273989UL };
675 
676  ulong dist=streamBase + (get_global_id(0)*vecSize+vecOffset)*streamGap;
677  ulong m=MWC_PowMod64(A, dist, M);
678 
679  ulong x=MWC_MulMod64(MWC_BASEID, m, M);
680  return (uint2)((uint)(x/A), (uint)(x%A));
681  }
682 
684  typedef struct{ uint x; uint c; uint seed0; ulong seed1; } mwc64x_state_t;
685 
686  void MWC64X_Step(mwc64x_state_t *s)
687  {
688  uint X=s->x, C=s->c;
689 
690  uint Xn=s->seed0*X+C;
691  uint carry=(uint)(Xn<C); // The (Xn<C) will be zero or one for scalar
692  uint Cn=mad_hi(s->seed0,X,carry);
693 
694  s->x=Xn;
695  s->c=Cn;
696  }
697 
698  void MWC64X_Skip(mwc64x_state_t *s, ulong distance)
699  {
700  uint2 tmp=MWC_SkipImpl_Mod64((uint2)(s->x,s->c), s->seed0, s->seed1, distance);
701  s->x=tmp.x;
702  s->c=tmp.y;
703  }
704 
705  void MWC64X_SeedStreams(mwc64x_state_t *s, ulong baseOffset, ulong perStreamOffset)
706  {
707  uint2 tmp=MWC_SeedImpl_Mod64(s->seed0, s->seed1, 1, 0, baseOffset, perStreamOffset);
708  s->x=tmp.x;
709  s->c=tmp.y;
710  }
711 
713  uint MWC64X_NextUint(mwc64x_state_t *s)
714  {
715  uint res=s->x ^ s->c;
716  MWC64X_Step(s);
717  return res;
718  }
719 
720  //
721  // End of MWC64X excerpt
722  //
723 
724  float mwcReadPseudoRandomValue(mwc64x_state_t* rng)
725  {
726  return (1.0f * MWC64X_NextUint(rng)) / (float)(0xffffffff); // normalized to 1.0
727  }
728 
729  float mwcGenerateDifferentialNoise(mwc64x_state_t* r, float pixel, NoiseType noise_type, float attenuate)
730  {
731  float
732  alpha,
733  beta,
734  noise,
735  sigma;
736 
737  noise = 0.0f;
738  alpha=mwcReadPseudoRandomValue(r);
739  switch(noise_type)
740  {
741  case UniformNoise:
742  default:
743  {
744  noise=(pixel+QuantumRange*SigmaUniform*(alpha-0.5f));
745  break;
746  }
747  case GaussianNoise:
748  {
749  float
750  gamma,
751  tau;
752 
753  if (alpha == 0.0f)
754  alpha=1.0f;
755  beta=mwcReadPseudoRandomValue(r);
756  gamma=sqrt(-2.0f*log(alpha));
757  sigma=gamma*cospi((2.0f*beta));
758  tau=gamma*sinpi((2.0f*beta));
759  noise=pixel+sqrt(pixel)*SigmaGaussian*sigma+QuantumRange*TauGaussian*tau;
760  break;
761  }
762  case ImpulseNoise:
763  {
764  if (alpha < (SigmaImpulse/2.0f))
765  noise=0.0f;
766  else
767  if (alpha >= (1.0f-(SigmaImpulse/2.0f)))
768  noise=QuantumRange;
769  else
770  noise=pixel;
771  break;
772  }
773  case LaplacianNoise:
774  {
775  if (alpha <= 0.5f)
776  {
777  if (alpha <= MagickEpsilon)
778  noise=(pixel-QuantumRange);
779  else
780  noise=(pixel+QuantumRange*SigmaLaplacian*log(2.0f*alpha)+
781  0.5f);
782  break;
783  }
784  beta=1.0f-alpha;
785  if (beta <= (0.5f*MagickEpsilon))
786  noise=(pixel+QuantumRange);
787  else
788  noise=(pixel-QuantumRange*SigmaLaplacian*log(2.0f*beta)+0.5f);
789  break;
790  }
792  {
793  sigma=1.0f;
794  if (alpha > MagickEpsilon)
795  sigma=sqrt(-2.0f*log(alpha));
796  beta=mwcReadPseudoRandomValue(r);
797  noise=(pixel+pixel*SigmaMultiplicativeGaussian*sigma*
798  cospi((2.0f*beta))/2.0f);
799  break;
800  }
801  case PoissonNoise:
802  {
803  float
804  poisson;
805  unsigned int i;
806  poisson=exp(-SigmaPoisson*QuantumScale*pixel);
807  for (i=0; alpha > poisson; i++)
808  {
809  beta=mwcReadPseudoRandomValue(r);
810  alpha*=beta;
811  }
813  break;
814  }
815  case RandomNoise:
816  {
817  noise=(QuantumRange*SigmaRandom*alpha);
818  break;
819  }
820  }
821  return noise;
822  }
823 
824  __kernel
825  void AddNoise(const __global CLQuantum *image,
826  const unsigned int number_channels,const ChannelType channel,
827  const unsigned int length,const unsigned int pixelsPerWorkItem,
828  const NoiseType noise_type,const float attenuate,const unsigned int seed0,
829  const unsigned int seed1,const unsigned int numRandomNumbersPerPixel,
830  __global CLQuantum *filteredImage)
831  {
832  mwc64x_state_t rng;
833  rng.seed0 = seed0;
834  rng.seed1 = seed1;
835 
836  uint span = pixelsPerWorkItem * numRandomNumbersPerPixel; // length of RNG substream each workitem will use
837  uint offset = span * get_local_size(0) * get_group_id(0); // offset of this workgroup's RNG substream (in master stream);
838  MWC64X_SeedStreams(&rng, offset, span); // Seed the RNG streams
839 
840  uint pos = get_group_id(0) * get_local_size(0) * pixelsPerWorkItem * number_channels + (get_local_id(0) * number_channels);
841  uint count = pixelsPerWorkItem;
842 
843  while (count > 0 && pos < length)
844  {
845  const __global CLQuantum *p = image + pos;
846  __global CLQuantum *q = filteredImage + pos;
847 
848  float red;
849  float green;
850  float blue;
851  float alpha;
852 
853  ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
854 
855  if ((channel & RedChannel) != 0)
856  red=mwcGenerateDifferentialNoise(&rng,red,noise_type,attenuate);
857 
858  if (number_channels > 2)
859  {
860  if ((channel & GreenChannel) != 0)
861  green=mwcGenerateDifferentialNoise(&rng,green,noise_type,attenuate);
862 
863  if ((channel & BlueChannel) != 0)
864  blue=mwcGenerateDifferentialNoise(&rng,blue,noise_type,attenuate);
865  }
866 
867  if (((number_channels == 4) || (number_channels == 2)) &&
868  ((channel & AlphaChannel) != 0))
869  alpha=mwcGenerateDifferentialNoise(&rng,alpha,noise_type,attenuate);
870 
871  WriteChannels(q, number_channels, channel, red, green, blue, alpha);
872 
873  pos += (get_local_size(0) * number_channels);
874  count--;
875  }
876  }
877  )
878 
879 /*
880 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
881 % %
882 % %
883 % %
884 % B l u r %
885 % %
886 % %
887 % %
888 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
889 */
890 
891  STRINGIFY(
892  /*
893  Reduce image noise and reduce detail levels by row
894  */
895  __kernel void BlurRow(const __global CLQuantum *image,
896  const unsigned int number_channels,const ChannelType channel,
897  __constant float *filter,const unsigned int width,
898  const unsigned int imageColumns,const unsigned int imageRows,
899  __local float4 *temp,__global float4 *tempImage)
900  {
901  const int x = get_global_id(0);
902  const int y = get_global_id(1);
903 
904  const int columns = imageColumns;
905 
906  const unsigned int radius = (width-1)/2;
907  const int wsize = get_local_size(0);
908  const unsigned int loadSize = wsize+width;
909 
910  //group coordinate
911  const int groupX=get_local_size(0)*get_group_id(0);
912 
913  //parallel load and clamp
914  for (int i=get_local_id(0); i < loadSize; i=i+get_local_size(0))
915  {
916  int cx = ClampToCanvas(i + groupX - radius, columns);
917  temp[i] = ReadFloat4(image, number_channels, columns, cx, y, channel);
918  }
919 
920  // barrier
921  barrier(CLK_LOCAL_MEM_FENCE);
922 
923  // only do the work if this is not a patched item
924  if (get_global_id(0) < columns)
925  {
926  // compute
927  float4 result = (float4) 0;
928 
929  int i = 0;
930 
931  for ( ; i+7 < width; )
932  {
933  for (int j=0; j < 8; j++)
934  result+=filter[i+j]*temp[i+j+get_local_id(0)];
935  i+=8;
936  }
937 
938  for ( ; i < width; i++)
939  result+=filter[i]*temp[i+get_local_id(0)];
940 
941  // write back to global
942  tempImage[y*columns+x] = result;
943  }
944  }
945  )
946 
947  STRINGIFY(
948  /*
949  Reduce image noise and reduce detail levels by line
950  */
951  __kernel void BlurColumn(const __global float4 *blurRowData,
952  const unsigned int number_channels,const ChannelType channel,
953  __constant float *filter,const unsigned int width,
954  const unsigned int imageColumns,const unsigned int imageRows,
955  __local float4 *temp,__global CLQuantum *filteredImage)
956  {
957  const int x = get_global_id(0);
958  const int y = get_global_id(1);
959 
960  const int columns = imageColumns;
961  const int rows = imageRows;
962 
963  unsigned int radius = (width-1)/2;
964  const int wsize = get_local_size(1);
965  const unsigned int loadSize = wsize+width;
966 
967  //group coordinate
968  const int groupX=get_local_size(0)*get_group_id(0);
969  const int groupY=get_local_size(1)*get_group_id(1);
970  //notice that get_local_size(0) is 1, so
971  //groupX=get_group_id(0);
972 
973  //parallel load and clamp
974  for (int i = get_local_id(1); i < loadSize; i=i+get_local_size(1))
975  temp[i] = blurRowData[ClampToCanvas(i+groupY-radius, rows) * columns + groupX];
976 
977  // barrier
978  barrier(CLK_LOCAL_MEM_FENCE);
979 
980  // only do the work if this is not a patched item
981  if (get_global_id(1) < rows)
982  {
983  // compute
984  float4 result = (float4) 0;
985 
986  int i = 0;
987 
988  for ( ; i+7 < width; )
989  {
990  for (int j=0; j < 8; j++)
991  result+=filter[i+j]*temp[i+j+get_local_id(1)];
992  i+=8;
993  }
994 
995  for ( ; i < width; i++)
996  result+=filter[i]*temp[i+get_local_id(1)];
997 
998  // write back to global
999  WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
1000  }
1001  }
1002  )
1003 
1004 /*
1005 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1006 % %
1007 % %
1008 % %
1009 % C o n t r a s t %
1010 % %
1011 % %
1012 % %
1013 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1014 */
1015 
1016  STRINGIFY(
1017 
1018  static inline float4 ConvertRGBToHSB(const float4 pixel)
1019  {
1020  float4 result=0.0f;
1021  result.w=pixel.w;
1022  float tmax=MagickMax(MagickMax(pixel.x,pixel.y),pixel.z);
1023  if (tmax != 0.0f)
1024  {
1025  float tmin=MagickMin(MagickMin(pixel.x,pixel.y),pixel.z);
1026  float delta=tmax-tmin;
1027 
1028  result.y=delta/tmax;
1029  result.z=QuantumScale*tmax;
1030  if (delta != 0.0f)
1031  {
1032  result.x =((pixel.x == tmax) ? 0.0f : ((pixel.y == tmax) ? 2.0f : 4.0f));
1033  result.x+=((pixel.x == tmax) ? (pixel.y-pixel.z) : ((pixel.y == tmax) ?
1034  (pixel.z-pixel.x) : (pixel.x-pixel.y)))/delta;
1035  result.x/=6.0f;
1036  result.x+=(result.x < 0.0f) ? 0.0f : 1.0f;
1037  }
1038  }
1039  return(result);
1040  }
1041 
1042  static inline float4 ConvertHSBToRGB(const float4 pixel)
1043  {
1044  float hue=pixel.x;
1045  float saturation=pixel.y;
1046  float brightness=pixel.z;
1047 
1048  float4 result=pixel;
1049 
1050  if (saturation == 0.0f)
1051  {
1052  result.x=result.y=result.z=ClampToQuantum(QuantumRange*brightness);
1053  }
1054  else
1055  {
1056  float h=6.0f*(hue-floor(hue));
1057  float f=h-floor(h);
1058  float p=brightness*(1.0f-saturation);
1059  float q=brightness*(1.0f-saturation*f);
1060  float t=brightness*(1.0f-(saturation*(1.0f-f)));
1061  int ih = (int)h;
1062 
1063  if (ih == 1)
1064  {
1065  result.x=ClampToQuantum(QuantumRange*q);
1066  result.y=ClampToQuantum(QuantumRange*brightness);
1067  result.z=ClampToQuantum(QuantumRange*p);
1068  }
1069  else if (ih == 2)
1070  {
1071  result.x=ClampToQuantum(QuantumRange*p);
1072  result.y=ClampToQuantum(QuantumRange*brightness);
1073  result.z=ClampToQuantum(QuantumRange*t);
1074  }
1075  else if (ih == 3)
1076  {
1077  result.x=ClampToQuantum(QuantumRange*p);
1078  result.y=ClampToQuantum(QuantumRange*q);
1079  result.z=ClampToQuantum(QuantumRange*brightness);
1080  }
1081  else if (ih == 4)
1082  {
1083  result.x=ClampToQuantum(QuantumRange*t);
1084  result.y=ClampToQuantum(QuantumRange*p);
1085  result.z=ClampToQuantum(QuantumRange*brightness);
1086  }
1087  else if (ih == 5)
1088  {
1089  result.x=ClampToQuantum(QuantumRange*brightness);
1090  result.y=ClampToQuantum(QuantumRange*p);
1091  result.z=ClampToQuantum(QuantumRange*q);
1092  }
1093  else
1094  {
1095  result.x=ClampToQuantum(QuantumRange*brightness);
1096  result.y=ClampToQuantum(QuantumRange*t);
1097  result.z=ClampToQuantum(QuantumRange*p);
1098  }
1099  }
1100  return(result);
1101  }
1102 
1103  __kernel void Contrast(__global CLQuantum *image,
1104  const unsigned int number_channels,const int sign)
1105  {
1106  const int x=get_global_id(0);
1107  const int y=get_global_id(1);
1108  const unsigned int columns=get_global_size(0);
1109 
1110  float4 pixel=ReadAllChannels(image,number_channels,columns,x,y);
1111  if (number_channels < 3)
1112  pixel.y=pixel.z=pixel.x;
1113 
1114  pixel=ConvertRGBToHSB(pixel);
1115  float brightness=pixel.z;
1116  brightness+=0.5f*sign*(0.5f*(sinpi(brightness-0.5f)+1.0f)-brightness);
1117  brightness=clamp(brightness,0.0f,1.0f);
1118  pixel.z=brightness;
1119  pixel=ConvertHSBToRGB(pixel);
1120 
1121  WriteAllChannels(image,number_channels,columns,x,y,pixel);
1122  }
1123  )
1124 
1125 /*
1126 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1127 % %
1128 % %
1129 % %
1130 % C o n t r a s t S t r e t c h %
1131 % %
1132 % %
1133 % %
1134 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1135 */
1136 
1137  STRINGIFY(
1138  /*
1139  */
1140  __kernel void Histogram(__global CLPixelType * restrict im,
1141  const ChannelType channel,
1142  const unsigned int colorspace,
1143  const unsigned int method,
1144  __global uint4 * restrict histogram)
1145  {
1146  const int x = get_global_id(0);
1147  const int y = get_global_id(1);
1148  const int columns = get_global_size(0);
1149  const int c = x + y * columns;
1150  if ((channel & SyncChannels) != 0)
1151  {
1152  float red=(float)getRed(im[c]);
1153  float green=(float)getGreen(im[c]);
1154  float blue=(float)getBlue(im[c]);
1155 
1156  float intensity = GetPixelIntensity(colorspace, method, red, green, blue);
1157  uint pos = ScaleQuantumToMap(ClampToQuantum(intensity));
1158  atomic_inc((__global uint *)(&(histogram[pos]))+2); //red position
1159  }
1160  else
1161  {
1162  // for equalizing, we always need all channels?
1163  // otherwise something more
1164  }
1165  }
1166  )
1167 
1168  STRINGIFY(
1169  /*
1170  */
1171  __kernel void ContrastStretch(__global CLPixelType * restrict im,
1172  const ChannelType channel,
1173  __global CLPixelType * restrict stretch_map,
1174  const float4 white, const float4 black)
1175  {
1176  const int x = get_global_id(0);
1177  const int y = get_global_id(1);
1178  const int columns = get_global_size(0);
1179  const int c = x + y * columns;
1180 
1181  uint ePos;
1182  CLPixelType oValue, eValue;
1183  CLQuantum red, green, blue, alpha;
1184 
1185  //read from global
1186  oValue=im[c];
1187 
1188  if ((channel & RedChannel) != 0)
1189  {
1190  if (getRedF4(white) != getRedF4(black))
1191  {
1192  ePos = ScaleQuantumToMap(getRed(oValue));
1193  eValue = stretch_map[ePos];
1194  red = getRed(eValue);
1195  }
1196  }
1197 
1198  if ((channel & GreenChannel) != 0)
1199  {
1200  if (getGreenF4(white) != getGreenF4(black))
1201  {
1202  ePos = ScaleQuantumToMap(getGreen(oValue));
1203  eValue = stretch_map[ePos];
1204  green = getGreen(eValue);
1205  }
1206  }
1207 
1208  if ((channel & BlueChannel) != 0)
1209  {
1210  if (getBlueF4(white) != getBlueF4(black))
1211  {
1212  ePos = ScaleQuantumToMap(getBlue(oValue));
1213  eValue = stretch_map[ePos];
1214  blue = getBlue(eValue);
1215  }
1216  }
1217 
1218  if ((channel & AlphaChannel) != 0)
1219  {
1220  if (getAlphaF4(white) != getAlphaF4(black))
1221  {
1222  ePos = ScaleQuantumToMap(getAlpha(oValue));
1223  eValue = stretch_map[ePos];
1224  alpha = getAlpha(eValue);
1225  }
1226  }
1227 
1228  //write back
1229  im[c]=(CLPixelType)(blue, green, red, alpha);
1230 
1231  }
1232  )
1233 
1234 /*
1235 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1236 % %
1237 % %
1238 % %
1239 % C o n v o l v e %
1240 % %
1241 % %
1242 % %
1243 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1244 */
1245 
1246  STRINGIFY(
1247  __kernel
1248  void ConvolveOptimized(const __global CLPixelType *input, __global CLPixelType *output,
1249  const unsigned int imageWidth, const unsigned int imageHeight,
1250  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1251  const uint matte, const ChannelType channel, __local CLPixelType *pixelLocalCache, __local float* filterCache) {
1252 
1253  int2 blockID;
1254  blockID.x = get_global_id(0) / get_local_size(0);
1255  blockID.y = get_global_id(1) / get_local_size(1);
1256 
1257  // image area processed by this workgroup
1258  int2 imageAreaOrg;
1259  imageAreaOrg.x = blockID.x * get_local_size(0);
1260  imageAreaOrg.y = blockID.y * get_local_size(1);
1261 
1262  int2 midFilterDimen;
1263  midFilterDimen.x = (filterWidth-1)/2;
1264  midFilterDimen.y = (filterHeight-1)/2;
1265 
1266  int2 cachedAreaOrg = imageAreaOrg - midFilterDimen;
1267 
1268  // dimension of the local cache
1269  int2 cachedAreaDimen;
1270  cachedAreaDimen.x = get_local_size(0) + filterWidth - 1;
1271  cachedAreaDimen.y = get_local_size(1) + filterHeight - 1;
1272 
1273  // cache the pixels accessed by this workgroup in local memory
1274  int localID = get_local_id(1)*get_local_size(0)+get_local_id(0);
1275  int cachedAreaNumPixels = cachedAreaDimen.x * cachedAreaDimen.y;
1276  int groupSize = get_local_size(0) * get_local_size(1);
1277  for (int i = localID; i < cachedAreaNumPixels; i+=groupSize) {
1278 
1279  int2 cachedAreaIndex;
1280  cachedAreaIndex.x = i % cachedAreaDimen.x;
1281  cachedAreaIndex.y = i / cachedAreaDimen.x;
1282 
1283  int2 imagePixelIndex;
1284  imagePixelIndex = cachedAreaOrg + cachedAreaIndex;
1285 
1286  // only support EdgeVirtualPixelMethod through ClampToCanvas
1287  // TODO: implement other virtual pixel method
1288  imagePixelIndex.x = ClampToCanvas(imagePixelIndex.x, imageWidth);
1289  imagePixelIndex.y = ClampToCanvas(imagePixelIndex.y, imageHeight);
1290 
1291  pixelLocalCache[i] = input[imagePixelIndex.y * imageWidth + imagePixelIndex.x];
1292  }
1293 
1294  // cache the filter
1295  for (int i = localID; i < filterHeight*filterWidth; i+=groupSize) {
1296  filterCache[i] = filter[i];
1297  }
1298  barrier(CLK_LOCAL_MEM_FENCE);
1299 
1300 
1301  int2 imageIndex;
1302  imageIndex.x = imageAreaOrg.x + get_local_id(0);
1303  imageIndex.y = imageAreaOrg.y + get_local_id(1);
1304 
1305  // if out-of-range, stops here and quit
1306  if (imageIndex.x >= imageWidth
1307  || imageIndex.y >= imageHeight) {
1308  return;
1309  }
1310 
1311  int filterIndex = 0;
1312  float4 sum = (float4)0.0f;
1313  float gamma = 0.0f;
1314  if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1315  int cacheIndexY = get_local_id(1);
1316  for (int j = 0; j < filterHeight; j++) {
1317  int cacheIndexX = get_local_id(0);
1318  for (int i = 0; i < filterWidth; i++) {
1319  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1320  float f = filterCache[filterIndex];
1321 
1322  sum.x += f * p.x;
1323  sum.y += f * p.y;
1324  sum.z += f * p.z;
1325  sum.w += f * p.w;
1326 
1327  gamma += f;
1328  filterIndex++;
1329  cacheIndexX++;
1330  }
1331  cacheIndexY++;
1332  }
1333  }
1334  else {
1335  int cacheIndexY = get_local_id(1);
1336  for (int j = 0; j < filterHeight; j++) {
1337  int cacheIndexX = get_local_id(0);
1338  for (int i = 0; i < filterWidth; i++) {
1339 
1340  CLPixelType p = pixelLocalCache[cacheIndexY*cachedAreaDimen.x + cacheIndexX];
1341  float alpha = QuantumScale*p.w;
1342  float f = filterCache[filterIndex];
1343  float g = alpha * f;
1344 
1345  sum.x += g*p.x;
1346  sum.y += g*p.y;
1347  sum.z += g*p.z;
1348  sum.w += f*p.w;
1349 
1350  gamma += g;
1351  filterIndex++;
1352  cacheIndexX++;
1353  }
1354  cacheIndexY++;
1355  }
1356  gamma = PerceptibleReciprocal(gamma);
1357  sum.xyz = gamma*sum.xyz;
1358  }
1359  CLPixelType outputPixel;
1360  outputPixel.x = ClampToQuantum(sum.x);
1361  outputPixel.y = ClampToQuantum(sum.y);
1362  outputPixel.z = ClampToQuantum(sum.z);
1363  outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1364 
1365  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1366  }
1367  )
1368 
1369  STRINGIFY(
1370  __kernel
1371  void Convolve(const __global CLPixelType *input, __global CLPixelType *output,
1372  const uint imageWidth, const uint imageHeight,
1373  __constant float *filter, const unsigned int filterWidth, const unsigned int filterHeight,
1374  const uint matte, const ChannelType channel) {
1375 
1376  int2 imageIndex;
1377  imageIndex.x = get_global_id(0);
1378  imageIndex.y = get_global_id(1);
1379 
1380  /*
1381  unsigned int imageWidth = get_global_size(0);
1382  unsigned int imageHeight = get_global_size(1);
1383  */
1384  if (imageIndex.x >= imageWidth
1385  || imageIndex.y >= imageHeight)
1386  return;
1387 
1388  int2 midFilterDimen;
1389  midFilterDimen.x = (filterWidth-1)/2;
1390  midFilterDimen.y = (filterHeight-1)/2;
1391 
1392  int filterIndex = 0;
1393  float4 sum = (float4)0.0f;
1394  float gamma = 0.0f;
1395  if (((channel & AlphaChannel) == 0) || (matte == 0)) {
1396  for (int j = 0; j < filterHeight; j++) {
1397  int2 inputPixelIndex;
1398  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1399  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1400  for (int i = 0; i < filterWidth; i++) {
1401  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1402  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1403 
1404  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1405  float f = filter[filterIndex];
1406 
1407  sum.x += f * p.x;
1408  sum.y += f * p.y;
1409  sum.z += f * p.z;
1410  sum.w += f * p.w;
1411 
1412  gamma += f;
1413 
1414  filterIndex++;
1415  }
1416  }
1417  }
1418  else {
1419 
1420  for (int j = 0; j < filterHeight; j++) {
1421  int2 inputPixelIndex;
1422  inputPixelIndex.y = imageIndex.y - midFilterDimen.y + j;
1423  inputPixelIndex.y = ClampToCanvas(inputPixelIndex.y, imageHeight);
1424  for (int i = 0; i < filterWidth; i++) {
1425  inputPixelIndex.x = imageIndex.x - midFilterDimen.x + i;
1426  inputPixelIndex.x = ClampToCanvas(inputPixelIndex.x, imageWidth);
1427 
1428  CLPixelType p = input[inputPixelIndex.y * imageWidth + inputPixelIndex.x];
1429  float alpha = QuantumScale*p.w;
1430  float f = filter[filterIndex];
1431  float g = alpha * f;
1432 
1433  sum.x += g*p.x;
1434  sum.y += g*p.y;
1435  sum.z += g*p.z;
1436  sum.w += f*p.w;
1437 
1438  gamma += g;
1439 
1440 
1441  filterIndex++;
1442  }
1443  }
1444  gamma = PerceptibleReciprocal(gamma);
1445  sum.xyz = gamma*sum.xyz;
1446  }
1447 
1448  CLPixelType outputPixel;
1449  outputPixel.x = ClampToQuantum(sum.x);
1450  outputPixel.y = ClampToQuantum(sum.y);
1451  outputPixel.z = ClampToQuantum(sum.z);
1452  outputPixel.w = ((channel & AlphaChannel)!=0)?ClampToQuantum(sum.w):input[imageIndex.y * imageWidth + imageIndex.x].w;
1453 
1454  output[imageIndex.y * imageWidth + imageIndex.x] = outputPixel;
1455  }
1456  )
1457 
1458 /*
1459 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1460 % %
1461 % %
1462 % %
1463 % D e s p e c k l e %
1464 % %
1465 % %
1466 % %
1467 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1468 */
1469 
1470  STRINGIFY(
1471 
1472  __kernel void HullPass1(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1473  , const unsigned int imageWidth, const unsigned int imageHeight
1474  , const int2 offset, const int polarity, const int matte) {
1475 
1476  int x = get_global_id(0);
1477  int y = get_global_id(1);
1478 
1479  CLPixelType v = inputImage[y*imageWidth+x];
1480 
1481  int2 neighbor;
1482  neighbor.y = y + offset.y;
1483  neighbor.x = x + offset.x;
1484 
1485  int2 clampedNeighbor;
1486  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1487  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1488 
1489  CLPixelType r = (clampedNeighbor.x == neighbor.x
1490  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1491  :(CLPixelType)0;
1492 
1493  int sv[4];
1494  sv[0] = (int)v.x;
1495  sv[1] = (int)v.y;
1496  sv[2] = (int)v.z;
1497  sv[3] = (int)v.w;
1498 
1499  int sr[4];
1500  sr[0] = (int)r.x;
1501  sr[1] = (int)r.y;
1502  sr[2] = (int)r.z;
1503  sr[3] = (int)r.w;
1504 
1505  if (polarity > 0) {
1506  \n #pragma unroll 4\n
1507  for (unsigned int i = 0; i < 4; i++) {
1508  sv[i] = (sr[i] >= (sv[i]+ScaleCharToQuantum(2)))?(sv[i]+ScaleCharToQuantum(1)):sv[i];
1509  }
1510  }
1511  else {
1512  \n #pragma unroll 4\n
1513  for (unsigned int i = 0; i < 4; i++) {
1514  sv[i] = (sr[i] <= (sv[i]-ScaleCharToQuantum(2)))?(sv[i]-ScaleCharToQuantum(1)):sv[i];
1515  }
1516 
1517  }
1518 
1519  v.x = (CLQuantum)sv[0];
1520  v.y = (CLQuantum)sv[1];
1521  v.z = (CLQuantum)sv[2];
1522 
1523  if (matte!=0)
1524  v.w = (CLQuantum)sv[3];
1525 
1526  outputImage[y*imageWidth+x] = v;
1527 
1528  }
1529 
1530 
1531  )
1532 
1533  STRINGIFY(
1534 
1535  __kernel void HullPass2(const __global CLPixelType *inputImage, __global CLPixelType *outputImage
1536  , const unsigned int imageWidth, const unsigned int imageHeight
1537  , const int2 offset, const int polarity, const int matte) {
1538 
1539  int x = get_global_id(0);
1540  int y = get_global_id(1);
1541 
1542  CLPixelType v = inputImage[y*imageWidth+x];
1543 
1544  int2 neighbor, clampedNeighbor;
1545 
1546  neighbor.y = y + offset.y;
1547  neighbor.x = x + offset.x;
1548  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1549  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1550 
1551  CLPixelType r = (clampedNeighbor.x == neighbor.x
1552  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1553  :(CLPixelType)0;
1554 
1555 
1556  neighbor.y = y - offset.y;
1557  neighbor.x = x - offset.x;
1558  clampedNeighbor.x = ClampToCanvas(neighbor.x, imageWidth);
1559  clampedNeighbor.y = ClampToCanvas(neighbor.y, imageHeight);
1560 
1561  CLPixelType s = (clampedNeighbor.x == neighbor.x
1562  && clampedNeighbor.y == neighbor.y)?inputImage[clampedNeighbor.y*imageWidth+clampedNeighbor.x]
1563  :(CLPixelType)0;
1564 
1565 
1566  int sv[4];
1567  sv[0] = (int)v.x;
1568  sv[1] = (int)v.y;
1569  sv[2] = (int)v.z;
1570  sv[3] = (int)v.w;
1571 
1572  int sr[4];
1573  sr[0] = (int)r.x;
1574  sr[1] = (int)r.y;
1575  sr[2] = (int)r.z;
1576  sr[3] = (int)r.w;
1577 
1578  int ss[4];
1579  ss[0] = (int)s.x;
1580  ss[1] = (int)s.y;
1581  ss[2] = (int)s.z;
1582  ss[3] = (int)s.w;
1583 
1584  if (polarity > 0) {
1585  \n #pragma unroll 4\n
1586  for (unsigned int i = 0; i < 4; i++) {
1587  //sv[i] = (ss[i] >= (sv[i]+ScaleCharToQuantum(2)) && sr[i] > sv[i] ) ? (sv[i]+ScaleCharToQuantum(1)):sv[i];
1588  //
1589  //sv[i] =(!( (int)(ss[i] >= (sv[i]+ScaleCharToQuantum(2))) && (int) (sr[i] > sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1590  //sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) || (int) ( sr[i] <= sv[i] ) )) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1591  sv[i] =(( (int)( ss[i] < (sv[i]+ScaleCharToQuantum(2))) + (int) ( sr[i] <= sv[i] ) ) !=0) ? sv[i]:(sv[i]+ScaleCharToQuantum(1));
1592  }
1593  }
1594  else {
1595  \n #pragma unroll 4\n
1596  for (unsigned int i = 0; i < 4; i++) {
1597  //sv[i] = (ss[i] <= (sv[i]-ScaleCharToQuantum(2)) && sr[i] < sv[i] ) ? (sv[i]-ScaleCharToQuantum(1)):sv[i];
1598  //
1599  //sv[i] = ( (int)(ss[i] <= (sv[i]-ScaleCharToQuantum(2)) ) + (int)( sr[i] < sv[i] ) ==0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1600  sv[i] = (( (int)(ss[i] > (sv[i]-ScaleCharToQuantum(2))) + (int)( sr[i] >= sv[i] )) !=0) ? sv[i]:(sv[i]-ScaleCharToQuantum(1));
1601  }
1602  }
1603 
1604  v.x = (CLQuantum)sv[0];
1605  v.y = (CLQuantum)sv[1];
1606  v.z = (CLQuantum)sv[2];
1607 
1608  if (matte!=0)
1609  v.w = (CLQuantum)sv[3];
1610 
1611  outputImage[y*imageWidth+x] = v;
1612 
1613  }
1614  )
1615 
1616 /*
1617 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1618 % %
1619 % %
1620 % %
1621 % E q u a l i z e %
1622 % %
1623 % %
1624 % %
1625 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1626 */
1627 
1628  STRINGIFY(
1629  /*
1630  */
1631  __kernel void Equalize(__global CLPixelType * restrict im,
1632  const ChannelType channel,
1633  __global CLPixelType * restrict equalize_map,
1634  const float4 white, const float4 black)
1635  {
1636  const int x = get_global_id(0);
1637  const int y = get_global_id(1);
1638  const int columns = get_global_size(0);
1639  const int c = x + y * columns;
1640 
1641  uint ePos;
1642  CLPixelType oValue, eValue;
1643  CLQuantum red, green, blue, alpha;
1644 
1645  //read from global
1646  oValue=im[c];
1647 
1648  if ((channel & SyncChannels) != 0)
1649  {
1650  if (getRedF4(white) != getRedF4(black))
1651  {
1652  ePos = ScaleQuantumToMap(getRed(oValue));
1653  eValue = equalize_map[ePos];
1654  red = getRed(eValue);
1655  ePos = ScaleQuantumToMap(getGreen(oValue));
1656  eValue = equalize_map[ePos];
1657  green = getRed(eValue);
1658  ePos = ScaleQuantumToMap(getBlue(oValue));
1659  eValue = equalize_map[ePos];
1660  blue = getRed(eValue);
1661  ePos = ScaleQuantumToMap(getAlpha(oValue));
1662  eValue = equalize_map[ePos];
1663  alpha = getRed(eValue);
1664 
1665  //write back
1666  im[c]=(CLPixelType)(blue, green, red, alpha);
1667  }
1668 
1669  }
1670 
1671  // for equalizing, we always need all channels?
1672  // otherwise something more
1673 
1674  }
1675  )
1676 
1677 /*
1678 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1679 % %
1680 % %
1681 % %
1682 % F u n c t i o n %
1683 % %
1684 % %
1685 % %
1686 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1687 */
1688 
1689  STRINGIFY(
1690  /*
1691  apply FunctionImageChannel(braightness-contrast)
1692  */
1693  CLQuantum ApplyFunction(float pixel,const MagickFunction function,
1694  const unsigned int number_parameters,__constant float *parameters)
1695  {
1696  float result = 0.0f;
1697 
1698  switch (function)
1699  {
1700  case PolynomialFunction:
1701  {
1702  for (unsigned int i=0; i < number_parameters; i++)
1703  result = result*QuantumScale*pixel + parameters[i];
1704  result *= QuantumRange;
1705  break;
1706  }
1707  case SinusoidFunction:
1708  {
1709  float freq,phase,ampl,bias;
1710  freq = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1711  phase = ( number_parameters >= 2 ) ? parameters[1] : 0.0f;
1712  ampl = ( number_parameters >= 3 ) ? parameters[2] : 0.5f;
1713  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1714  result = QuantumRange*(ampl*sin(2.0f*MagickPI*
1715  (freq*QuantumScale*pixel + phase/360.0f)) + bias);
1716  break;
1717  }
1718  case ArcsinFunction:
1719  {
1720  float width,range,center,bias;
1721  width = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1722  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1723  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1724  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1725 
1726  result = 2.0f/width*(QuantumScale*pixel - center);
1727  result = range/MagickPI*asin(result)+bias;
1728  result = ( result <= -1.0f ) ? bias - range/2.0f : result;
1729  result = ( result >= 1.0f ) ? bias + range/2.0f : result;
1730  result *= QuantumRange;
1731  break;
1732  }
1733  case ArctanFunction:
1734  {
1735  float slope,range,center,bias;
1736  slope = ( number_parameters >= 1 ) ? parameters[0] : 1.0f;
1737  center = ( number_parameters >= 2 ) ? parameters[1] : 0.5f;
1738  range = ( number_parameters >= 3 ) ? parameters[2] : 1.0f;
1739  bias = ( number_parameters >= 4 ) ? parameters[3] : 0.5f;
1740  result = MagickPI*slope*(QuantumScale*pixel-center);
1741  result = QuantumRange*(range/MagickPI*atan(result) + bias);
1742  break;
1743  }
1744  case UndefinedFunction:
1745  break;
1746  }
1747  return(ClampToQuantum(result));
1748  }
1749  )
1750 
1751  STRINGIFY(
1752  /*
1753  Improve brightness / contrast of the image
1754  channel : define which channel is improved
1755  function : the function called to enchance the brightness contrast
1756  number_parameters : numbers of parameters
1757  parameters : the parameter
1758  */
1759  __kernel void ComputeFunction(__global CLQuantum *image,const unsigned int number_channels,
1760  const ChannelType channel,const MagickFunction function,const unsigned int number_parameters,
1761  __constant float *parameters)
1762  {
1763  const unsigned int x = get_global_id(0);
1764  const unsigned int y = get_global_id(1);
1765  const unsigned int columns = get_global_size(0);
1766  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1767 
1768  float red;
1769  float green;
1770  float blue;
1771  float alpha;
1772 
1773  ReadChannels(p, number_channels, channel, &red, &green, &blue, &alpha);
1774 
1775  if ((channel & RedChannel) != 0)
1776  red=ApplyFunction(red, function, number_parameters, parameters);
1777 
1778  if (number_channels > 2)
1779  {
1780  if ((channel & GreenChannel) != 0)
1781  green=ApplyFunction(green, function, number_parameters, parameters);
1782 
1783  if ((channel & BlueChannel) != 0)
1784  blue=ApplyFunction(blue, function, number_parameters, parameters);
1785  }
1786 
1787  if (((number_channels == 4) || (number_channels == 2)) &&
1788  ((channel & AlphaChannel) != 0))
1789  alpha=ApplyFunction(alpha, function, number_parameters, parameters);
1790 
1791  WriteChannels(p, number_channels, channel, red, green, blue, alpha);
1792  }
1793  )
1794 
1795 /*
1796 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1797 % %
1798 % %
1799 % %
1800 % G r a y s c a l e %
1801 % %
1802 % %
1803 % %
1804 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1805 */
1806 
1807  STRINGIFY(
1808  __kernel void Grayscale(__global CLQuantum *image,const int number_channels,
1809  const unsigned int colorspace,const unsigned int method)
1810  {
1811  const unsigned int x = get_global_id(0);
1812  const unsigned int y = get_global_id(1);
1813  const unsigned int columns = get_global_size(0);
1814  __global CLQuantum *p = image + getPixelIndex(number_channels, columns, x, y);
1815 
1816  float
1817  blue,
1818  green,
1819  red;
1820 
1821  red=getPixelRed(p);
1822  green=getPixelGreen(p);
1823  blue=getPixelBlue(p);
1824 
1825  CLQuantum intensity=ClampToQuantum(GetPixelIntensity(colorspace, method, red, green, blue));
1826 
1827  setPixelRed(p,intensity);
1828  setPixelGreen(p,intensity);
1829  setPixelBlue(p,intensity);
1830  }
1831  )
1832 
1833 /*
1834 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1835 % %
1836 % %
1837 % %
1838 % L o c a l C o n t r a s t %
1839 % %
1840 % %
1841 % %
1842 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1843 */
1844 
1845  STRINGIFY(
1846 
1847  __kernel void LocalContrastBlurRow(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *tmpImage,
1848  const int radius,
1849  const int imageWidth,
1850  const int imageHeight)
1851  {
1852  const float4 RGB = ((float4)(0.2126f, 0.7152f, 0.0722f, 0.0f));
1853 
1854  int x = get_local_id(0);
1855  int y = get_global_id(1);
1856 
1857  if ((x >= imageWidth) || (y >= imageHeight))
1858  return;
1859 
1860  global CLPixelType *src = srcImage + y * imageWidth;
1861 
1862  for (int i = x; i < imageWidth; i += get_local_size(0)) {
1863  float sum = 0.0f;
1864  float weight = 1.0f;
1865 
1866  int j = i - radius;
1867  while ((j + 7) < i) {
1868  for (int k = 0; k < 8; ++k) // Unroll 8x
1869  sum += (weight + k) * dot(RGB, convert_float4(src[mirrorBottom(j+k)]));
1870  weight += 8.0f;
1871  j+=8;
1872  }
1873  while (j < i) {
1874  sum += weight * dot(RGB, convert_float4(src[mirrorBottom(j)]));
1875  weight += 1.0f;
1876  ++j;
1877  }
1878 
1879  while ((j + 7) < radius + i) {
1880  for (int k = 0; k < 8; ++k) // Unroll 8x
1881  sum += (weight - k) * dot(RGB, convert_float4(src[mirrorTop(j + k, imageWidth)]));
1882  weight -= 8.0f;
1883  j+=8;
1884  }
1885  while (j < radius + i) {
1886  sum += weight * dot(RGB, convert_float4(src[mirrorTop(j, imageWidth)]));
1887  weight -= 1.0f;
1888  ++j;
1889  }
1890 
1891  tmpImage[i + y * imageWidth] = sum / ((radius + 1) * (radius + 1));
1892  }
1893  }
1894  )
1895 
1896  STRINGIFY(
1897  __kernel void LocalContrastBlurApplyColumn(__global CLPixelType *srcImage, __global CLPixelType *dstImage, __global float *blurImage,
1898  const int radius,
1899  const float strength,
1900  const int imageWidth,
1901  const int imageHeight)
1902  {
1903  const float4 RGB = (float4)(0.2126f, 0.7152f, 0.0722f, 0.0f);
1904 
1905  int x = get_global_id(0);
1906  int y = get_global_id(1);
1907 
1908  if ((x >= imageWidth) || (y >= imageHeight))
1909  return;
1910 
1911  global float *src = blurImage + x;
1912 
1913  float sum = 0.0f;
1914  float weight = 1.0f;
1915 
1916  int j = y - radius;
1917  while ((j + 7) < y) {
1918  for (int k = 0; k < 8; ++k) // Unroll 8x
1919  sum += (weight + k) * src[mirrorBottom(j+k) * imageWidth];
1920  weight += 8.0f;
1921  j+=8;
1922  }
1923  while (j < y) {
1924  sum += weight * src[mirrorBottom(j) * imageWidth];
1925  weight += 1.0f;
1926  ++j;
1927  }
1928 
1929  while ((j + 7) < radius + y) {
1930  for (int k = 0; k < 8; ++k) // Unroll 8x
1931  sum += (weight - k) * src[mirrorTop(j + k, imageHeight) * imageWidth];
1932  weight -= 8.0f;
1933  j+=8;
1934  }
1935  while (j < radius + y) {
1936  sum += weight * src[mirrorTop(j, imageHeight) * imageWidth];
1937  weight -= 1.0f;
1938  ++j;
1939  }
1940 
1941  CLPixelType pixel = srcImage[x + y * imageWidth];
1942  float srcVal = dot(RGB, convert_float4(pixel));
1943  float mult = (srcVal - (sum / ((radius + 1) * (radius + 1)))) * (strength / 100.0f);
1944  mult = (srcVal + mult) / srcVal;
1945 
1946  pixel.x = ClampToQuantum(pixel.x * mult);
1947  pixel.y = ClampToQuantum(pixel.y * mult);
1948  pixel.z = ClampToQuantum(pixel.z * mult);
1949 
1950  dstImage[x + y * imageWidth] = pixel;
1951  }
1952  )
1953 
1954 /*
1955 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1956 % %
1957 % %
1958 % %
1959 % M o d u l a t e %
1960 % %
1961 % %
1962 % %
1963 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
1964 */
1965 
1966  STRINGIFY(
1967 
1968  static inline void ConvertRGBToHSL(const CLQuantum red,const CLQuantum green, const CLQuantum blue,
1969  float *hue, float *saturation, float *lightness)
1970  {
1971  float
1972  c,
1973  tmax,
1974  tmin;
1975 
1976  /*
1977  Convert RGB to HSL colorspace.
1978  */
1979  tmax=MagickMax(QuantumScale*red,MagickMax(QuantumScale*green, QuantumScale*blue));
1980  tmin=MagickMin(QuantumScale*red,MagickMin(QuantumScale*green, QuantumScale*blue));
1981 
1982  c=tmax-tmin;
1983 
1984  *lightness=(tmax+tmin)/2.0;
1985  if (c <= 0.0)
1986  {
1987  *hue=0.0;
1988  *saturation=0.0;
1989  return;
1990  }
1991 
1992  if (tmax == (QuantumScale*red))
1993  {
1994  *hue=(QuantumScale*green-QuantumScale*blue)/c;
1995  if ((QuantumScale*green) < (QuantumScale*blue))
1996  *hue+=6.0;
1997  }
1998  else
1999  if (tmax == (QuantumScale*green))
2000  *hue=2.0+(QuantumScale*blue-QuantumScale*red)/c;
2001  else
2002  *hue=4.0+(QuantumScale*red-QuantumScale*green)/c;
2003 
2004  *hue*=60.0/360.0;
2005  if (*lightness <= 0.5)
2006  *saturation=c/(2.0*(*lightness));
2007  else
2008  *saturation=c/(2.0-2.0*(*lightness));
2009  }
2010 
2011  static inline void ConvertHSLToRGB(const float hue,const float saturation, const float lightness,
2012  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2013  {
2014  float
2015  b,
2016  c,
2017  g,
2018  h,
2019  tmin,
2020  r,
2021  x;
2022 
2023  /*
2024  Convert HSL to RGB colorspace.
2025  */
2026  h=hue*360.0;
2027  if (lightness <= 0.5)
2028  c=2.0*lightness*saturation;
2029  else
2030  c=(2.0-2.0*lightness)*saturation;
2031  tmin=lightness-0.5*c;
2032  h-=360.0*floor(h/360.0);
2033  h/=60.0;
2034  x=c*(1.0-fabs(h-2.0*floor(h/2.0)-1.0));
2035  switch ((int) floor(h) % 6)
2036  {
2037  case 0:
2038  {
2039  r=tmin+c;
2040  g=tmin+x;
2041  b=tmin;
2042  break;
2043  }
2044  case 1:
2045  {
2046  r=tmin+x;
2047  g=tmin+c;
2048  b=tmin;
2049  break;
2050  }
2051  case 2:
2052  {
2053  r=tmin;
2054  g=tmin+c;
2055  b=tmin+x;
2056  break;
2057  }
2058  case 3:
2059  {
2060  r=tmin;
2061  g=tmin+x;
2062  b=tmin+c;
2063  break;
2064  }
2065  case 4:
2066  {
2067  r=tmin+x;
2068  g=tmin;
2069  b=tmin+c;
2070  break;
2071  }
2072  case 5:
2073  {
2074  r=tmin+c;
2075  g=tmin;
2076  b=tmin+x;
2077  break;
2078  }
2079  default:
2080  {
2081  r=0.0;
2082  g=0.0;
2083  b=0.0;
2084  }
2085  }
2086  *red=ClampToQuantum(QuantumRange*r);
2087  *green=ClampToQuantum(QuantumRange*g);
2088  *blue=ClampToQuantum(QuantumRange*b);
2089  }
2090 
2091  static inline void ModulateHSL(const float percent_hue, const float percent_saturation,const float percent_lightness,
2092  CLQuantum *red,CLQuantum *green,CLQuantum *blue)
2093  {
2094  float
2095  hue,
2096  lightness,
2097  saturation;
2098 
2099  /*
2100  Increase or decrease color lightness, saturation, or hue.
2101  */
2102  ConvertRGBToHSL(*red,*green,*blue,&hue,&saturation,&lightness);
2103  hue+=0.5*(0.01*percent_hue-1.0);
2104  while (hue < 0.0)
2105  hue+=1.0;
2106  while (hue >= 1.0)
2107  hue-=1.0;
2108  saturation*=0.01*percent_saturation;
2109  lightness*=0.01*percent_lightness;
2110  ConvertHSLToRGB(hue,saturation,lightness,red,green,blue);
2111  }
2112 
2113  __kernel void Modulate(__global CLPixelType *im,
2114  const float percent_brightness,
2115  const float percent_hue,
2116  const float percent_saturation,
2117  const int colorspace)
2118  {
2119 
2120  const int x = get_global_id(0);
2121  const int y = get_global_id(1);
2122  const int columns = get_global_size(0);
2123  const int c = x + y * columns;
2124 
2125  CLPixelType pixel = im[c];
2126 
2127  CLQuantum
2128  blue,
2129  green,
2130  red;
2131 
2132  red=getRed(pixel);
2133  green=getGreen(pixel);
2134  blue=getBlue(pixel);
2135 
2136  switch (colorspace)
2137  {
2138  case HSLColorspace:
2139  default:
2140  {
2141  ModulateHSL(percent_hue, percent_saturation, percent_brightness,
2142  &red, &green, &blue);
2143  }
2144 
2145  }
2146 
2147  CLPixelType filteredPixel;
2148 
2149  setRed(&filteredPixel, red);
2150  setGreen(&filteredPixel, green);
2151  setBlue(&filteredPixel, blue);
2152  filteredPixel.w = pixel.w;
2153 
2154  im[c] = filteredPixel;
2155  }
2156  )
2157 
2158 /*
2159 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2160 % %
2161 % %
2162 % %
2163 % M o t i o n B l u r %
2164 % %
2165 % %
2166 % %
2167 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2168 */
2169 
2170  STRINGIFY(
2171  __kernel
2172  void MotionBlur(const __global CLPixelType *input, __global CLPixelType *output,
2173  const unsigned int imageWidth, const unsigned int imageHeight,
2174  const __global float *filter, const unsigned int width, const __global int2* offset,
2175  const float4 bias,
2176  const ChannelType channel, const unsigned int matte) {
2177 
2178  int2 currentPixel;
2179  currentPixel.x = get_global_id(0);
2180  currentPixel.y = get_global_id(1);
2181 
2182  if (currentPixel.x >= imageWidth
2183  || currentPixel.y >= imageHeight)
2184  return;
2185 
2186  float4 pixel;
2187  pixel.x = (float)bias.x;
2188  pixel.y = (float)bias.y;
2189  pixel.z = (float)bias.z;
2190  pixel.w = (float)bias.w;
2191 
2192  if (((channel & AlphaChannel) == 0) || (matte == 0)) {
2193 
2194  for (int i = 0; i < width; i++) {
2195  // only support EdgeVirtualPixelMethod through ClampToCanvas
2196  // TODO: implement other virtual pixel method
2197  int2 samplePixel = currentPixel + offset[i];
2198  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2199  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2200  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2201 
2202  pixel.x += (filter[i] * (float)samplePixelValue.x);
2203  pixel.y += (filter[i] * (float)samplePixelValue.y);
2204  pixel.z += (filter[i] * (float)samplePixelValue.z);
2205  pixel.w += (filter[i] * (float)samplePixelValue.w);
2206  }
2207 
2208  CLPixelType outputPixel;
2209  outputPixel.x = ClampToQuantum(pixel.x);
2210  outputPixel.y = ClampToQuantum(pixel.y);
2211  outputPixel.z = ClampToQuantum(pixel.z);
2212  outputPixel.w = ClampToQuantum(pixel.w);
2213  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2214  }
2215  else {
2216 
2217  float gamma = 0.0f;
2218  for (int i = 0; i < width; i++) {
2219  // only support EdgeVirtualPixelMethod through ClampToCanvas
2220  // TODO: implement other virtual pixel method
2221  int2 samplePixel = currentPixel + offset[i];
2222  samplePixel.x = ClampToCanvas(samplePixel.x, imageWidth);
2223  samplePixel.y = ClampToCanvas(samplePixel.y, imageHeight);
2224 
2225  CLPixelType samplePixelValue = input[ samplePixel.y * imageWidth + samplePixel.x];
2226 
2227  float alpha = QuantumScale*samplePixelValue.w;
2228  float k = filter[i];
2229  pixel.x = pixel.x + k * alpha * samplePixelValue.x;
2230  pixel.y = pixel.y + k * alpha * samplePixelValue.y;
2231  pixel.z = pixel.z + k * alpha * samplePixelValue.z;
2232 
2233  pixel.w += k * alpha * samplePixelValue.w;
2234 
2235  gamma+=k*alpha;
2236  }
2237  gamma = PerceptibleReciprocal(gamma);
2238  pixel.xyz = gamma*pixel.xyz;
2239 
2240  CLPixelType outputPixel;
2241  outputPixel.x = ClampToQuantum(pixel.x);
2242  outputPixel.y = ClampToQuantum(pixel.y);
2243  outputPixel.z = ClampToQuantum(pixel.z);
2244  outputPixel.w = ClampToQuantum(pixel.w);
2245  output[currentPixel.y * imageWidth + currentPixel.x] = outputPixel;
2246  }
2247  }
2248  )
2249 
2250 /*
2251 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2252 % %
2253 % %
2254 % %
2255 % R e s i z e %
2256 % %
2257 % %
2258 % %
2259 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2260 */
2261 
2262  STRINGIFY(
2263  // Based on Box from resize.c
2264  float BoxResizeFilter(const float x)
2265  {
2266  return 1.0f;
2267  }
2268  )
2269 
2270  STRINGIFY(
2271  // Based on CubicBC from resize.c
2272  float CubicBC(const float x,const __global float* resizeFilterCoefficients)
2273  {
2274  /*
2275  Cubic Filters using B,C determined values:
2276  Mitchell-Netravali B = 1/3 C = 1/3 "Balanced" cubic spline filter
2277  Catmull-Rom B = 0 C = 1/2 Interpolatory and exact on linears
2278  Spline B = 1 C = 0 B-Spline Gaussian approximation
2279  Hermite B = 0 C = 0 B-Spline interpolator
2280 
2281  See paper by Mitchell and Netravali, Reconstruction Filters in Computer
2282  Graphics Computer Graphics, Volume 22, Number 4, August 1988
2283  http://www.cs.utexas.edu/users/fussell/courses/cs384g/lectures/mitchell/
2284  Mitchell.pdf.
2285 
2286  Coefficents are determined from B,C values:
2287  P0 = ( 6 - 2*B )/6 = coeff[0]
2288  P1 = 0
2289  P2 = (-18 +12*B + 6*C )/6 = coeff[1]
2290  P3 = ( 12 - 9*B - 6*C )/6 = coeff[2]
2291  Q0 = ( 8*B +24*C )/6 = coeff[3]
2292  Q1 = ( -12*B -48*C )/6 = coeff[4]
2293  Q2 = ( 6*B +30*C )/6 = coeff[5]
2294  Q3 = ( - 1*B - 6*C )/6 = coeff[6]
2295 
2296  which are used to define the filter:
2297 
2298  P0 + P1*x + P2*x^2 + P3*x^3 0 <= x < 1
2299  Q0 + Q1*x + Q2*x^2 + Q3*x^3 1 <= x < 2
2300 
2301  which ensures function is continuous in value and derivative (slope).
2302  */
2303  if (x < 1.0)
2304  return(resizeFilterCoefficients[0]+x*(x*
2305  (resizeFilterCoefficients[1]+x*resizeFilterCoefficients[2])));
2306  if (x < 2.0)
2307  return(resizeFilterCoefficients[3]+x*(resizeFilterCoefficients[4]+x*
2308  (resizeFilterCoefficients[5]+x*resizeFilterCoefficients[6])));
2309  return(0.0);
2310  }
2311  )
2312 
2313  STRINGIFY(
2314  float Sinc(const float x)
2315  {
2316  if (x != 0.0f)
2317  {
2318  const float alpha=(float) (MagickPI*x);
2319  return sinpi(x)/alpha;
2320  }
2321  return(1.0f);
2322  }
2323  )
2324 
2325  STRINGIFY(
2326  float Triangle(const float x)
2327  {
2328  /*
2329  1st order (linear) B-Spline, bilinear interpolation, Tent 1D filter, or
2330  a Bartlett 2D Cone filter. Also used as a Bartlett Windowing function
2331  for Sinc().
2332  */
2333  return ((x<1.0f)?(1.0f-x):0.0f);
2334  }
2335  )
2336 
2337 
2338  STRINGIFY(
2339  float Hann(const float x)
2340  {
2341  /*
2342  Cosine window function:
2343  0.5+0.5*cos(pi*x).
2344  */
2345  const float cosine=cos((MagickPI*x));
2346  return(0.5f+0.5f*cosine);
2347  }
2348  )
2349 
2350  STRINGIFY(
2351  float Hamming(const float x)
2352  {
2353  /*
2354  Offset cosine window function:
2355  .54 + .46 cos(pi x).
2356  */
2357  const float cosine=cos((MagickPI*x));
2358  return(0.54f+0.46f*cosine);
2359  }
2360  )
2361 
2362  STRINGIFY(
2363  float Blackman(const float x)
2364  {
2365  /*
2366  Blackman: 2nd order cosine windowing function:
2367  0.42 + 0.5 cos(pi x) + 0.08 cos(2pi x)
2368 
2369  Refactored by Chantal Racette and Nicolas Robidoux to one trig call and
2370  five flops.
2371  */
2372  const float cosine=cos((MagickPI*x));
2373  return(0.34f+cosine*(0.5f+cosine*0.16f));
2374  }
2375  )
2376 
2377  STRINGIFY(
2378  static inline float applyResizeFilter(const float x, const ResizeWeightingFunctionType filterType, const __global float* filterCoefficients)
2379  {
2380  switch (filterType)
2381  {
2382  /* Call Sinc even for SincFast to get better precision on GPU
2383  and to avoid thread divergence. Sinc is pretty fast on GPU anyway...*/
2384  case SincWeightingFunction:
2386  return Sinc(x);
2388  return CubicBC(x,filterCoefficients);
2389  case BoxWeightingFunction:
2390  return BoxResizeFilter(x);
2392  return Triangle(x);
2393  case HannWeightingFunction:
2394  return Hann(x);
2396  return Hamming(x);
2398  return Blackman(x);
2399 
2400  default:
2401  return 0.0f;
2402  }
2403  }
2404  )
2405 
2406 
2407  STRINGIFY(
2408  static inline float getResizeFilterWeight(const __global float* resizeFilterCubicCoefficients, const ResizeWeightingFunctionType resizeFilterType
2409  , const ResizeWeightingFunctionType resizeWindowType
2410  , const float resizeFilterScale, const float resizeWindowSupport, const float resizeFilterBlur, const float x)
2411  {
2412  float scale;
2413  float xBlur = fabs(x/resizeFilterBlur);
2414  if (resizeWindowSupport < MagickEpsilon
2415  || resizeWindowType == BoxWeightingFunction)
2416  {
2417  scale = 1.0f;
2418  }
2419  else
2420  {
2421  scale = resizeFilterScale;
2422  scale = applyResizeFilter(xBlur*scale, resizeWindowType, resizeFilterCubicCoefficients);
2423  }
2424  float weight = scale * applyResizeFilter(xBlur, resizeFilterType, resizeFilterCubicCoefficients);
2425  return weight;
2426  }
2427 
2428  )
2429 
2430  ;
2431  const char *accelerateKernels2 =
2432 
2433  STRINGIFY(
2434 
2435  static inline unsigned int getNumWorkItemsPerPixel(const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2436  return (numWorkItems/pixelPerWorkgroup);
2437  }
2438 
2439  // returns the index of the pixel for the current workitem to compute.
2440  // returns -1 if this workitem doesn't need to participate in any computation
2441  static inline int pixelToCompute(const unsigned itemID, const unsigned int pixelPerWorkgroup, const unsigned int numWorkItems) {
2442  const unsigned int numWorkItemsPerPixel = getNumWorkItemsPerPixel(pixelPerWorkgroup, numWorkItems);
2443  int pixelIndex = itemID/numWorkItemsPerPixel;
2444  pixelIndex = (pixelIndex<pixelPerWorkgroup)?pixelIndex:-1;
2445  return pixelIndex;
2446  }
2447 
2448  )
2449 
2450  STRINGIFY(
2451  __kernel __attribute__((reqd_work_group_size(256, 1, 1)))
2452  void ResizeHorizontalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2453  const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2454  const unsigned int filteredColumns, const unsigned int filteredRows, const float xFactor,
2455  const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2456  const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2457  const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2458  const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2459  __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2460  {
2461  // calculate the range of resized image pixels computed by this workgroup
2462  const unsigned int startX = get_group_id(0)*pixelPerWorkgroup;
2463  const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup,filteredColumns);
2464  const unsigned int actualNumPixelToCompute = stopX - startX;
2465 
2466  // calculate the range of input image pixels to cache
2467  float scale = MagickMax(1.0f/xFactor+MagickEpsilon ,1.0f);
2468  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2469  scale = PerceptibleReciprocal(scale);
2470 
2471  const int cacheRangeStartX = MagickMax((int)((startX+0.5f)/xFactor+MagickEpsilon-support+0.5f),(int)(0));
2472  const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
2473 
2474  // cache the input pixels into local memory
2475  const unsigned int y = get_global_id(1);
2476  const unsigned int pos = getPixelIndex(number_channels, inputColumns, cacheRangeStartX, y);
2477  const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * number_channels;
2478  event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
2479  wait_group_events(1, &e);
2480 
2481  unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2482  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2483  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2484  {
2485  const unsigned int chunkStartX = startX + chunk*pixelChunkSize;
2486  const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
2487  const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
2488 
2489  // determine which resized pixel computed by this workitem
2490  const unsigned int itemID = get_local_id(0);
2491  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
2492 
2493  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
2494 
2495  float4 filteredPixel = (float4)0.0f;
2496  float density = 0.0f;
2497  float gamma = 0.0f;
2498  // -1 means this workitem doesn't participate in the computation
2499  if (pixelIndex != -1)
2500  {
2501  // x coordinated of the resized pixel computed by this workitem
2502  const int x = chunkStartX + pixelIndex;
2503 
2504  // calculate how many steps required for this pixel
2505  const float bisect = (x+0.5)/xFactor+MagickEpsilon;
2506  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2507  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputColumns);
2508  const unsigned int n = stop - start;
2509 
2510  // calculate how many steps this workitem will contribute
2511  unsigned int numStepsPerWorkItem = n / numItems;
2512  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2513 
2514  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2515  if (startStep < n)
2516  {
2517  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2518 
2519  unsigned int cacheIndex = start+startStep-cacheRangeStartX;
2520  for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2521  {
2522  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2523  (ResizeWeightingFunctionType) resizeFilterType,
2524  (ResizeWeightingFunctionType) resizeWindowType,
2525  resizeFilterScale, resizeFilterWindowSupport,
2526  resizeFilterBlur, scale*(start + i - bisect + 0.5));
2527 
2528  float4 cp = (float4)0.0f;
2529 
2530  __local CLQuantum *p = inputImageCache + (cacheIndex*number_channels);
2531  cp.x = (float) *(p);
2532  if (number_channels > 2)
2533  {
2534  cp.y = (float) *(p + 1);
2535  cp.z = (float) *(p + 2);
2536  }
2537 
2538  if (alpha_index != 0)
2539  {
2540  cp.w = (float) *(p + alpha_index);
2541 
2542  float alpha = weight * QuantumScale * cp.w;
2543 
2544  filteredPixel.x += alpha * cp.x;
2545  filteredPixel.y += alpha * cp.y;
2546  filteredPixel.z += alpha * cp.z;
2547  filteredPixel.w += weight * cp.w;
2548  gamma += alpha;
2549  }
2550  else
2551  filteredPixel += ((float4) weight)*cp;
2552 
2553  density += weight;
2554  }
2555  }
2556  }
2557 
2558  // initialize the accumulators to zero
2559  if (itemID < actualNumPixelInThisChunk) {
2560  outputPixelCache[itemID] = (float4)0.0f;
2561  densityCache[itemID] = 0.0f;
2562  if (alpha_index != 0)
2563  gammaCache[itemID] = 0.0f;
2564  }
2565  barrier(CLK_LOCAL_MEM_FENCE);
2566 
2567  // accumulatte the filtered pixel value and the density
2568  for (unsigned int i = 0; i < numItems; i++) {
2569  if (pixelIndex != -1) {
2570  if (itemID%numItems == i) {
2571  outputPixelCache[pixelIndex]+=filteredPixel;
2572  densityCache[pixelIndex]+=density;
2573  if (alpha_index != 0)
2574  gammaCache[pixelIndex]+=gamma;
2575  }
2576  }
2577  barrier(CLK_LOCAL_MEM_FENCE);
2578  }
2579 
2580  if (itemID < actualNumPixelInThisChunk)
2581  {
2582  float4 filteredPixel = outputPixelCache[itemID];
2583 
2584  float gamma = 0.0f;
2585  if (alpha_index != 0)
2586  gamma = gammaCache[itemID];
2587 
2588  float density = densityCache[itemID];
2589  if ((density != 0.0f) && (density != 1.0f))
2590  {
2591  density = PerceptibleReciprocal(density);
2592  filteredPixel *= (float4) density;
2593  if (alpha_index != 0)
2594  gamma *= density;
2595  }
2596 
2597  if (alpha_index != 0)
2598  {
2599  gamma = PerceptibleReciprocal(gamma);
2600  filteredPixel.x *= gamma;
2601  filteredPixel.y *= gamma;
2602  filteredPixel.z *= gamma;
2603  }
2604 
2605  WriteAllChannels(filteredImage, number_channels, filteredColumns, chunkStartX + itemID, y, filteredPixel);
2606  }
2607  }
2608  }
2609  )
2610 
2611 
2612  STRINGIFY(
2613  __kernel __attribute__((reqd_work_group_size(1, 256, 1)))
2614  void ResizeVerticalFilter(const __global CLQuantum *inputImage, const unsigned int number_channels,
2615  const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum *filteredImage,
2616  const unsigned int filteredColumns, const unsigned int filteredRows, const float yFactor,
2617  const int resizeFilterType, const int resizeWindowType, const __global float *resizeFilterCubicCoefficients,
2618  const float resizeFilterScale, const float resizeFilterSupport, const float resizeFilterWindowSupport,
2619  const float resizeFilterBlur, __local CLQuantum *inputImageCache, const int numCachedPixels,
2620  const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
2621  __local float4 *outputPixelCache, __local float *densityCache, __local float *gammaCache)
2622  {
2623  // calculate the range of resized image pixels computed by this workgroup
2624  const unsigned int startY = get_group_id(1)*pixelPerWorkgroup;
2625  const unsigned int stopY = MagickMin(startY + pixelPerWorkgroup,filteredRows);
2626  const unsigned int actualNumPixelToCompute = stopY - startY;
2627 
2628  // calculate the range of input image pixels to cache
2629  float scale = MagickMax(1.0f/yFactor+MagickEpsilon ,1.0f);
2630  const float support = MagickMax(scale*resizeFilterSupport,0.5f);
2631  scale = PerceptibleReciprocal(scale);
2632 
2633  const int cacheRangeStartY = MagickMax((int)((startY+0.5f)/yFactor+MagickEpsilon-support+0.5f),(int)(0));
2634  const int cacheRangeEndY = MagickMin((int)(cacheRangeStartY + numCachedPixels), (int)inputRows);
2635 
2636  // cache the input pixels into local memory
2637  const unsigned int x = get_global_id(0);
2638  unsigned int pos = getPixelIndex(number_channels, inputColumns, x, cacheRangeStartY);
2639  unsigned int rangeLength = cacheRangeEndY-cacheRangeStartY;
2640  unsigned int stride = inputColumns * number_channels;
2641  for (unsigned int i = 0; i < number_channels; i++)
2642  {
2643  event_t e = async_work_group_strided_copy(inputImageCache + (rangeLength*i), inputImage+pos+i, rangeLength, stride, 0);
2644  wait_group_events(1,&e);
2645  }
2646 
2647  unsigned int alpha_index = (number_channels == 4) || (number_channels == 2) ? number_channels - 1 : 0;
2648  unsigned int totalNumChunks = (actualNumPixelToCompute+pixelChunkSize-1)/pixelChunkSize;
2649  for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
2650  {
2651  const unsigned int chunkStartY = startY + chunk*pixelChunkSize;
2652  const unsigned int chunkStopY = MagickMin(chunkStartY + pixelChunkSize, stopY);
2653  const unsigned int actualNumPixelInThisChunk = chunkStopY - chunkStartY;
2654 
2655  // determine which resized pixel computed by this workitem
2656  const unsigned int itemID = get_local_id(1);
2657  const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(1));
2658 
2659  const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(1));
2660 
2661  float4 filteredPixel = (float4)0.0f;
2662  float density = 0.0f;
2663  float gamma = 0.0f;
2664  // -1 means this workitem doesn't participate in the computation
2665  if (pixelIndex != -1)
2666  {
2667  // x coordinated of the resized pixel computed by this workitem
2668  const int y = chunkStartY + pixelIndex;
2669 
2670  // calculate how many steps required for this pixel
2671  const float bisect = (y+0.5)/yFactor+MagickEpsilon;
2672  const unsigned int start = (unsigned int)MagickMax(bisect-support+0.5f,0.0f);
2673  const unsigned int stop = (unsigned int)MagickMin(bisect+support+0.5f,(float)inputRows);
2674  const unsigned int n = stop - start;
2675 
2676  // calculate how many steps this workitem will contribute
2677  unsigned int numStepsPerWorkItem = n / numItems;
2678  numStepsPerWorkItem += ((numItems*numStepsPerWorkItem)==n?0:1);
2679 
2680  const unsigned int startStep = (itemID%numItems)*numStepsPerWorkItem;
2681  if (startStep < n)
2682  {
2683  const unsigned int stopStep = MagickMin(startStep+numStepsPerWorkItem, n);
2684 
2685  unsigned int cacheIndex = start+startStep-cacheRangeStartY;
2686  for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
2687  {
2688  float weight = getResizeFilterWeight(resizeFilterCubicCoefficients,
2689  (ResizeWeightingFunctionType) resizeFilterType,
2690  (ResizeWeightingFunctionType) resizeWindowType,
2691  resizeFilterScale, resizeFilterWindowSupport,
2692  resizeFilterBlur, scale*(start + i - bisect + 0.5));
2693 
2694  float4 cp = (float4)0.0f;
2695 
2696  __local CLQuantum *p = inputImageCache + cacheIndex;
2697  cp.x = (float) *(p);
2698  if (number_channels > 2)
2699  {
2700  cp.y = (float) *(p + rangeLength);
2701  cp.z = (float) *(p + (rangeLength * 2));
2702  }
2703 
2704  if (alpha_index != 0)
2705  {
2706  cp.w = (float) *(p + (rangeLength * alpha_index));
2707 
2708  float alpha = weight * QuantumScale * cp.w;
2709 
2710  filteredPixel.x += alpha * cp.x;
2711  filteredPixel.y += alpha * cp.y;
2712  filteredPixel.z += alpha * cp.z;
2713  filteredPixel.w += weight * cp.w;
2714  gamma += alpha;
2715  }
2716  else
2717  filteredPixel += ((float4) weight)*cp;
2718 
2719  density += weight;
2720  }
2721  }
2722  }
2723 
2724  // initialize the accumulators to zero
2725  if (itemID < actualNumPixelInThisChunk) {
2726  outputPixelCache[itemID] = (float4)0.0f;
2727  densityCache[itemID] = 0.0f;
2728  if (alpha_index != 0)
2729  gammaCache[itemID] = 0.0f;
2730  }
2731  barrier(CLK_LOCAL_MEM_FENCE);
2732 
2733  // accumulatte the filtered pixel value and the density
2734  for (unsigned int i = 0; i < numItems; i++) {
2735  if (pixelIndex != -1) {
2736  if (itemID%numItems == i) {
2737  outputPixelCache[pixelIndex]+=filteredPixel;
2738  densityCache[pixelIndex]+=density;
2739  if (alpha_index != 0)
2740  gammaCache[pixelIndex]+=gamma;
2741  }
2742  }
2743  barrier(CLK_LOCAL_MEM_FENCE);
2744  }
2745 
2746  if (itemID < actualNumPixelInThisChunk)
2747  {
2748  float4 filteredPixel = outputPixelCache[itemID];
2749 
2750  float gamma = 0.0f;
2751  if (alpha_index != 0)
2752  gamma = gammaCache[itemID];
2753 
2754  float density = densityCache[itemID];
2755  if ((density != 0.0f) && (density != 1.0f))
2756  {
2757  density = PerceptibleReciprocal(density);
2758  filteredPixel *= (float4) density;
2759  if (alpha_index != 0)
2760  gamma *= density;
2761  }
2762 
2763  if (alpha_index != 0)
2764  {
2765  gamma = PerceptibleReciprocal(gamma);
2766  filteredPixel.x *= gamma;
2767  filteredPixel.y *= gamma;
2768  filteredPixel.z *= gamma;
2769  }
2770 
2771  WriteAllChannels(filteredImage, number_channels, filteredColumns, x, chunkStartY + itemID, filteredPixel);
2772  }
2773  }
2774  }
2775  )
2776 
2777 /*
2778 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2779 % %
2780 % %
2781 % %
2782 % R o t a t i o n a l B l u r %
2783 % %
2784 % %
2785 % %
2786 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2787 */
2788 
2789  STRINGIFY(
2790  __kernel void RotationalBlur(const __global CLQuantum *image,
2791  const unsigned int number_channels,const unsigned int channel,
2792  const float2 blurCenter,__constant float *cos_theta,
2793  __constant float *sin_theta,const unsigned int cossin_theta_size,
2794  __global CLQuantum *filteredImage)
2795  {
2796  const int x = get_global_id(0);
2797  const int y = get_global_id(1);
2798  const int columns = get_global_size(0);
2799  const int rows = get_global_size(1);
2800  unsigned int step = 1;
2801  float center_x = (float) x - blurCenter.x;
2802  float center_y = (float) y - blurCenter.y;
2803  float radius = hypot(center_x, center_y);
2804 
2805  float blur_radius = hypot(blurCenter.x, blurCenter.y);
2806 
2807  if (radius > MagickEpsilon)
2808  {
2809  step = (unsigned int) (blur_radius / radius);
2810  if (step == 0)
2811  step = 1;
2812  if (step >= cossin_theta_size)
2813  step = cossin_theta_size-1;
2814  }
2815 
2816  float4 result = 0.0f;
2817  float normalize = 0.0f;
2818  float gamma = 0.0f;
2819 
2820  for (unsigned int i=0; i<cossin_theta_size; i+=step)
2821  {
2822  int cx = ClampToCanvas(blurCenter.x+center_x*cos_theta[i]-center_y*sin_theta[i]+0.5f,columns);
2823  int cy = ClampToCanvas(blurCenter.y+center_x*sin_theta[i]+center_y*cos_theta[i]+0.5f,rows);
2824 
2825  float4 pixel = ReadAllChannels(image, number_channels, columns, cx, cy);
2826 
2827  if ((number_channels == 4) || (number_channels == 2))
2828  {
2829  float alpha = (float)(QuantumScale*pixel.w);
2830 
2831  gamma += alpha;
2832 
2833  result.x += alpha * pixel.x;
2834  result.y += alpha * pixel.y;
2835  result.z += alpha * pixel.z;
2836  result.w += pixel.w;
2837  }
2838  else
2839  result += pixel;
2840 
2841  normalize += 1.0f;
2842  }
2843 
2844  normalize = PerceptibleReciprocal(normalize);
2845 
2846  if ((number_channels == 4) || (number_channels == 2))
2847  {
2848  gamma = PerceptibleReciprocal(gamma);
2849  result.x *= gamma;
2850  result.y *= gamma;
2851  result.z *= gamma;
2852  result.w *= normalize;
2853  }
2854  else
2855  result *= normalize;
2856 
2857  WriteFloat4(filteredImage, number_channels, columns, x, y, channel, result);
2858  }
2859  )
2860 
2861 /*
2862 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2863 % %
2864 % %
2865 % %
2866 % U n s h a r p M a s k %
2867 % %
2868 % %
2869 % %
2870 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
2871 */
2872 
2873  STRINGIFY(
2874  __kernel void UnsharpMaskBlurColumn(const __global CLQuantum* image,
2875  const __global float4 *blurRowData,const unsigned int number_channels,
2876  const ChannelType channel,const unsigned int columns,
2877  const unsigned int rows,__local float4* cachedData,
2878  __local float* cachedFilter,const __global float *filter,
2879  const unsigned int width,const float gain, const float threshold,
2880  __global CLQuantum *filteredImage)
2881  {
2882  const unsigned int radius = (width-1)/2;
2883 
2884  // cache the pixel shared by the workgroup
2885  const int groupX = get_group_id(0);
2886  const int groupStartY = get_group_id(1)*get_local_size(1) - radius;
2887  const int groupStopY = (get_group_id(1)+1)*get_local_size(1) + radius;
2888 
2889  if ((groupStartY >= 0) && (groupStopY < rows))
2890  {
2891  event_t e = async_work_group_strided_copy(cachedData,
2892  blurRowData+groupStartY*columns+groupX,groupStopY-groupStartY,columns,0);
2893  wait_group_events(1,&e);
2894  }
2895  else
2896  {
2897  for (int i = get_local_id(1); i < (groupStopY - groupStartY); i+=get_local_size(1))
2898  cachedData[i] = blurRowData[ClampToCanvas(groupStartY+i,rows)*columns + groupX];
2899 
2900  barrier(CLK_LOCAL_MEM_FENCE);
2901  }
2902  // cache the filter as well
2903  event_t e = async_work_group_copy(cachedFilter,filter,width,0);
2904  wait_group_events(1,&e);
2905 
2906  // only do the work if this is not a patched item
2907  const int cy = get_global_id(1);
2908 
2909  if (cy < rows)
2910  {
2911  float4 blurredPixel = (float4) 0.0f;
2912 
2913  int i = 0;
2914 
2915  for ( ; i+7 < width; )
2916  {
2917  for (int j=0; j < 8; j++, i++)
2918  blurredPixel+=cachedFilter[i+j]*cachedData[i+j+get_local_id(1)];
2919  }
2920 
2921  for ( ; i < width; i++)
2922  blurredPixel+=cachedFilter[i]*cachedData[i+get_local_id(1)];
2923 
2924  float4 inputImagePixel = ReadFloat4(image,number_channels,columns,groupX,cy,channel);
2925  float4 outputPixel = inputImagePixel - blurredPixel;
2926 
2927  float quantumThreshold = QuantumRange*threshold;
2928 
2929  int4 mask = isless(fabs(2.0f*outputPixel), (float4)quantumThreshold);
2930  outputPixel = select(inputImagePixel + outputPixel * gain, inputImagePixel, mask);
2931 
2932  //write back
2933  WriteFloat4(filteredImage,number_channels,columns,groupX,cy,channel,outputPixel);
2934  }
2935  }
2936  )
2937 
2938  STRINGIFY(
2939  __kernel void UnsharpMask(const __global CLQuantum *image,const unsigned int number_channels,
2940  const ChannelType channel,__constant float *filter,const unsigned int width,
2941  const unsigned int columns,const unsigned int rows,__local float4 *pixels,
2942  const float gain,const float threshold,__global CLQuantum *filteredImage)
2943  {
2944  const unsigned int x = get_global_id(0);
2945  const unsigned int y = get_global_id(1);
2946 
2947  const unsigned int radius = (width - 1) / 2;
2948 
2949  int row = y - radius;
2950  int baseRow = get_group_id(1) * get_local_size(1) - radius;
2951  int endRow = (get_group_id(1) + 1) * get_local_size(1) + radius;
2952 
2953  while (row < endRow) {
2954  int srcy = (row < 0) ? -row : row; // mirror pad
2955  srcy = (srcy >= rows) ? (2 * rows - srcy - 1) : srcy;
2956 
2957  float4 value = 0.0f;
2958 
2959  int ix = x - radius;
2960  int i = 0;
2961 
2962  while (i + 7 < width) {
2963  for (int j = 0; j < 8; ++j) { // unrolled
2964  int srcx = ix + j;
2965  srcx = (srcx < 0) ? -srcx : srcx;
2966  srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2967  value += filter[i + j] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2968  }
2969  ix += 8;
2970  i += 8;
2971  }
2972 
2973  while (i < width) {
2974  int srcx = (ix < 0) ? -ix : ix; // mirror pad
2975  srcx = (srcx >= columns) ? (2 * columns - srcx - 1) : srcx;
2976  value += filter[i] * ReadFloat4(image, number_channels, columns, srcx, srcy, channel);
2977  ++i;
2978  ++ix;
2979  }
2980  pixels[(row - baseRow) * get_local_size(0) + get_local_id(0)] = value;
2981  row += get_local_size(1);
2982  }
2983 
2984  barrier(CLK_LOCAL_MEM_FENCE);
2985 
2986  const int px = get_local_id(0);
2987  const int py = get_local_id(1);
2988  const int prp = get_local_size(0);
2989  float4 value = (float4)(0.0f);
2990 
2991  int i = 0;
2992  while (i + 7 < width) {
2993  for (int j = 0; j < 8; ++j) // unrolled
2994  value += (float4)(filter[i]) * pixels[px + (py + i + j) * prp];
2995  i += 8;
2996  }
2997  while (i < width) {
2998  value += (float4)(filter[i]) * pixels[px + (py + i) * prp];
2999  ++i;
3000  }
3001 
3002  if ((x < columns) && (y < rows)) {
3003  float4 srcPixel = ReadFloat4(image, number_channels, columns, x, y, channel);
3004  float4 diff = srcPixel - value;
3005 
3006  float quantumThreshold = QuantumRange*threshold;
3007 
3008  int4 mask = isless(fabs(2.0f * diff), (float4)quantumThreshold);
3009  value = select(srcPixel + diff * gain, srcPixel, mask);
3010 
3011  WriteFloat4(filteredImage, number_channels, columns, x, y, channel, value);
3012  }
3013  }
3014  )
3015 
3016  STRINGIFY(
3017  __kernel __attribute__((reqd_work_group_size(64, 4, 1)))
3018  void WaveletDenoise(__global CLQuantum *srcImage,__global CLQuantum *dstImage,
3019  const unsigned int number_channels,const unsigned int max_channels,
3020  const float threshold,const int passes,const unsigned int imageWidth,
3021  const unsigned int imageHeight)
3022  {
3023  const int pad = (1 << (passes - 1));
3024  const int tileSize = 64;
3025  const int tileRowPixels = 64;
3026  const float noise[] = { 0.8002, 0.2735, 0.1202, 0.0585, 0.0291, 0.0152, 0.0080, 0.0044 };
3027 
3028  CLQuantum stage[48]; // 16 * 3 (we only need 3 channels)
3029 
3030  local float buffer[64 * 64];
3031 
3032  int srcx = (get_group_id(0) + get_global_offset(0) / tileSize) * (tileSize - 2 * pad) - pad + get_local_id(0);
3033  int srcy = (get_group_id(1) + get_global_offset(1) / 4) * (tileSize - 2 * pad) - pad;
3034 
3035  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3036  int pos = (mirrorTop(mirrorBottom(srcx), imageWidth) * number_channels) +
3037  (mirrorTop(mirrorBottom(srcy + i), imageHeight)) * imageWidth * number_channels;
3038 
3039  for (int channel = 0; channel < max_channels; ++channel)
3040  stage[(i / 4) + (16 * channel)] = srcImage[pos + channel];
3041  }
3042 
3043  for (int channel = 0; channel < max_channels; ++channel) {
3044  // Load LDS
3045  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3046  buffer[get_local_id(0) + i * tileRowPixels] = convert_float(stage[(i / 4) + (16 * channel)]);
3047 
3048  // Process
3049 
3050  float tmp[16];
3051  float accum[16];
3052  float pixel;
3053 
3054  for (int i = 0; i < 16; i++)
3055  accum[i]=0.0f;
3056 
3057  for (int pass = 0; pass < passes; ++pass) {
3058  const int radius = 1 << pass;
3059  const int x = get_local_id(0);
3060  const float thresh = threshold * noise[pass];
3061 
3062  // Apply horizontal hat
3063  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3064  const int offset = i * tileRowPixels;
3065  if (pass == 0)
3066  tmp[i / 4] = buffer[x + offset]; // snapshot input on first pass
3067  pixel = 0.5f * tmp[i / 4] + 0.25 * (buffer[mirrorBottom(x - radius) + offset] + buffer[mirrorTop(x + radius, tileSize) + offset]);
3068  barrier(CLK_LOCAL_MEM_FENCE);
3069  buffer[x + offset] = pixel;
3070  }
3071  barrier(CLK_LOCAL_MEM_FENCE);
3072 
3073  // Apply vertical hat
3074  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3075  pixel = 0.5f * buffer[x + i * tileRowPixels] + 0.25 * (buffer[x + mirrorBottom(i - radius) * tileRowPixels] + buffer[x + mirrorTop(i + radius, tileRowPixels) * tileRowPixels]);
3076  float delta = tmp[i / 4] - pixel;
3077  tmp[i / 4] = pixel; // hold output in tmp until all workitems are done
3078  if (delta < -thresh)
3079  delta += thresh;
3080  else if (delta > thresh)
3081  delta -= thresh;
3082  else
3083  delta = 0;
3084  accum[i / 4] += delta;
3085  }
3086  barrier(CLK_LOCAL_MEM_FENCE);
3087 
3088  if (pass < passes - 1)
3089  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3090  buffer[x + i * tileRowPixels] = tmp[i / 4]; // store lowpass for next pass
3091  else // last pass
3092  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3093  accum[i / 4] += tmp[i / 4]; // add the lowpass signal back to output
3094  barrier(CLK_LOCAL_MEM_FENCE);
3095  }
3096 
3097  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1))
3098  stage[(i / 4) + (16 * channel)] = ClampToQuantum(accum[i / 4]);
3099 
3100  barrier(CLK_LOCAL_MEM_FENCE);
3101  }
3102 
3103  // Write from stage to output
3104 
3105  if ((get_local_id(0) >= pad) && (get_local_id(0) < tileSize - pad) && (srcx >= 0) && (srcx < imageWidth)) {
3106  for (int i = get_local_id(1); i < tileSize; i += get_local_size(1)) {
3107  if ((i >= pad) && (i < tileSize - pad) && (srcy + i >= 0) && (srcy + i < imageHeight)) {
3108  int pos = (srcx * number_channels) + ((srcy + i) * (imageWidth * number_channels));
3109  for (int channel = 0; channel < max_channels; ++channel) {
3110  dstImage[pos + channel] = stage[(i / 4) + (16 * channel)];
3111  }
3112  }
3113  }
3114  }
3115  }
3116  )
3117 
3118  ;
3119 
3120 #endif // MAGICKCORE_OPENCL_SUPPORT
3121 
3122 #if defined(__cplusplus) || defined(c_plusplus)
3123 }
3124 #endif
3125 
3126 #endif // MAGICKCORE_ACCELERATE_KERNELS_PRIVATE_H
#define SigmaPoisson
MagickExport void ConvertRGBToHSL(const double red, const double green, const double blue, double *hue, double *saturation, double *lightness)
Definition: gem.c:1105
#define SigmaRandom
static double Hann(const double x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:322
PixelIntensityMethod
Definition: pixel.h:99
static double Blackman(const double x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:149
#define MagickPI
Definition: image-private.h:40
static void ModulateHSL(const double percent_hue, const double percent_saturation, const double percent_lightness, double *red, double *green, double *blue)
Definition: enhance.c:3526
#define MAGICKCORE_QUANTUM_DEPTH
Definition: magick-type.h:32
NoiseType
#define MagickEpsilon
Definition: magick-type.h:114
#define SigmaLaplacian
static Quantum ClampToQuantum(const MagickRealType quantum)
Definition: quantum.h:85
MagickPrivate void ConvertRGBToHSB(const double, const double, const double, double *, double *, double *)
#define SigmaUniform
static double PerceptibleReciprocal(const double x)
#define SigmaGaussian
static Quantum ApplyFunction(Quantum pixel, const MagickFunction function, const size_t number_parameters, const double *parameters, ExceptionInfo *exception)
Definition: statistic.c:982
static void Contrast(const int sign, double *red, double *green, double *blue)
Definition: enhance.c:1372
#define SigmaMultiplicativeGaussian
#define TauGaussian
static double Triangle(const double x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:543
#define QuantumScale
Definition: magick-type.h:119
ChannelType
Definition: pixel.h:33
MagickExport void ConvertHSLToRGB(const double hue, const double saturation, const double lightness, double *red, double *green, double *blue)
Definition: gem.c:462
#define MaxMap
Definition: magick-type.h:79
#define MagickMax(x, y)
Definition: image-private.h:36
static double Sinc(const double, const ResizeFilter *)
static double CubicBC(const double x, const ResizeFilter *resize_filter)
Definition: resize.c:207
ResizeWeightingFunctionType
static double Hamming(const double x, const ResizeFilter *magick_unused(resize_filter))
Definition: resize.c:334
#define MagickMin(x, y)
Definition: image-private.h:37
ColorspaceType
Definition: colorspace.h:25
#define SigmaImpulse
CompositeOperator
Definition: composite.h:25
MagickPrivate void ConvertHSBToRGB(const double, const double, const double, double *, double *, double *)
MagickExport MagickRealType GetPixelIntensity(const Image *magick_restrict image, const Quantum *magick_restrict pixel)
Definition: pixel.c:2358
MagickFunction
Definition: statistic.h:123
#define QuantumRange
Definition: magick-type.h:87