This file is indexed.

/usr/lib/python2.7/dist-packages/pyopencl/cl/pyopencl-ranluxcl.cl is in python-pyopencl 2016.1+git20161130-1.

This file is owned by root:root, with mode 0o644.

The actual contents of the file can be viewed below.

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
/* RanluxCL is deprecated in PyOpenCL and will be removed in the 2018.x
 * versions of the package. */

#ifndef RANLUXCL_CL
#define RANLUXCL_CL

/**** RANLUXCL v1.3.1 MODIFIED *************************************************

Implements the RANLUX generator of Matrin Luscher, based on the Fortran 77
implementation by Fred James. This OpenCL code is a complete implementation
which should perfectly replicate the numbers generated by the original Fortran
77 implementation (if using the legacy initialization routine).

***** QUICK USAGE DESCRIPTION **************************************************

1. Create an OpenCL buffer with room for at least 28 32-bit variables (112 byte)
per work-item. I.e., in C/C++: size_t buffSize = numWorkitems * 112;

2. Pass the buffer and an unsigned integer seed <ins> to a kernel that launches
the ranluxcl_initialization function. The seed <ins> can be any unsigned 32-bit
integer, and must be different on different OpenCL devices/NDRanges to ensure
different sequences. As long as the number of work-items on each device/NDRange
is less than 2^32 = 4294967296 all sequences will be different.
An examle initialization kernel would be:
	#include "ranluxcl.cl"
	kernel void Kernel_Ranluxcl_Init(private uint ins,
		global ranluxcl_state_t *ranluxcltab)
	{
		ranluxcl_initialization(ins, ranluxcltab);
	}

3. Now the generator is ready for use. Remember to download the seeds first,
and upload them again when done. Example kernel that downloads seeds, generates
a float4 where each component is uniformly distributed between 0 and 1, end
points not included, then uploads the seeds again:
	#include "ranluxcl.cl"
	kernel void Kernel_Example(global ranluxcl_state_t *ranluxcltab)
	{
		//ranluxclstate stores the state of the generator.
		ranluxcl_state_t ranluxclstate;

		//Download state into ranluxclstate struct.
		ranluxcl_download_seed(&ranluxclstate, ranluxcltab);

		//Generate a float4 with each component on (0,1),
		//end points not included. We can call ranluxcl as many
		//times as we like until we upload the state again.
		float4 randomnr = ranluxcl32(&ranluxclstate);

		//Upload state again so that we don't get the same
		//numbers over again the next time we use ranluxcl.
		ranluxcl_upload_seed(&ranluxclstate, ranluxcltab);
	}

***** MACROS *******************************************************************

The following macros can optionally be defined:

RANLUXCL_LUX:
Sets the luxury level of the generator. Should be 0-4, or if it is 24 or larger
it sets the p-value of the generator (generally not needed). If this macro is
not set then lux=4 is the default (highest quality). For many applications the
high quality of lux=4 may not be needed. Indeed if two values (each value
having 24 random bits) are glued together to form a 48-bit value the generator
passes all tests in the TestU01 suite already with lux=2. See
"TestU01: A C Library for Empirical Testing of Random Number Generators" by
PIERRE LAeECUYER and RICHARD SIMARD. SWB(224, 10, 24)[24, l] is RANLUX with
two values glued together to create 48-bit numbers, and we see that it passes
all tests already at luxury value 2.

RANLUXCL_NO_WARMUP:
Turns off the warmup functionality in ranluxcl_initialization. This macro
should generally not be used, since the generators will initially be correlated
if it is defined. The only advantage is that the numbers generated will exactly
correspond to those of the original Fortran 77 implementation.

RANLUXCL_SUPPORT_DOUBLE:
Enables double precision functions. Please enable the OpenCL double precision
extension yourself, usually by "#pragma OPENCL EXTENSION cl_khr_fp64 : enable".

RANLUXCL_USE_LEGACY_INITIALIZATION
Uses exactly the same initialization routine as in the original Fortran 77 code,
leading to the same sequences. If using legacy initialization there are some
restrictions on what the seed <ins> can be, and it may also be necessary to
define RANLUXCL_MAXWORKITEMS if several sequences are to be run in parallel.

RANLUXCL_MAXWORKITEMS:
When RANLUXCL_USE_LEGACY_INITIALIZATION is defined we may need this macro.
If several OpenCL NDRanges will be running in parallel and the parallel
sequences should be different then this macro should have a value equal or
larger than the
largest number of work-items in any of the parallel runs. The default is to
use the current global size, so if all NDRanges are of the same size this need
not be defined.
	Each parallel instance must also have different seeds <ins>. For example if
we are launching 5120 work-items on GPU1 and 10240 work-items on GPU2 we would
use different seeds for the two generators, and RANLUXCL_MAXWORKITEMS must be
defined to be at least 10240. If GPU1 and GPU2 had the same number of work-items
this would not be necessary. 
	An underestimate of the highest permissible seed <ins> is given by the
smallest of:
(<maxins> = 10^9 / <numWorkitems>) or (<maxins> = 10^9 / RANLUXCL_MAXWORKITEMS).
Please make sure that <ins> is never higher than this since it could cause
undetected problems. For example with 10240 work-items the highest permissible
<ins> is about 100 000.
	Again note that this is only relevant when using the legacy initialization
function enabled by RANLUXCL_USE_LEGACY_INITIALIZATION. When not using the
legacy initialization this macro is effectively set to a very high value of
2^32-1.

***** FUNCTIONS: INITIALIZATION ************************************************

The initialization function is defined as:
void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
Run once at the very beginning. ranluxcltab should be a buffer with space for
112 byte per work-item in the NDRange. <ins> is the seed to the generator.
For a given <ins> each work-item in the NDRange will generate a different
sequence. If more than one NDRange is used in parallel then <ins> must be
different for each NDRange to avoid identical sequences.

***** FUNCTIONS: SEED UPLOAD/DOWNLOAD ******************************************

The following two functions should be launced at the beginning and end of a
kernel that uses ranluxcl to generate numbers, respectively:

void ranluxcl_download_seed(ranluxcl_state_t *rst,
	global ranluxcl_state_t *ranluxcltab)
Run at the beginning of a kernel to download ranluxcl state data

void ranluxcl_upload_seed(ranluxcl_state_t *rst,
	global ranluxcl_state_t *ranluxcltab)
Run at the end of a kernel to upload state data

***** FUNCTIONS: GENERATION AND SYNCHRONIZATION ********************************

float4 ranluxcl32(ranluxcl_state_t *rst)
Run to generate a pseudo-random float4 where each component is a number between
0 and 1, end points not included (meaning the number will never be exactly 0 or
1).

double4 ranluxcl64(ranluxcl_state_t *rst)
Double precision version of the above function. The preprocessor macro
RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.
This function "glues" together two single-precision numbers to make one double
precision number. Most of the work is still done in single precision, so the
performance will be roughly halved regardless of the double precision
performance of the hardware.

float4 ranluxcl32norm(ranluxcl_state_t *rst)
Run to generate a pseudo-random float4 where each component is normally
distributed with mean 0 and standard deviation 1.

double4 ranluxcl64norm(ranluxcl_state_t *rst)
Double precision version of the above function. The preprocessor macro
RANLUXCL_SUPPORT_DOUBLE must be defined for this function to be available.

void ranluxcl_synchronize(ranluxcl_state_t *rst)
Run to synchronize execution in case different work-items have made a different
number of calls to ranluxcl. On SIMD machines this could lead to inefficient
execution. ranluxcl_synchronize allows us to make sure all generators are
SIMD-friendly again. Not needed if all work-items always call ranluxcl the same
number of times.

***** PERFORMANCE **************************************************************

For luxury setting 4, performance on AMD Cypress should be ~4.5*10^9 pseudo-
random values per second, when not downloading values to host memory (i.e. the
values are just generated, but not used for anything in particular).

***** DESCRIPTION OF THE IMPLEMENTATION ****************************************

This code closely follows the original Fortran 77 code (see credit section).
Here the differences (and similarities) between RANLUXCL (this implementation)
and the original RANLUX are discussed.

The Fortran 77 implementation uses a simple LCG to initialize the generator, and
so the same approach is taken here. If RANLUXCL is initialized with <ins> = 0 as
seed, the first work-item behaves like the original RANLUX with seed equal 1,
the second work-item as if with seed equal 2 and so on. If <ins> = 1 then the
first work-item behaves like the original RANLUX with seed equal to
<numWorkitems> + 1, and so on for higher <ins> so that we never have overlapping
sequences. This is why the RANLUXCL_MAXWORKITEMS macro must be set if we have
different NDRanges with a different number of work-items.

RANLUX is based on chaos theory, and what we are actually doing when selecting
a luxury value is setting how many values to skip over (causing decorrelation).
The number of values to skip is controlled by the so-called p-value of the
generator. After generating 24 values we skip p - 24 values until again
generating 24 values.

This implementation is somewhat modified from the original fortran
implementation by F. James. Because of the way the OpenCL code is optimized with
4-component 32-bit float vectors, it is most convenient to always throw away
some multiple of 24 values (i.e. p is always a multiple of 24).

However, there might be some resonances if we always throw away a multiple of
the seeds table size. Therefore the implementation is slightly more intricate
where p can be a multiple of 4 instead, at a cost to performance (only about 10%
lower than the cleaner 24 values approach on AMD Cypress). These two approaches
are termed planar and planar shift respectively. The idea for the planar
approach comes from the following paper:
Vadim Demchik, Pseudo-random number generators for Monte Carlo simulations on
Graphics Processing Units, arXiv:1003.1898v1 [hep-lat]

Below the p-values for the original reference implementation are listed along
with those of the planar shift implementation. Suggested values for the planar
approach are also presented. When this function is called with RANLUXCL_LUX
set to 0-4, the planar shift values are used. To use the pure planar approach
(for some extra performance with likely undetectable quality decrease), set lux
equal to the specific p-value.

Luxury setting (RANLUXCL_LUX):                   0   1   2   3   4
Original fortran77 implementation by F. James:  24  48  97  223 389
Planar (suggested):                             24  48  120 240 408
Planar shift:                                   24  48  100 224 404

Note that levels 0 and 1 are the same as in the original implementation for both
planar and planar shift. Level 4 of planar shift where p=404 is the same as
chosen for luxury level 1 by Martin Luescher for his v3 version of RANLUX.
Therefore if it is considered important to only use "official" values, luxury
settings 0, 1 or 4 of planar shift should be used. It is however unlikely that
the other values are bad, they just haven't been as extensively used and tested
by others.

Variable names are generally the same as in the fortran77 implementation,
however because of the way the generator is implemented, the i24 and j24
variables are no longer needed.

***** CREDIT *******************************************************************

I have been told by Fred James (the coder) that the original Fortran 77
implementation (which is the subject of the second paper below) is free to use
and share. Therefore I am using the MIT license (below). But most importantly
please always remember to give credit to the two articles by Martin Luscher and
Fred James, describing the generator and the Fortran 77 implementation on which
this implementation is based, respectively:

Martin Luescher, A portable high-quality random number generator for lattice
field theory simulations, Computer Physics Communications 79 (1994) 100-110

F. James, RANLUX: A Fortran implementation of the high-quality pseudorandom
number generator of Luescher, Computer Physics Communications 79 (1994) 111-114

***** LICENSE ******************************************************************

Copyright (c) 2011 Ivar Ursin Nikolaisen

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

*******************************************************************************/

typedef struct{
	float
		s01, s02, s03, s04,
		s05, s06, s07, s08,
		s09, s10, s11, s12,
		s13, s14, s15, s16,
		s17, s18, s19, s20,
		s21, s22, s23, s24;
	float carry;
	float dummy; //Causes struct to be a multiple of 128 bits
	int in24;
	int stepnr;
} ranluxcl_state_t;

//Initial prototypes makes Apple's compiler happy
void ranluxcl_download_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
void ranluxcl_upload_seed(ranluxcl_state_t *, global ranluxcl_state_t *);
float ranluxcl_os(float, float, float *, float *);
float4 ranluxcl32(ranluxcl_state_t *);
void ranluxcl_synchronize(ranluxcl_state_t *);
void ranluxcl_initialization(uint, global ranluxcl_state_t *);
float4 ranluxcl32norm(ranluxcl_state_t *);

#ifdef RANLUXCL_SUPPORT_DOUBLE
double4 ranluxcl64(ranluxcl_state_t *);
double4 ranluxcl64norm(ranluxcl_state_t *);
#endif

#define RANLUXCL_TWOM24 0.000000059604644775f
#define RANLUXCL_TWOM12 0.000244140625f

#ifdef RANLUXCL_LUX
#if RANLUXCL_LUX < 0
#error ranluxcl: lux must be zero or positive.
#endif
#else
#define RANLUXCL_LUX 4 //Default to high quality
#endif //RANLUXCL_LUX

//Here the luxury values are defined
#if RANLUXCL_LUX == 0
#define RANLUXCL_NSKIP 0
#elif RANLUXCL_LUX == 1
#define RANLUXCL_NSKIP 24
#elif RANLUXCL_LUX == 2
#define RANLUXCL_NSKIP 76
#elif RANLUXCL_LUX == 3
#define RANLUXCL_NSKIP 200
#elif RANLUXCL_LUX == 4
#define RANLUXCL_NSKIP 380
#else
#define RANLUXCL_NSKIP (RANLUXCL_LUX - 24)
#endif //RANLUXCL_LUX == 0

//Check that nskip is a permissible value
#if RANLUXCL_NSKIP % 4 != 0 
#error nskip must be divisible by 4!
#endif
#if RANLUXCL_NSKIP < 24 && RANLUXCL_NSKIP != 0
#error nskip must be either 0 or >= 24!
#endif
#if RANLUXCL_NSKIP < 0
#error nskip is negative!
#endif

//Check if planar scheme is recovered
#if RANLUXCL_NSKIP % 24 == 0
#define RANLUXCL_PLANAR
#endif

//Check if we will skip at all
#if RANLUXCL_NSKIP == 0
#define RANLUXCL_NOSKIP
#endif

//Single-value global size and id
#define RANLUXCL_NUMWORKITEMS \
	(get_global_size(0) * get_global_size(1) * get_global_size(2))
#define RANLUXCL_MYID \
	(get_global_id(0) + get_global_id(1) * get_global_size(0) + \
	 get_global_id(2) * get_global_size(0) * get_global_size(1))

void ranluxcl_download_seed(ranluxcl_state_t *rst,
	global ranluxcl_state_t *ranluxcltab)
{
	(*rst) = ranluxcltab[RANLUXCL_MYID];
}

void ranluxcl_upload_seed(ranluxcl_state_t *rst,
	global ranluxcl_state_t *ranluxcltab)
{
	ranluxcltab[RANLUXCL_MYID] = (*rst);
}

/*
 * Performs one "step" (generates a single value or skip). Only used internally,
 * not intended to be called from user code.
 */
float ranluxcl_os(float sj24m1, float sj24, float *si24, float *carry)
{
	float uni, out;
	uni = sj24 - (*si24) - (*carry);
	if(uni < 0.0f){
		uni += 1.0f;
		(*carry) = RANLUXCL_TWOM24;
	} else (*carry) = 0.0f;
	out = ((*si24) = uni);

	if(uni < RANLUXCL_TWOM12){
		out += RANLUXCL_TWOM24 * sj24m1;
		if(out == 0.0f) out = RANLUXCL_TWOM24 * RANLUXCL_TWOM24;
	}
	return out;
}

/*
 * Return a float4 where each component is a uniformly distributed pseudo-
 * random value between 0 and 1, end points not included.
 */
float4 ranluxcl32(ranluxcl_state_t *rst)
{
	float4 out;

	if(rst->stepnr == 0){
		out.x = ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
		out.y = ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
		out.z = ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
		out.w = ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
		rst->stepnr += 4;
	}

	else if(rst->stepnr == 4){
		out.x = ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
		out.y = ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
		out.z = ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
		out.w = ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
		rst->stepnr += 4;
	}

	else if(rst->stepnr == 8){
		out.x = ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
		out.y = ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
		out.z = ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
		out.w = ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
		rst->stepnr += 4;
	}

	else if(rst->stepnr == 12){
		out.x = ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
		out.y = ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
		out.z = ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
		out.w = ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
		rst->stepnr += 4;
	}

	else if(rst->stepnr == 16){
		out.x = ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
		out.y = ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
		out.z = ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
		out.w = ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
		rst->stepnr += 4;
	}

	else if(rst->stepnr == 20){
		out.x = ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
		out.y = ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
		out.z = ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
		out.w = ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
		rst->stepnr = 0;

// The below preprocessor directives are here to recover the simpler planar
// scheme when nskip is a multiple of 24. For the most general planar shift
// approach, just ignore all #if's below.
#ifndef RANLUXCL_PLANAR
	}

	(*&(rst->in24)) += 4;
	if((*&(rst->in24)) == 24){
		(*&(rst->in24)) = 0;
#endif //RANLUXCL_PLANAR

		int initialskips = (rst->stepnr) ? (24 - rst->stepnr) : 0;
		int bulkskips = ((RANLUXCL_NSKIP - initialskips)/24) * 24;
		int remainingskips = RANLUXCL_NSKIP - initialskips - bulkskips;

//We know there won't be any initial skips in the planar scheme
#ifndef RANLUXCL_PLANAR
		//Do initial skips (lack of breaks in switch is intentional).
		switch(initialskips){
			case(20):
				ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
				ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
				ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
				ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
			case(16):
				ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
				ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
				ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
				ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
			case(12):
				ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
				ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
				ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
				ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
			case(8):
				ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
				ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
				ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
				ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
			case(4):
				ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
				ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
				ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
				ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
		}
#endif //RANLUXCL_PLANAR

//Also check if we will ever need to skip at all
#ifndef RANLUXCL_NOSKIP
		for(int i=0; i<bulkskips/24; i++){
			ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
			ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
			ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
			ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));
			ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
			ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
			ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
			ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
			ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
			ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
			ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
			ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
			ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
			ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
			ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
			ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
			ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
			ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
			ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
			ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
			ranluxcl_os(rst->s13, rst->s14, &(rst->s04), &(rst->carry));
			ranluxcl_os(rst->s12, rst->s13, &(rst->s03), &(rst->carry));
			ranluxcl_os(rst->s11, rst->s12, &(rst->s02), &(rst->carry));
			ranluxcl_os(rst->s10, rst->s11, &(rst->s01), &(rst->carry));
		}
#endif //RANLUXCL_NOSKIP

//There also won't be any remaining skips in the planar scheme
#ifndef RANLUXCL_PLANAR
		//Do remaining skips
		if(remainingskips){
			ranluxcl_os(rst->s09, rst->s10, &(rst->s24), &(rst->carry));
			ranluxcl_os(rst->s08, rst->s09, &(rst->s23), &(rst->carry));
			ranluxcl_os(rst->s07, rst->s08, &(rst->s22), &(rst->carry));
			ranluxcl_os(rst->s06, rst->s07, &(rst->s21), &(rst->carry));

			if(remainingskips > 4){
				ranluxcl_os(rst->s05, rst->s06, &(rst->s20), &(rst->carry));
				ranluxcl_os(rst->s04, rst->s05, &(rst->s19), &(rst->carry));
				ranluxcl_os(rst->s03, rst->s04, &(rst->s18), &(rst->carry));
				ranluxcl_os(rst->s02, rst->s03, &(rst->s17), &(rst->carry));
			}

			if(remainingskips > 8){
				ranluxcl_os(rst->s01, rst->s02, &(rst->s16), &(rst->carry));
				ranluxcl_os(rst->s24, rst->s01, &(rst->s15), &(rst->carry));
				ranluxcl_os(rst->s23, rst->s24, &(rst->s14), &(rst->carry));
				ranluxcl_os(rst->s22, rst->s23, &(rst->s13), &(rst->carry));
			}

			if(remainingskips > 12){
				ranluxcl_os(rst->s21, rst->s22, &(rst->s12), &(rst->carry));
				ranluxcl_os(rst->s20, rst->s21, &(rst->s11), &(rst->carry));
				ranluxcl_os(rst->s19, rst->s20, &(rst->s10), &(rst->carry));
				ranluxcl_os(rst->s18, rst->s19, &(rst->s09), &(rst->carry));
			}

			if(remainingskips > 16){
				ranluxcl_os(rst->s17, rst->s18, &(rst->s08), &(rst->carry));
				ranluxcl_os(rst->s16, rst->s17, &(rst->s07), &(rst->carry));
				ranluxcl_os(rst->s15, rst->s16, &(rst->s06), &(rst->carry));
				ranluxcl_os(rst->s14, rst->s15, &(rst->s05), &(rst->carry));
			}
		}
#endif //RANLUXCL_PLANAR

		// Initial skips brought stepnr down to 0. The bulk skips did only
		// full cycles. Therefore stepnr is now equal to remainingskips.
		rst->stepnr = remainingskips;
	}

	return out;
}

/*
 * Perform the necessary operations to set the generator to the "beginning",
 * i.e., ready to generate 24 numbers before the next skipping sequence. This
 * is useful if different work-items have called ranluxcl a different number
 * of times. Since that would lead to out of sync execution on different work-
 * items it could be rather inefficient on SIMD architectures (like current
 * GPUs). This function thus allows us to resynchronize execution across work-
 * items.
 */
void ranluxcl_synchronize(ranluxcl_state_t *rst)
{
	// Do necessary number of calls to ranluxcl so that stepnr == 0 at the end.
	if(rst->stepnr == 4)
		ranluxcl32(rst);
	if(rst->stepnr == 8)
		ranluxcl32(rst);
	if(rst->stepnr == 12)
		ranluxcl32(rst);
	if(rst->stepnr == 16)
		ranluxcl32(rst);
	if(rst->stepnr == 20)
		ranluxcl32(rst);
}

/*
 * Uses a 64-bit xorshift PRNG by George Marsaglia to initialize the generator.
 *
 * This function can be used instead of ranluxcl_initialization if manual
 * control of the seed of each generator is desired. x must be unique for each
 * time this function is called, and *ranluxcltab should point to the specific
 * entry in the table to be initialized. Compare this to ranluxcl_initialization
 * where ins needs only be unique for each NDRange, and *ranluxcltab points
 * to the base address of the table for the entire NDRange. Also note that
 * depending on what you are doing the ranluxcl_upload_seed and
 * ranluxcl_download_seed functions may not do what you want, so make sure
 * you know what you are doing!
 */

void ranluxcl_init(ulong x, global ranluxcl_state_t *ranluxcltab)
{
	ranluxcl_state_t rst;

	#define RANLUXCL_POW2_24 16777216
	#define RANLUXCL_56 0x00FFFFFFFFFFFFFF
	#define RANLUXCL_48 0x0000FFFFFFFFFFFF
	#define RANLUXCL_40 0x000000FFFFFFFFFF
	#define RANLUXCL_32 0x00000000FFFFFFFF
	#define RANLUXCL_24 0x0000000000FFFFFF
	#define RANLUXCL_16 0x000000000000FFFF
	#define RANLUXCL_8  0x00000000000000FF

	ulong x1, x2, x3;

	//Logical shifts used so that all 64 bits of output are used (24 bits
	//per float), to be certain that all initial states are different.
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
	rst.s01 = (float)  (x1 >> 40)
		/ (float)RANLUXCL_POW2_24;
	rst.s02 = (float) ((x1 & RANLUXCL_40) >> 16)
		/ (float)RANLUXCL_POW2_24;
	rst.s03 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
		/ (float)RANLUXCL_POW2_24;
	rst.s04 = (float) ((x2 & RANLUXCL_56) >> 32)
		/ (float)RANLUXCL_POW2_24;
	rst.s05 = (float) ((x2 & RANLUXCL_32) >> 8)
		/ (float)RANLUXCL_POW2_24;
	rst.s06 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
		/ (float)RANLUXCL_POW2_24;
	rst.s07 = (float) ((x3 & RANLUXCL_48) >> 24)
		/ (float)RANLUXCL_POW2_24;
	rst.s08 = (float)  (x3 & RANLUXCL_24)
		/ (float)RANLUXCL_POW2_24;

	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
	rst.s09 = (float)  (x1 >> 40)
		/ (float)RANLUXCL_POW2_24;
	rst.s10 = (float) ((x1 & RANLUXCL_40) >> 16)
		/ (float)RANLUXCL_POW2_24;
	rst.s11 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
		/ (float)RANLUXCL_POW2_24;
	rst.s12 = (float) ((x2 & RANLUXCL_56) >> 32)
		/ (float)RANLUXCL_POW2_24;
	rst.s13 = (float) ((x2 & RANLUXCL_32) >> 8)
		/ (float)RANLUXCL_POW2_24;
	rst.s14 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
		/ (float)RANLUXCL_POW2_24;
	rst.s15 = (float) ((x3 & RANLUXCL_48) >> 24)
		/ (float)RANLUXCL_POW2_24;
	rst.s16 = (float)  (x3 & RANLUXCL_24)
		/ (float)RANLUXCL_POW2_24;

	x^=(x<<13);x^=(x>>7);x^=(x<<17);x1=x;
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x2=x;
	x^=(x<<13);x^=(x>>7);x^=(x<<17);x3=x;
	rst.s17 = (float)  (x1 >> 40)
		/ (float)RANLUXCL_POW2_24;
	rst.s18 = (float) ((x1 & RANLUXCL_40) >> 16)
		/ (float)RANLUXCL_POW2_24;
	rst.s19 = (float)(((x1 & RANLUXCL_16) << 8) + (x2 >> 56))
		/ (float)RANLUXCL_POW2_24;
	rst.s20 = (float) ((x2 & RANLUXCL_56) >> 32)
		/ (float)RANLUXCL_POW2_24;
	rst.s21 = (float) ((x2 & RANLUXCL_32) >> 8)
		/ (float)RANLUXCL_POW2_24;
	rst.s22 = (float)(((x2 & RANLUXCL_8 ) << 16) + (x3 >> 48))
		/ (float)RANLUXCL_POW2_24;
	rst.s23 = (float) ((x3 & RANLUXCL_48) >> 24)
		/ (float)RANLUXCL_POW2_24;
	rst.s24 = (float)  (x3 & RANLUXCL_24)
		/ (float)RANLUXCL_POW2_24;

	#undef RANLUXCL_POW2_24
	#undef RANLUXCL_56
	#undef RANLUXCL_48
	#undef RANLUXCL_40
	#undef RANLUXCL_32
	#undef RANLUXCL_24
	#undef RANLUXCL_16
	#undef RANLUXCL_8

	rst.in24 = 0;
	rst.stepnr = 0;
	rst.carry = 0.0f;
	if(rst.s24 == 0.0f)
		rst.carry = RANLUXCL_TWOM24;

	#ifndef RANLUXCL_NO_WARMUP
	//Warming up the generator, ensuring there are no initial correlations.
	//16 is a "magic number". It is the number of times we must generate
	//a batch of 24 numbers to ensure complete decorrelation, however it
	//seems like it is necessary to double this for the special case when
	//the generator is initialized to all zeros.
	for(int i=0; i<16 * 2; i++){
		ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
		ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
		ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
		ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
		ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
		ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
		ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
		ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
		ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
		ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
		ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
		ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
		ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
		ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
		ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
		ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
		ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
		ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
		ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
		ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
		ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
		ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
		ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
		ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
	}
	#endif //RANLUXCL_NO_WARMUP

	//Upload the state
	*ranluxcltab = rst;
}

void ranluxcl_init_legacy(uint ins, global ranluxcl_state_t *ranluxcltab)
{
	//Using legacy initialization from original Fortan 77 implementation

	//ins is scaled so that if the user makes another call somewhere else
	//with ins + 1 there should be no overlap. Also adding one
	//allows us to use ins = 0.
	int k, maxWorkitems;
	ranluxcl_state_t rst;

	#ifdef RANLUXCL_MAXWORKITEMS
	maxWorkitems = RANLUXCL_MAXWORKITEMS;
	#else
	maxWorkitems = RANLUXCL_NUMWORKITEMS;
	#endif //RANLUXCL_MAXWORKITEMS

	int scaledins = ins * maxWorkitems + 1;

	int js = scaledins + RANLUXCL_MYID;

	//Make sure js is not too small (should really be an error)
	if(js < 1)
		js = 1;

	#define IC 2147483563
	#define ITWO24 16777216

	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s01=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s02=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s03=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s04=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s05=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s06=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s07=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s08=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s09=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s10=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s11=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s12=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s13=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s14=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s15=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s16=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s17=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s18=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s19=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s20=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s21=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s22=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s23=(js%ITWO24)*RANLUXCL_TWOM24;
	k = js/53668; js=40014*(js-k*53668)-k*12211; if(js<0)js=js+IC;
		rst.s24=(js%ITWO24)*RANLUXCL_TWOM24;

	#undef IC
	#undef ITWO24

	rst.in24 = 0;
	rst.stepnr = 0;
	rst.carry = 0.0f;
	if(rst.s24 == 0.0f)
		rst.carry = RANLUXCL_TWOM24;

	#ifndef RANLUXCL_NO_WARMUP
	//Warming up the generator, ensuring there are no initial correlations.
	//16 is a "magic number". It is the number of times we must generate
	//a batch of 24 numbers to ensure complete decorrelation.
	for(int i=0; i<16; i++){
		ranluxcl_os(rst.s09, rst.s10, &(rst.s24), &(rst.carry));
		ranluxcl_os(rst.s08, rst.s09, &(rst.s23), &(rst.carry));
		ranluxcl_os(rst.s07, rst.s08, &(rst.s22), &(rst.carry));
		ranluxcl_os(rst.s06, rst.s07, &(rst.s21), &(rst.carry));
		ranluxcl_os(rst.s05, rst.s06, &(rst.s20), &(rst.carry));
		ranluxcl_os(rst.s04, rst.s05, &(rst.s19), &(rst.carry));
		ranluxcl_os(rst.s03, rst.s04, &(rst.s18), &(rst.carry));
		ranluxcl_os(rst.s02, rst.s03, &(rst.s17), &(rst.carry));
		ranluxcl_os(rst.s01, rst.s02, &(rst.s16), &(rst.carry));
		ranluxcl_os(rst.s24, rst.s01, &(rst.s15), &(rst.carry));
		ranluxcl_os(rst.s23, rst.s24, &(rst.s14), &(rst.carry));
		ranluxcl_os(rst.s22, rst.s23, &(rst.s13), &(rst.carry));
		ranluxcl_os(rst.s21, rst.s22, &(rst.s12), &(rst.carry));
		ranluxcl_os(rst.s20, rst.s21, &(rst.s11), &(rst.carry));
		ranluxcl_os(rst.s19, rst.s20, &(rst.s10), &(rst.carry));
		ranluxcl_os(rst.s18, rst.s19, &(rst.s09), &(rst.carry));
		ranluxcl_os(rst.s17, rst.s18, &(rst.s08), &(rst.carry));
		ranluxcl_os(rst.s16, rst.s17, &(rst.s07), &(rst.carry));
		ranluxcl_os(rst.s15, rst.s16, &(rst.s06), &(rst.carry));
		ranluxcl_os(rst.s14, rst.s15, &(rst.s05), &(rst.carry));
		ranluxcl_os(rst.s13, rst.s14, &(rst.s04), &(rst.carry));
		ranluxcl_os(rst.s12, rst.s13, &(rst.s03), &(rst.carry));
		ranluxcl_os(rst.s11, rst.s12, &(rst.s02), &(rst.carry));
		ranluxcl_os(rst.s10, rst.s11, &(rst.s01), &(rst.carry));
	}
	#endif //RANLUXCL_NO_WARMUP

	//Upload the state
	ranluxcl_upload_seed(&rst, ranluxcltab);
}

void ranluxcl_initialization(uint ins, global ranluxcl_state_t *ranluxcltab)
{
	#ifdef RANLUXCL_USE_LEGACY_INITIALIZATION
	ranluxcl_init_legacy(ins, ranluxcltab);

	#else // Not RANLUXCL_USE_LEGACY_INITIALIZATION

	// We scale ins by 2^32. As long as we never use more than (2^32)-1
	// work-items per NDRange the initial states should never be the same.

	ulong x = (ulong)RANLUXCL_MYID + (ulong)ins * ((ulong)UINT_MAX + 1);
	ranluxcl_init(x, ranluxcltab + RANLUXCL_MYID);

	#endif // RANLUXCL_USE_LEGACY_INITIALIZATION
}

float4 ranluxcl32norm(ranluxcl_state_t *rst)
{
	//Returns a vector where each component is a normally
	//distributed PRN centered on 0, with standard deviation 1.

	//Roll our own since M_PI_F does not exist in OpenCL 1.0.
	#define RANLUXCL_PI_F 3.1415926535f

	float4 U = ranluxcl32(rst);

	float4 Z;
	float R, phi;

	R = sqrt(-2 * log(U.x));
	phi = 2 * RANLUXCL_PI_F * U.y;
	Z.x = R * cos(phi);
	Z.y = R * sin(phi);

	R = sqrt(-2 * log(U.z));
	phi = 2 * RANLUXCL_PI_F * U.w;
	Z.z = R * cos(phi);
	Z.w = R * sin(phi);

	return Z;

	#undef RANLUXCL_PI_F
}

#ifdef RANLUXCL_SUPPORT_DOUBLE
double4 ranluxcl64(ranluxcl_state_t *rst)
{
	double4 out;
	float4 randvec;

	//We know this value is caused by the never-zero part
	//of the original algorithm, but we want to allow zero for
	//the most significant bits in the double precision result.
	randvec = ranluxcl32(rst);
	if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
		randvec.x = 0.0f;
	if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
		randvec.z = 0.0f;

	out.x = (double)(randvec.x) + (double)(randvec.y) / 16777216;
	out.y = (double)(randvec.z) + (double)(randvec.w) / 16777216;

	randvec = ranluxcl32(rst);
	if(randvec.x == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
		randvec.x = 0.0f;
	if(randvec.z == RANLUXCL_TWOM24 * RANLUXCL_TWOM24)
		randvec.z = 0.0f;

	out.z = (double)(randvec.x) + (double)(randvec.y) / 16777216;
	out.w = (double)(randvec.z) + (double)(randvec.w) / 16777216;

	return out;
}

double4 ranluxcl64norm(ranluxcl_state_t *rst)
{
	//Returns a vector where each component is a normally
	//distributed PRN centered on 0, with standard deviation
	//1.

	double4 U = ranluxcl64(rst);

	double4 Z;
	double R, phi;

	R = sqrt(-2 * log(U.x));
	phi = 2 * M_PI * U.y;
	Z.x = R * cos(phi);
	Z.y = R * sin(phi);

	R = sqrt(-2 * log(U.z));
	phi = 2 * M_PI * U.w;
	Z.z = R * cos(phi);
	Z.w = R * sin(phi);

	return Z;
}
#endif //RANLUXCL_SUPPORT_DOUBLE

#undef RANLUXCL_TWOM24
#undef RANLUXCL_TWOM12
#undef RANLUXCL_NUMWORKITEMS
#undef RANLUXCL_MYID
#undef RANLUXCL_PLANAR
#undef RANLUXCL_NOSKIP

#endif //RANLUXCL_CL