wined3d: Don't run out of varyings because of clipplanes.
[wine] / dlls / wined3d / arb_program_shader.c
1 /*
2  * Pixel and vertex shaders implementation using ARB_vertex_program
3  * and ARB_fragment_program GL extensions.
4  *
5  * Copyright 2002-2003 Jason Edmeades
6  * Copyright 2002-2003 Raphael Junqueira
7  * Copyright 2004 Christian Costa
8  * Copyright 2005 Oliver Stieber
9  * Copyright 2006 Ivan Gyurdiev
10  * Copyright 2006 Jason Green
11  * Copyright 2006 Henri Verbeet
12  * Copyright 2007-2008 Stefan Dösinger for CodeWeavers
13  * Copyright 2009 Henri Verbeet for CodeWeavers
14  *
15  * This library is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU Lesser General Public
17  * License as published by the Free Software Foundation; either
18  * version 2.1 of the License, or (at your option) any later version.
19  *
20  * This library is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23  * Lesser General Public License for more details.
24  *
25  * You should have received a copy of the GNU Lesser General Public
26  * License along with this library; if not, write to the Free Software
27  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
28  */
29
30 #include "config.h"
31
32 #include <math.h>
33 #include <stdio.h>
34
35 #include "wined3d_private.h"
36
37 WINE_DEFAULT_DEBUG_CHANNEL(d3d_shader);
38 WINE_DECLARE_DEBUG_CHANNEL(d3d_constants);
39 WINE_DECLARE_DEBUG_CHANNEL(d3d_caps);
40 WINE_DECLARE_DEBUG_CHANNEL(d3d);
41
42 #define GLINFO_LOCATION      (*gl_info)
43
44 /* GL locking for state handlers is done by the caller. */
45 static BOOL need_mova_const(IWineD3DBaseShader *shader, const WineD3D_GL_Info *gl_info) {
46     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *) shader;
47     if(!This->baseShader.reg_maps.usesmova) return FALSE;
48     return !GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
49 }
50
51 static BOOL need_helper_const(const WineD3D_GL_Info *gl_info) {
52     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)   || /* Need to init colors */
53        gl_info->arb_vs_offset_limit     || /* Have to init texcoords */
54        gl_info->set_texcoord_w) {          /* Load the immval offset */
55         return TRUE;
56     }
57     return FALSE;
58 }
59
60 static unsigned int reserved_vs_const(IWineD3DBaseShader *shader, const WineD3D_GL_Info *gl_info) {
61     unsigned int ret = 1;
62     /* We use one PARAM for the pos fixup, and in some cases one to load
63      * some immediate values into the shader
64      */
65     if(need_helper_const(gl_info)) ret++;
66     if(need_mova_const(shader, gl_info)) ret++;
67     return ret;
68 }
69
/* Internally used shader constants. They are uploaded at the top of the
 * program.env range reported by GL_LIMITS(vshader_constantsF); the position
 * fixup is the first (and currently only) one.
 *
 * Both expansions are fully parenthesized so the macros can be used inside
 * larger expressions without operator precedence surprises.
 */
#define ARB_SHADER_PRIVCONST_BASE (GL_LIMITS(vshader_constantsF) - 1)
#define ARB_SHADER_PRIVCONST_POS (ARB_SHADER_PRIVCONST_BASE + 0)
75
76 /* ARB_program_shader private data */
77 struct control_frame
78 {
79     struct                          list entry;
80     enum
81     {
82         IF,
83         IFC,
84         LOOP,
85         REP
86     } type;
87     BOOL                            muting;
88     BOOL                            outer_loop;
89     union
90     {
91         unsigned int                loop_no;
92         unsigned int                ifc_no;
93     };
94     DWORD                           loop_control[3];
95     BOOL                            had_else;
96 };
97
98 struct arb_ps_compile_args
99 {
100     struct ps_compile_args          super;
101     DWORD                           bools; /* WORD is enough, use DWORD for alignment */
102     unsigned char                   loop_ctrl[MAX_CONST_I][3];
103 };
104
105 struct stb_const_desc
106 {
107     unsigned char           texunit;
108     UINT                    const_num;
109 };
110
111 struct arb_ps_compiled_shader
112 {
113     struct arb_ps_compile_args      args;
114     GLuint                          prgId;
115     struct stb_const_desc           bumpenvmatconst[MAX_TEXTURES];
116     unsigned char                   numbumpenvmatconsts;
117     struct stb_const_desc           luminanceconst[MAX_TEXTURES];
118     UINT                            int_consts[MAX_CONST_I];
119     char                            num_int_consts;
120     UINT                            ycorrection;
121 };
122
123 struct arb_vs_compile_args
124 {
125     struct vs_compile_args          super;
126     DWORD                           bools; /* WORD is enough, use DWORD for alignment */
127     DWORD                           ps_signature;
128     unsigned char                   loop_ctrl[MAX_CONST_I][3];
129 };
130
131 struct arb_vs_compiled_shader
132 {
133     struct arb_vs_compile_args      args;
134     GLuint                          prgId;
135     UINT                            int_consts[MAX_CONST_I];
136     char                            num_int_consts;
137 };
138
139 struct recorded_instruction
140 {
141     struct wined3d_shader_instruction ins;
142     struct list entry;
143 };
144
145 struct shader_arb_ctx_priv
146 {
147     char addr_reg[20];
148     enum
149     {
150         /* plain GL_ARB_vertex_program or GL_ARB_fragment_program */
151         ARB,
152         /* GL_NV_vertex_progam2_option or GL_NV_fragment_program_option */
153         NV2,
154         /* GL_NV_vertex_program3 or GL_NV_fragment_program2 */
155         NV3
156     } target_version;
157
158     const struct arb_vs_compile_args    *cur_vs_args;
159     const struct arb_ps_compile_args    *cur_ps_args;
160     const struct arb_ps_compiled_shader *compiled_fprog;
161     const struct arb_vs_compiled_shader *compiled_vprog;
162     struct list                         control_frames;
163     struct list                         record;
164     BOOL                                recording;
165     BOOL                                muted;
166     unsigned int                        num_loops, loop_depth, num_ifcs;
167     int                                 aL;
168
169     /* For 3.0 vertex shaders */
170     const char                          *vs_output[MAX_REG_OUTPUT];
171     /* For 2.x and earlier vertex shaders */
172     const char                          *texcrd_output[8], *color_output[2], *fog_output;
173
174     /* 3.0 pshader input for compatibility with fixed function */
175     const char                          *ps_input[MAX_REG_INPUT];
176 };
177
178 struct ps_signature
179 {
180     struct wined3d_shader_signature_element *sig;
181     DWORD                               idx;
182     struct wine_rb_entry                entry;
183 };
184
185 struct arb_pshader_private {
186     struct arb_ps_compiled_shader   *gl_shaders;
187     UINT                            num_gl_shaders, shader_array_size;
188     BOOL                            has_signature_idx;
189     DWORD                           input_signature_idx;
190 };
191
192 struct arb_vshader_private {
193     struct arb_vs_compiled_shader   *gl_shaders;
194     UINT                            num_gl_shaders, shader_array_size;
195 };
196
197 struct shader_arb_priv
198 {
199     GLuint                  current_vprogram_id;
200     GLuint                  current_fprogram_id;
201     const struct arb_ps_compiled_shader *compiled_fprog;
202     const struct arb_vs_compiled_shader *compiled_vprog;
203     GLuint                  depth_blt_vprogram_id;
204     GLuint                  depth_blt_fprogram_id[tex_type_count];
205     BOOL                    use_arbfp_fixed_func;
206     struct wine_rb_tree     fragment_shaders;
207
208     struct wine_rb_tree     signature_tree;
209     DWORD ps_sig_number;
210 };
211
212 /********************************************************
213  * ARB_[vertex/fragment]_program helper functions follow
214  ********************************************************/
215
216 /** 
217  * Loads floating point constants into the currently set ARB_vertex/fragment_program.
218  * When constant_list == NULL, it will load all the constants.
219  *  
220  * @target_type should be either GL_VERTEX_PROGRAM_ARB (for vertex shaders)
221  *  or GL_FRAGMENT_PROGRAM_ARB (for pixel shaders)
222  */
223 /* GL locking is done by the caller */
224 static unsigned int shader_arb_load_constantsF(IWineD3DBaseShaderImpl* This, const WineD3D_GL_Info *gl_info,
225         GLuint target_type, unsigned int max_constants, const float *constants, char *dirty_consts)
226 {
227     local_constant* lconst;
228     DWORD i, j;
229     unsigned int ret;
230
231     if (TRACE_ON(d3d_shader)) {
232         for(i = 0; i < max_constants; i++) {
233             if(!dirty_consts[i]) continue;
234             TRACE_(d3d_constants)("Loading constants %i: %f, %f, %f, %f\n", i,
235                         constants[i * 4 + 0], constants[i * 4 + 1],
236                         constants[i * 4 + 2], constants[i * 4 + 3]);
237         }
238     }
239     /* In 1.X pixel shaders constants are implicitly clamped in the range [-1;1] */
240     if (target_type == GL_FRAGMENT_PROGRAM_ARB && This->baseShader.reg_maps.shader_version.major == 1)
241     {
242         float lcl_const[4];
243         for(i = 0; i < max_constants; i++) {
244             if(!dirty_consts[i]) continue;
245             dirty_consts[i] = 0;
246
247             j = 4 * i;
248             if(constants[j + 0] > 1.0) lcl_const[0] = 1.0;
249             else if(constants[j + 0] < -1.0) lcl_const[0] = -1.0;
250             else lcl_const[0] = constants[j + 0];
251
252             if(constants[j + 1] > 1.0) lcl_const[1] = 1.0;
253             else if(constants[j + 1] < -1.0) lcl_const[1] = -1.0;
254             else lcl_const[1] = constants[j + 1];
255
256             if(constants[j + 2] > 1.0) lcl_const[2] = 1.0;
257             else if(constants[j + 2] < -1.0) lcl_const[2] = -1.0;
258             else lcl_const[2] = constants[j + 2];
259
260             if(constants[j + 3] > 1.0) lcl_const[3] = 1.0;
261             else if(constants[j + 3] < -1.0) lcl_const[3] = -1.0;
262             else lcl_const[3] = constants[j + 3];
263
264             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, lcl_const));
265         }
266     } else {
267         if(GL_SUPPORT(EXT_GPU_PROGRAM_PARAMETERS)) {
268             /* TODO: Benchmark if we're better of with finding the dirty constants ourselves,
269              * or just reloading *all* constants at once
270              *
271             GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, 0, max_constants, constants));
272              */
273             for(i = 0; i < max_constants; i++) {
274                 if(!dirty_consts[i]) continue;
275
276                 /* Find the next block of dirty constants */
277                 dirty_consts[i] = 0;
278                 j = i;
279                 for(i++; (i < max_constants) && dirty_consts[i]; i++) {
280                     dirty_consts[i] = 0;
281                 }
282
283                 GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, j, i - j, constants + (j * 4)));
284             }
285         } else {
286             for(i = 0; i < max_constants; i++) {
287                 if(dirty_consts[i]) {
288                     dirty_consts[i] = 0;
289                     GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, constants + (i * 4)));
290                 }
291             }
292         }
293     }
294     checkGLcall("glProgramEnvParameter4fvARB()");
295
296     /* Load immediate constants */
297     if(This->baseShader.load_local_constsF) {
298         if (TRACE_ON(d3d_shader)) {
299             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
300                 GLfloat* values = (GLfloat*)lconst->value;
301                 TRACE_(d3d_constants)("Loading local constants %i: %f, %f, %f, %f\n", lconst->idx,
302                         values[0], values[1], values[2], values[3]);
303             }
304         }
305         /* Immediate constants are clamped for 1.X shaders at loading times */
306         ret = 0;
307         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
308             dirty_consts[lconst->idx] = 1; /* Dirtify so the non-immediate constant overwrites it next time */
309             ret = max(ret, lconst->idx + 1);
310             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, lconst->idx, (GLfloat*)lconst->value));
311         }
312         checkGLcall("glProgramEnvParameter4fvARB()");
313         return ret; /* The loaded immediate constants need reloading for the next shader */
314     } else {
315         return 0; /* No constants are dirty now */
316     }
317 }
318
319 /**
320  * Loads the texture dimensions for NP2 fixup into the currently set ARB_[vertex/fragment]_programs.
321  */
322 static void shader_arb_load_np2fixup_constants(
323     IWineD3DDevice* device,
324     char usePixelShader,
325     char useVertexShader) {
326     /* not implemented */
327 }
328
329 static inline void shader_arb_ps_local_constants(IWineD3DDeviceImpl* deviceImpl)
330 {
331     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
332     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
333     unsigned char i;
334     struct shader_arb_priv *priv = deviceImpl->shader_priv;
335     const struct arb_ps_compiled_shader *gl_shader = priv->compiled_fprog;
336
337     for(i = 0; i < gl_shader->numbumpenvmatconsts; i++)
338     {
339         int texunit = gl_shader->bumpenvmatconst[i].texunit;
340
341         /* The state manager takes care that this function is always called if the bump env matrix changes */
342         const float *data = (const float *)&stateBlock->textureState[texunit][WINED3DTSS_BUMPENVMAT00];
343         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->bumpenvmatconst[i].const_num, data));
344
345         if (gl_shader->luminanceconst[i].const_num != WINED3D_CONST_NUM_UNUSED)
346         {
347             /* WINED3DTSS_BUMPENVLSCALE and WINED3DTSS_BUMPENVLOFFSET are next to each other.
348              * point gl to the scale, and load 4 floats. x = scale, y = offset, z and w are junk, we
349              * don't care about them. The pointers are valid for sure because the stateblock is bigger.
350              * (they're WINED3DTSS_TEXTURETRANSFORMFLAGS and WINED3DTSS_ADDRESSW, so most likely 0 or NaN
351             */
352             const float *scale = (const float *)&stateBlock->textureState[texunit][WINED3DTSS_BUMPENVLSCALE];
353             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->luminanceconst[i].const_num, scale));
354         }
355     }
356     checkGLcall("Load bumpmap consts\n");
357
358     if(gl_shader->ycorrection != WINED3D_CONST_NUM_UNUSED)
359     {
360         /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
361         * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
362         * ycorrection.z: 1.0
363         * ycorrection.w: 0.0
364         */
365         float val[4];
366         val[0] = deviceImpl->render_offscreen ? 0.0 : ((IWineD3DSurfaceImpl *) deviceImpl->render_targets[0])->currentDesc.Height;
367         val[1] = deviceImpl->render_offscreen ? 1.0 : -1.0;
368         val[2] = 1.0;
369         val[3] = 0.0;
370         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->ycorrection, val));
371         checkGLcall("y correction loading\n");
372     }
373
374     if(gl_shader->num_int_consts == 0) return;
375
376     for(i = 0; i < MAX_CONST_I; i++)
377     {
378         if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
379         {
380             float val[4];
381             val[0] = stateBlock->pixelShaderConstantI[4 * i];
382             val[1] = stateBlock->pixelShaderConstantI[4 * i + 1];
383             val[2] = stateBlock->pixelShaderConstantI[4 * i + 2];
384             val[3] = -1.0;
385
386             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->int_consts[i], val));
387         }
388     }
389     checkGLcall("Load ps int consts\n");
390 }
391
392 static inline void shader_arb_vs_local_constants(IWineD3DDeviceImpl* deviceImpl)
393 {
394     IWineD3DStateBlockImpl* stateBlock;
395     const WineD3D_GL_Info *gl_info;
396     unsigned char i;
397     struct shader_arb_priv *priv = deviceImpl->shader_priv;
398     const struct arb_vs_compiled_shader *gl_shader = priv->compiled_vprog;
399
400     if(gl_shader->num_int_consts == 0) return;
401
402     gl_info = &deviceImpl->adapter->gl_info;
403     stateBlock = deviceImpl->stateBlock;
404
405     for(i = 0; i < MAX_CONST_I; i++)
406     {
407         if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
408         {
409             float val[4];
410             val[0] = stateBlock->vertexShaderConstantI[4 * i];
411             val[1] = stateBlock->vertexShaderConstantI[4 * i + 1];
412             val[2] = stateBlock->vertexShaderConstantI[4 * i + 2];
413             val[3] = -1.0;
414
415             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->int_consts[i], val));
416         }
417     }
418     checkGLcall("Load vs int consts\n");
419 }
420
421 /**
422  * Loads the app-supplied constants into the currently set ARB_[vertex/fragment]_programs.
423  * 
424  * We only support float constants in ARB at the moment, so don't 
425  * worry about the Integers or Booleans
426  */
427 /* GL locking is done by the caller (state handler) */
428 static void shader_arb_load_constants(
429     IWineD3DDevice* device,
430     char usePixelShader,
431     char useVertexShader) {
432    
433     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) device; 
434     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
435     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
436
437     if (useVertexShader) {
438         IWineD3DBaseShaderImpl* vshader = (IWineD3DBaseShaderImpl*) stateBlock->vertexShader;
439
440         /* Load DirectX 9 float constants for vertex shader */
441         deviceImpl->highest_dirty_vs_const = shader_arb_load_constantsF(
442                 vshader, gl_info, GL_VERTEX_PROGRAM_ARB,
443                 deviceImpl->highest_dirty_vs_const,
444                 stateBlock->vertexShaderConstantF,
445                 deviceImpl->activeContext->vshader_const_dirty);
446
447         /* Upload the position fixup */
448         GL_EXTCALL(glProgramEnvParameter4fvARB(GL_VERTEX_PROGRAM_ARB, ARB_SHADER_PRIVCONST_POS, deviceImpl->posFixup));
449
450         shader_arb_vs_local_constants(deviceImpl);
451     }
452
453     if (usePixelShader) {
454         IWineD3DBaseShaderImpl* pshader = (IWineD3DBaseShaderImpl*) stateBlock->pixelShader;
455
456         /* Load DirectX 9 float constants for pixel shader */
457         deviceImpl->highest_dirty_ps_const = shader_arb_load_constantsF(
458                 pshader, gl_info, GL_FRAGMENT_PROGRAM_ARB,
459                 deviceImpl->highest_dirty_ps_const,
460                 stateBlock->pixelShaderConstantF,
461                 deviceImpl->activeContext->pshader_const_dirty);
462         shader_arb_ps_local_constants(deviceImpl);
463     }
464 }
465
466 static void shader_arb_update_float_vertex_constants(IWineD3DDevice *iface, UINT start, UINT count)
467 {
468     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
469
470     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
471      * context. On a context switch the old context will be fully dirtified */
472     memset(This->activeContext->vshader_const_dirty + start, 1,
473             sizeof(*This->activeContext->vshader_const_dirty) * count);
474     This->highest_dirty_vs_const = max(This->highest_dirty_vs_const, start + count + 1);
475 }
476
477 static void shader_arb_update_float_pixel_constants(IWineD3DDevice *iface, UINT start, UINT count)
478 {
479     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
480
481     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
482      * context. On a context switch the old context will be fully dirtified */
483     memset(This->activeContext->pshader_const_dirty + start, 1,
484             sizeof(*This->activeContext->pshader_const_dirty) * count);
485     This->highest_dirty_ps_const = max(This->highest_dirty_ps_const, start + count + 1);
486 }
487
488 static DWORD *local_const_mapping(IWineD3DBaseShaderImpl *This)
489 {
490     DWORD *ret;
491     DWORD idx = 0;
492     const local_constant *lconst;
493
494     if(This->baseShader.load_local_constsF || list_empty(&This->baseShader.constantsF)) return NULL;
495
496     ret = HeapAlloc(GetProcessHeap(), 0, sizeof(DWORD) * This->baseShader.limits.constant_float);
497     if(!ret) {
498         ERR("Out of memory\n");
499         return NULL;
500     }
501
502     LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
503         ret[lconst->idx] = idx++;
504     }
505     return ret;
506 }
507
508 /* Generate the variable & register declarations for the ARB_vertex_program output target */
509 static DWORD shader_generate_arb_declarations(IWineD3DBaseShader *iface, const shader_reg_maps *reg_maps,
510         SHADER_BUFFER *buffer, const WineD3D_GL_Info *gl_info, DWORD *lconst_map, DWORD *num_clipplanes,
511         struct shader_arb_ctx_priv *ctx)
512 {
513     IWineD3DBaseShaderImpl* This = (IWineD3DBaseShaderImpl*) iface;
514     DWORD i, next_local = 0;
515     char pshader = shader_is_pshader_version(reg_maps->shader_version.type);
516     unsigned max_constantsF;
517     const local_constant *lconst;
518
519     /* In pixel shaders, all private constants are program local, we don't need anything
520      * from program.env. Thus we can advertise the full set of constants in pixel shaders.
521      * If we need a private constant the GL implementation will squeeze it in somewhere
522      *
523      * With vertex shaders we need the posFixup and on some GL implementations 4 helper
524      * immediate values. The posFixup is loaded using program.env for now, so always
525      * subtract one from the number of constants. If the shader uses indirect addressing,
526      * account for the helper const too because we have to declare all availabke d3d constants
527      * and don't know which are actually used.
528      */
529     if(pshader) {
530         max_constantsF = GL_LIMITS(pshader_constantsF);
531     } else {
532         if(This->baseShader.reg_maps.usesrelconstF) {
533             max_constantsF = GL_LIMITS(vshader_constantsF) - reserved_vs_const(iface, gl_info);
534             max_constantsF -= count_bits(This->baseShader.reg_maps.integer_constants);
535             if(ctx->target_version >= NV2)
536             {
537                 DWORD highest_constf = 0;
538                 for(i = 0; i < This->baseShader.limits.constant_float; i++)
539                 {
540                     DWORD idx = i >> 5;
541                     DWORD shift = i & 0x1f;
542                     if(reg_maps->constf[idx] & (1 << shift)) highest_constf = i;
543                 }
544
545                 *num_clipplanes = min(GL_LIMITS(clipplanes), max_constantsF - highest_constf - 1);
546                 max_constantsF -= *num_clipplanes;
547                 if(*num_clipplanes < GL_LIMITS(clipplanes))
548                 {
549                     WARN("Only %u clipplanes out of %u enabled\n", *num_clipplanes, GL_LIMITS(clipplanes));
550                 }
551             }
552         } else {
553             if(ctx->target_version >= NV2) *num_clipplanes = GL_LIMITS(clipplanes);
554             else *num_clipplanes = 0;
555             max_constantsF = GL_LIMITS(vshader_constantsF) - 1;
556         }
557     }
558
559     for(i = 0; i < This->baseShader.limits.temporary; i++) {
560         if (reg_maps->temporary[i])
561             shader_addline(buffer, "TEMP R%u;\n", i);
562     }
563
564     for (i = 0; i < This->baseShader.limits.address; i++) {
565         if (reg_maps->address[i])
566             shader_addline(buffer, "ADDRESS A%d;\n", i);
567     }
568
569     if(pshader && reg_maps->shader_version.major == 1 && reg_maps->shader_version.minor <= 3) {
570         for(i = 0; i < This->baseShader.limits.texcoord; i++) {
571             if (reg_maps->texcoord[i] && pshader)
572                 shader_addline(buffer,"TEMP T%u;\n", i);
573         }
574     }
575
576     /* Load local constants using the program-local space,
577      * this avoids reloading them each time the shader is used
578      */
579     if(lconst_map) {
580         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
581             shader_addline(buffer, "PARAM C%u = program.local[%u];\n", lconst->idx,
582                            lconst_map[lconst->idx]);
583             next_local = max(next_local, lconst_map[lconst->idx] + 1);
584         }
585     }
586
587     /* we use the array-based constants array if the local constants are marked for loading,
588      * because then we use indirect addressing, or when the local constant list is empty,
589      * because then we don't know if we're using indirect addressing or not. If we're hardcoding
590      * local constants do not declare the loaded constants as an array because ARB compilers usually
591      * do not optimize unused constants away
592      */
593     if(This->baseShader.reg_maps.usesrelconstF) {
594         /* Need to PARAM the environment parameters (constants) so we can use relative addressing */
595         shader_addline(buffer, "PARAM C[%d] = { program.env[0..%d] };\n",
596                     max_constantsF, max_constantsF - 1);
597     } else {
598         for(i = 0; i < max_constantsF; i++) {
599             DWORD idx, mask;
600             idx = i >> 5;
601             mask = 1 << (i & 0x1f);
602             if(!shader_constant_is_local(This, i) && (This->baseShader.reg_maps.constf[idx] & mask)) {
603                 shader_addline(buffer, "PARAM C%d = program.env[%d];\n",i, i);
604             }
605         }
606     }
607
608     return next_local;
609 }
610
/* Register component names, indexed by a 4 bit shift value. The trailing comment
 * of each entry gives the index and the factor it stands for (xN multiplies,
 * dN divides). Values without a usable component map to "dummy". */
static const char * const shift_tab[] =
{
    "dummy",        /*  0 (none) */
    "coefmul.x",    /*  1 (x2)   */
    "coefmul.y",    /*  2 (x4)   */
    "coefmul.z",    /*  3 (x8)   */
    "coefmul.w",    /*  4 (x16)  */
    "dummy",        /*  5 (x32)  */
    "dummy",        /*  6 (x64)  */
    "dummy",        /*  7 (x128) */
    "dummy",        /*  8 (d256) */
    "dummy",        /*  9 (d128) */
    "dummy",        /* 10 (d64)  */
    "dummy",        /* 11 (d32)  */
    "coefdiv.w",    /* 12 (d16)  */
    "coefdiv.z",    /* 13 (d8)   */
    "coefdiv.y",    /* 14 (d4)   */
    "coefdiv.x"     /* 15 (d2)   */
};
629
630 static void shader_arb_get_write_mask(const struct wined3d_shader_instruction *ins,
631         const struct wined3d_shader_dst_param *dst, char *write_mask)
632 {
633     char *ptr = write_mask;
634
635     if (dst->write_mask != WINED3DSP_WRITEMASK_ALL)
636     {
637         *ptr++ = '.';
638         if (dst->write_mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
639         if (dst->write_mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
640         if (dst->write_mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
641         if (dst->write_mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
642     }
643
644     *ptr = '\0';
645 }
646
647 static void shader_arb_get_swizzle(const struct wined3d_shader_src_param *param, BOOL fixup, char *swizzle_str)
648 {
649     /* For registers of type WINED3DDECLTYPE_D3DCOLOR, data is stored as "bgra",
650      * but addressed as "rgba". To fix this we need to swap the register's x
651      * and z components. */
652     const char *swizzle_chars = fixup ? "zyxw" : "xyzw";
653     char *ptr = swizzle_str;
654
655     /* swizzle bits fields: wwzzyyxx */
656     DWORD swizzle = param->swizzle;
657     DWORD swizzle_x = swizzle & 0x03;
658     DWORD swizzle_y = (swizzle >> 2) & 0x03;
659     DWORD swizzle_z = (swizzle >> 4) & 0x03;
660     DWORD swizzle_w = (swizzle >> 6) & 0x03;
661
662     /* If the swizzle is the default swizzle (ie, "xyzw"), we don't need to
663      * generate a swizzle string. Unless we need to our own swizzling. */
664     if (swizzle != WINED3DSP_NOSWIZZLE || fixup)
665     {
666         *ptr++ = '.';
667         if (swizzle_x == swizzle_y && swizzle_x == swizzle_z && swizzle_x == swizzle_w) {
668             *ptr++ = swizzle_chars[swizzle_x];
669         } else {
670             *ptr++ = swizzle_chars[swizzle_x];
671             *ptr++ = swizzle_chars[swizzle_y];
672             *ptr++ = swizzle_chars[swizzle_z];
673             *ptr++ = swizzle_chars[swizzle_w];
674         }
675     }
676
677     *ptr = '\0';
678 }
679
680 static void shader_arb_request_a0(const struct wined3d_shader_instruction *ins, const char *src)
681 {
682     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
683     SHADER_BUFFER *buffer = ins->ctx->buffer;
684
685     if(strcmp(priv->addr_reg, src) == 0) return;
686
687     strcpy(priv->addr_reg, src);
688     shader_addline(buffer, "ARL A0.x, %s;\n", src);
689 }
690
/* Declared ahead of its definition because shader_arb_get_register_name() calls it
 * to resolve relative addresses. */
static void shader_arb_get_src_param(
        const struct wined3d_shader_instruction *ins,
        const struct wined3d_shader_src_param *src,
        unsigned int tmpreg,
        char *outregstr);
693
694 static void shader_arb_get_register_name(const struct wined3d_shader_instruction *ins,
695         const struct wined3d_shader_register *reg, char *register_name, BOOL *is_color)
696 {
697     /* oPos, oFog and oPts in D3D */
698     static const char * const rastout_reg_names[] = {"TMP_OUT", "result.fogcoord", "result.pointsize"};
699     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
700     BOOL pshader = shader_is_pshader_version(This->baseShader.reg_maps.shader_version.type);
701     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
702
703     *is_color = FALSE;
704
705     switch (reg->type)
706     {
707         case WINED3DSPR_TEMP:
708             sprintf(register_name, "R%u", reg->idx);
709             break;
710
711         case WINED3DSPR_INPUT:
712             if (pshader)
713             {
714                 if(This->baseShader.reg_maps.shader_version.major < 3)
715                 {
716                     if (reg->idx == 0) strcpy(register_name, "fragment.color.primary");
717                     else strcpy(register_name, "fragment.color.secondary");
718                 }
719                 else
720                 {
721                     if(reg->rel_addr)
722                     {
723                         char rel_reg[50];
724                         shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
725
726                         if(strcmp(rel_reg, "**aL_emul**") == 0)
727                         {
728                             DWORD idx = ctx->aL + reg->idx;
729                             if(idx < MAX_REG_INPUT)
730                             {
731                                 strcpy(register_name, ctx->ps_input[idx]);
732                             }
733                             else
734                             {
735                                 ERR("Pixel shader input register out of bounds: %u\n", idx);
736                                 sprintf(register_name, "out_of_bounds_%u", idx);
737                             }
738                         }
739                         else if(This->baseShader.reg_maps.input_registers & 0x0300)
740                         {
741                             /* There are two ways basically:
742                              *
743                              * 1) Use the unrolling code that is used for loop emulation and unroll the loop.
744                              *    That means trouble if the loop also contains a breakc or if the control values
745                              *    aren't local constants.
746                              * 2) Generate an if block that checks if aL.y < 8, == 8 or == 9 and selects the
747                              *    source dynamically. The trouble is that we cannot simply read aL.y because it
748                              *    is an ADDRESS register. We could however push it, load .zw with a value and use
749                              *    ADAC to load the condition code register and pop it again afterwards
750                              */
751                             FIXME("Relative input register addressing with more than 8 registers\n");
752
753                             /* This is better than nothing for now */
754                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
755                         }
756                         else if(ctx->cur_ps_args->super.vp_mode != vertexshader)
757                         {
758                             /* This is problematic because we'd have to consult the ctx->ps_input strings
759                              * for where to find the varying. Some may be "0.0", others can be texcoords or
760                              * colors. This needs either a pipeline replacement to make the vertex shader feed
761                              * proper varyings, or loop unrolling
762                              *
763                              * For now use the texcoords and hope for the best
764                              */
765                             FIXME("Non-vertex shader varying input with indirect addressing\n");
766                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
767                         }
768                         else
769                         {
770                             /* D3D supports indirect addressing only with aL in loop registers. The loop instruction
771                              * pulls GL_NV_fragment_program2 in
772                              */
773                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
774                         }
775                     }
776                     else
777                     {
778                         if(reg->idx < MAX_REG_INPUT)
779                         {
780                             strcpy(register_name, ctx->ps_input[reg->idx]);
781                         }
782                         else
783                         {
784                             ERR("Pixel shader input register out of bounds: %u\n", reg->idx);
785                             sprintf(register_name, "out_of_bounds_%u", reg->idx);
786                         }
787                     }
788                 }
789             }
790             else
791             {
792                 if (ctx->cur_vs_args->super.swizzle_map & (1 << reg->idx)) *is_color = TRUE;
793                 sprintf(register_name, "vertex.attrib[%u]", reg->idx);
794             }
795             break;
796
797         case WINED3DSPR_CONST:
798             if (!pshader && reg->rel_addr)
799             {
800                 BOOL aL = FALSE;
801                 char rel_reg[50];
802                 UINT rel_offset = ((IWineD3DVertexShaderImpl *)This)->rel_offset;
803                 if(This->baseShader.reg_maps.shader_version.major < 2) {
804                     sprintf(rel_reg, "A0.x");
805                 } else {
806                     shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
807                     if(ctx->target_version == ARB) {
808                         if(strcmp(rel_reg, "**aL_emul**") == 0) {
809                             aL = TRUE;
810                         } else {
811                             shader_arb_request_a0(ins, rel_reg);
812                             sprintf(rel_reg, "A0.x");
813                         }
814                     }
815                 }
816                 if(aL)
817                     sprintf(register_name, "C[%u]", ctx->aL + reg->idx);
818                 else if (reg->idx >= rel_offset)
819                     sprintf(register_name, "C[%s + %u]", rel_reg, reg->idx - rel_offset);
820                 else
821                     sprintf(register_name, "C[%s - %u]", rel_reg, -reg->idx + rel_offset);
822             }
823             else
824             {
825                 if (This->baseShader.reg_maps.usesrelconstF)
826                     sprintf(register_name, "C[%u]", reg->idx);
827                 else
828                     sprintf(register_name, "C%u", reg->idx);
829             }
830             break;
831
832         case WINED3DSPR_TEXTURE: /* case WINED3DSPR_ADDR: */
833             if (pshader) {
834                 if(This->baseShader.reg_maps.shader_version.major == 1 &&
835                    This->baseShader.reg_maps.shader_version.minor <= 3) {
836                     /* In ps <= 1.3, Tx is a temporary register as destination to all instructions,
837                      * and as source to most instructions. For some instructions it is the texcoord
838                      * input. Those instructions know about the special use
839                      */
840                     sprintf(register_name, "T%u", reg->idx);
841                 } else {
842                     /* in ps 1.4 and 2.x Tx is always a (read-only) varying */
843                     sprintf(register_name, "fragment.texcoord[%u]", reg->idx);
844                 }
845             }
846             else
847             {
848                 if(This->baseShader.reg_maps.shader_version.major == 1 || ctx->target_version >= NV2)
849                 {
850                     sprintf(register_name, "A%u", reg->idx);
851                 }
852                 else
853                 {
854                     sprintf(register_name, "A%u_SHADOW", reg->idx);
855                 }
856             }
857             break;
858
859         case WINED3DSPR_COLOROUT:
860             if (reg->idx == 0)
861             {
862                 if(ctx->cur_ps_args->super.srgb_correction)
863                 {
864                     strcpy(register_name, "TMP_COLOR");
865                 }
866                 else
867                 {
868                     strcpy(register_name, "result.color");
869                 }
870             }
871             else
872             {
873                 /* TODO: See GL_ARB_draw_buffers */
874                 FIXME("Unsupported write to render target %u\n", reg->idx);
875                 sprintf(register_name, "unsupported_register");
876             }
877             break;
878
879         case WINED3DSPR_RASTOUT:
880             if(reg->idx == 1) sprintf(register_name, "%s", ctx->fog_output);
881             else sprintf(register_name, "%s", rastout_reg_names[reg->idx]);
882             break;
883
884         case WINED3DSPR_DEPTHOUT:
885             strcpy(register_name, "result.depth");
886             break;
887
888         case WINED3DSPR_ATTROUT:
889         /* case WINED3DSPR_OUTPUT: */
890             if (pshader) sprintf(register_name, "oD[%u]", reg->idx);
891             else strcpy(register_name, ctx->color_output[reg->idx]);
892             break;
893
894         case WINED3DSPR_TEXCRDOUT:
895             if (pshader)
896             {
897                 sprintf(register_name, "oT[%u]", reg->idx);
898             }
899             else
900             {
901                 if(This->baseShader.reg_maps.shader_version.major < 3)
902                 {
903                     sprintf(register_name, ctx->texcrd_output[reg->idx]);
904                 }
905                 else
906                 {
907                     sprintf(register_name, ctx->vs_output[reg->idx]);
908                 }
909             }
910             break;
911
912         case WINED3DSPR_LOOP:
913             if(ctx->target_version >= NV2)
914             {
915                 /* Pshader has an implicitly declared loop index counter A0.x that cannot be renamed */
916                 if(pshader) sprintf(register_name, "A0.x");
917                 else sprintf(register_name, "aL.y");
918             }
919             else
920             {
921                 /* Unfortunately this code cannot return the value of ctx->aL here. An immediate value
922                  * would be valid, but if aL is used for indexing(its only use), there's likely an offset,
923                  * thus the result would be something like C[15 + 30], which is not valid in the ARB program
924                  * grammar. So return a marker for the emulated aL and intercept it in constant and varying
925                  * indexing
926                  */
927                 sprintf(register_name, "**aL_emul**");
928             }
929
930             break;
931
932         case WINED3DSPR_CONSTINT:
933             sprintf(register_name, "I%u", reg->idx);
934             break;
935
936         case WINED3DSPR_MISCTYPE:
937             if(reg->idx == 0)
938             {
939                 sprintf(register_name, "vpos");
940             }
941             else if(reg->idx == 1)
942             {
943                 sprintf(register_name, "fragment.facing.x");
944             }
945             else
946             {
947                 FIXME("Unknown MISCTYPE register index %u\n", reg->idx);
948             }
949             break;
950
951         default:
952             FIXME("Unhandled register type %#x[%u]\n", reg->type, reg->idx);
953             sprintf(register_name, "unrecognized_register[%u]", reg->idx);
954             break;
955     }
956 }
957
958 static void shader_arb_get_dst_param(const struct wined3d_shader_instruction *ins,
959         const struct wined3d_shader_dst_param *wined3d_dst, char *str)
960 {
961     char register_name[255];
962     char write_mask[6];
963     BOOL is_color;
964
965     shader_arb_get_register_name(ins, &wined3d_dst->reg, register_name, &is_color);
966     strcpy(str, register_name);
967
968     shader_arb_get_write_mask(ins, wined3d_dst, write_mask);
969     strcat(str, write_mask);
970 }
971
972 static const char *shader_arb_get_fixup_swizzle(enum fixup_channel_source channel_source)
973 {
974     switch(channel_source)
975     {
976         case CHANNEL_SOURCE_ZERO: return "0";
977         case CHANNEL_SOURCE_ONE: return "1";
978         case CHANNEL_SOURCE_X: return "x";
979         case CHANNEL_SOURCE_Y: return "y";
980         case CHANNEL_SOURCE_Z: return "z";
981         case CHANNEL_SOURCE_W: return "w";
982         default:
983             FIXME("Unhandled channel source %#x\n", channel_source);
984             return "undefined";
985     }
986 }
987
988 static void gen_color_correction(SHADER_BUFFER *buffer, const char *reg, DWORD dst_mask,
989                                  const char *one, const char *two, struct color_fixup_desc fixup)
990 {
991     DWORD mask;
992
993     if (is_yuv_fixup(fixup))
994     {
995         enum yuv_fixup yuv_fixup = get_yuv_fixup(fixup);
996         FIXME("YUV fixup (%#x) not supported\n", yuv_fixup);
997         return;
998     }
999
1000     mask = 0;
1001     if (fixup.x_source != CHANNEL_SOURCE_X) mask |= WINED3DSP_WRITEMASK_0;
1002     if (fixup.y_source != CHANNEL_SOURCE_Y) mask |= WINED3DSP_WRITEMASK_1;
1003     if (fixup.z_source != CHANNEL_SOURCE_Z) mask |= WINED3DSP_WRITEMASK_2;
1004     if (fixup.w_source != CHANNEL_SOURCE_W) mask |= WINED3DSP_WRITEMASK_3;
1005     mask &= dst_mask;
1006
1007     if (mask)
1008     {
1009         shader_addline(buffer, "SWZ %s, %s, %s, %s, %s, %s;\n", reg, reg,
1010                 shader_arb_get_fixup_swizzle(fixup.x_source), shader_arb_get_fixup_swizzle(fixup.y_source),
1011                 shader_arb_get_fixup_swizzle(fixup.z_source), shader_arb_get_fixup_swizzle(fixup.w_source));
1012     }
1013
1014     mask = 0;
1015     if (fixup.x_sign_fixup) mask |= WINED3DSP_WRITEMASK_0;
1016     if (fixup.y_sign_fixup) mask |= WINED3DSP_WRITEMASK_1;
1017     if (fixup.z_sign_fixup) mask |= WINED3DSP_WRITEMASK_2;
1018     if (fixup.w_sign_fixup) mask |= WINED3DSP_WRITEMASK_3;
1019     mask &= dst_mask;
1020
1021     if (mask)
1022     {
1023         char reg_mask[6];
1024         char *ptr = reg_mask;
1025
1026         if (mask != WINED3DSP_WRITEMASK_ALL)
1027         {
1028             *ptr++ = '.';
1029             if (mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
1030             if (mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
1031             if (mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
1032             if (mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
1033         }
1034         *ptr = '\0';
1035
1036         shader_addline(buffer, "MAD %s%s, %s, %s, -%s;\n", reg, reg_mask, reg, two, one);
1037     }
1038 }
1039
1040 static const char *shader_arb_get_modifier(const struct wined3d_shader_instruction *ins)
1041 {
1042     DWORD mod;
1043     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1044     if (!ins->dst_count) return "";
1045
1046     mod = ins->dst[0].modifiers;
1047
1048     /* Silently ignore PARTIALPRECISION if its not supported */
1049     if(priv->target_version == ARB) mod &= ~WINED3DSPDM_PARTIALPRECISION;
1050
1051     if(mod & WINED3DSPDM_MSAMPCENTROID)
1052     {
1053         FIXME("Unhandled modifier WINED3DSPDM_MSAMPCENTROID\n");
1054         mod &= ~WINED3DSPDM_MSAMPCENTROID;
1055     }
1056
1057     switch(mod)
1058     {
1059         case WINED3DSPDM_SATURATE | WINED3DSPDM_PARTIALPRECISION:
1060             return "H_SAT";
1061
1062         case WINED3DSPDM_SATURATE:
1063             return "_SAT";
1064
1065         case WINED3DSPDM_PARTIALPRECISION:
1066             return "H";
1067
1068         case 0:
1069             return "";
1070
1071         default:
1072             FIXME("Unknown modifiers 0x%08x\n", mod);
1073             return "";
1074     }
1075 }
1076
1077 #define TEX_PROJ        0x1
1078 #define TEX_BIAS        0x2
1079 #define TEX_LOD         0x4
1080 #define TEX_DERIV       0x10
1081
1082 static void shader_hw_sample(const struct wined3d_shader_instruction *ins, DWORD sampler_idx,
1083         const char *dst_str, const char *coord_reg, WORD flags, const char *dsx, const char *dsy)
1084 {
1085     SHADER_BUFFER *buffer = ins->ctx->buffer;
1086     DWORD sampler_type = ins->ctx->reg_maps->sampler_type[sampler_idx];
1087     const char *tex_type;
1088     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1089     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) This->baseShader.device;
1090     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1091     const char *mod;
1092
1093     switch(sampler_type) {
1094         case WINED3DSTT_1D:
1095             tex_type = "1D";
1096             break;
1097
1098         case WINED3DSTT_2D:
1099             if(device->stateBlock->textures[sampler_idx] &&
1100                IWineD3DBaseTexture_GetTextureDimensions(device->stateBlock->textures[sampler_idx]) == GL_TEXTURE_RECTANGLE_ARB) {
1101                 tex_type = "RECT";
1102             } else {
1103                 tex_type = "2D";
1104             }
1105             if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
1106             {
1107                 if(priv->cur_ps_args->super.np2_fixup & (1 << sampler_idx))
1108                 {
1109                     FIXME("NP2 texcoord fixup is currently not implemented in ARB mode (use GLSL instead).\n");
1110                 }
1111             }
1112             break;
1113
1114         case WINED3DSTT_VOLUME:
1115             tex_type = "3D";
1116             break;
1117
1118         case WINED3DSTT_CUBE:
1119             tex_type = "CUBE";
1120             break;
1121
1122         default:
1123             ERR("Unexpected texture type %d\n", sampler_type);
1124             tex_type = "";
1125     }
1126
1127     /* TEX, TXL, TXD and TXP do not support the "H" modifier,
1128      * so don't use shader_arb_get_modifier
1129      */
1130     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) mod = "_SAT";
1131     else mod = "";
1132
1133     if (flags & TEX_DERIV)
1134     {
1135         if(flags & TEX_PROJ) FIXME("Projected texture sampling with custom derivates\n");
1136         if(flags & TEX_BIAS) FIXME("Biased texture sampling with custom derivates\n");
1137         shader_addline(buffer, "TXD%s %s, %s, %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg,
1138                        dsx, dsy,sampler_idx, tex_type);
1139     }
1140     else if(flags & TEX_LOD)
1141     {
1142         if(flags & TEX_PROJ) FIXME("Projected texture sampling with explicit lod\n");
1143         if(flags & TEX_BIAS) FIXME("Biased texture sampling with explicit lod\n");
1144         shader_addline(buffer, "TXL%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg,
1145                        sampler_idx, tex_type);
1146     }
1147     else if (flags & TEX_BIAS)
1148     {
1149         /* Shouldn't be possible, but let's check for it */
1150         if(flags & TEX_PROJ) FIXME("Biased and Projected texture sampling\n");
1151         /* TXB takes the 4th component of the source vector automatically, as d3d. Nothing more to do */
1152         shader_addline(buffer, "TXB%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1153     }
1154     else if (flags & TEX_PROJ)
1155     {
1156         shader_addline(buffer, "TXP%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1157     }
1158     else
1159     {
1160         shader_addline(buffer, "TEX%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1161     }
1162
1163     if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
1164     {
1165         gen_color_correction(buffer, dst_str, ins->dst[0].write_mask,
1166                 "one", "coefmul.x", priv->cur_ps_args->super.color_fixup[sampler_idx]);
1167     }
1168 }
1169
1170 static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
1171         const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr)
1172 {
1173     /* Generate a line that does the input modifier computation and return the input register to use */
1174     BOOL is_color = FALSE;
1175     char regstr[256];
1176     char swzstr[20];
1177     int insert_line;
1178     SHADER_BUFFER *buffer = ins->ctx->buffer;
1179     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1180
1181     /* Assume a new line will be added */
1182     insert_line = 1;
1183
1184     /* Get register name */
1185     shader_arb_get_register_name(ins, &src->reg, regstr, &is_color);
1186     shader_arb_get_swizzle(src, is_color, swzstr);
1187
1188     switch (src->modifiers)
1189     {
1190     case WINED3DSPSM_NONE:
1191         sprintf(outregstr, "%s%s", regstr, swzstr);
1192         insert_line = 0;
1193         break;
1194     case WINED3DSPSM_NEG:
1195         sprintf(outregstr, "-%s%s", regstr, swzstr);
1196         insert_line = 0;
1197         break;
1198     case WINED3DSPSM_BIAS:
1199         shader_addline(buffer, "ADD T%c, %s, -coefdiv.x;\n", 'A' + tmpreg, regstr);
1200         break;
1201     case WINED3DSPSM_BIASNEG:
1202         shader_addline(buffer, "ADD T%c, -%s, coefdiv.x;\n", 'A' + tmpreg, regstr);
1203         break;
1204     case WINED3DSPSM_SIGN:
1205         shader_addline(buffer, "MAD T%c, %s, coefmul.x, -one.x;\n", 'A' + tmpreg, regstr);
1206         break;
1207     case WINED3DSPSM_SIGNNEG:
1208         shader_addline(buffer, "MAD T%c, %s, -coefmul.x, one.x;\n", 'A' + tmpreg, regstr);
1209         break;
1210     case WINED3DSPSM_COMP:
1211         shader_addline(buffer, "SUB T%c, one.x, %s;\n", 'A' + tmpreg, regstr);
1212         break;
1213     case WINED3DSPSM_X2:
1214         shader_addline(buffer, "ADD T%c, %s, %s;\n", 'A' + tmpreg, regstr, regstr);
1215         break;
1216     case WINED3DSPSM_X2NEG:
1217         shader_addline(buffer, "ADD T%c, -%s, -%s;\n", 'A' + tmpreg, regstr, regstr);
1218         break;
1219     case WINED3DSPSM_DZ:
1220         shader_addline(buffer, "RCP T%c, %s.z;\n", 'A' + tmpreg, regstr);
1221         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1222         break;
1223     case WINED3DSPSM_DW:
1224         shader_addline(buffer, "RCP T%c, %s.w;\n", 'A' + tmpreg, regstr);
1225         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1226         break;
1227     case WINED3DSPSM_ABS:
1228         if(ctx->target_version >= NV2) {
1229             sprintf(outregstr, "|%s%s|", regstr, swzstr);
1230             insert_line = 0;
1231         } else {
1232             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
1233         }
1234         break;
1235     case WINED3DSPSM_ABSNEG:
1236         if(ctx->target_version >= NV2) {
1237             sprintf(outregstr, "-|%s%s|", regstr, swzstr);
1238         } else {
1239             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
1240             sprintf(outregstr, "-T%c%s", 'A' + tmpreg, swzstr);
1241         }
1242         insert_line = 0;
1243         break;
1244     default:
1245         sprintf(outregstr, "%s%s", regstr, swzstr);
1246         insert_line = 0;
1247     }
1248
1249     /* Return modified or original register, with swizzle */
1250     if (insert_line)
1251         sprintf(outregstr, "T%c%s", 'A' + tmpreg, swzstr);
1252 }
1253
1254 static void pshader_hw_bem(const struct wined3d_shader_instruction *ins)
1255 {
1256     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1257     SHADER_BUFFER *buffer = ins->ctx->buffer;
1258     char dst_name[50];
1259     char src_name[2][50];
1260     DWORD sampler_code = dst->reg.idx;
1261
1262     shader_arb_get_dst_param(ins, dst, dst_name);
1263
1264     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1265      *
1266      * Keep in mind that src_name[1] can be "TB" and src_name[0] can be "TA" because modifiers like _x2 are valid
1267      * with bem. So delay loading the first parameter until after the perturbation calculation which needs two
1268      * temps is done.
1269      */
1270     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1271     shader_addline(buffer, "SWZ TA, bumpenvmat%d, x, z, 0, 0;\n", sampler_code);
1272     shader_addline(buffer, "DP3 TC.r, TA, %s;\n", src_name[1]);
1273     shader_addline(buffer, "SWZ TA, bumpenvmat%d, y, w, 0, 0;\n", sampler_code);
1274     shader_addline(buffer, "DP3 TC.g, TA, %s;\n", src_name[1]);
1275
1276     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1277     shader_addline(buffer, "ADD %s, %s, TC;\n", dst_name, src_name[0]);
1278 }
1279
1280 static void pshader_hw_cnd(const struct wined3d_shader_instruction *ins)
1281 {
1282     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1283     SHADER_BUFFER *buffer = ins->ctx->buffer;
1284     char dst_name[50];
1285     char src_name[3][50];
1286     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1287             ins->ctx->reg_maps->shader_version.minor);
1288     BOOL is_color;
1289
1290     shader_arb_get_dst_param(ins, dst, dst_name);
1291     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1292
1293     /* The coissue flag changes the semantic of the cnd instruction in <= 1.3 shaders */
1294     if (shader_version <= WINED3D_SHADER_VERSION(1, 3) && ins->coissue)
1295     {
1296         shader_addline(buffer, "MOV%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[1]);
1297     } else {
1298         shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1299         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1300         shader_addline(buffer, "ADD TA, -%s, coefdiv.x;\n", src_name[0]);
1301         /* No modifiers supported on CMP */
1302         shader_addline(buffer, "CMP %s, TA, %s, %s;\n", dst_name, src_name[1], src_name[2]);
1303
1304         /* _SAT on CMP doesn't make much sense, but it is not a pure NOP */
1305         if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
1306         {
1307             shader_arb_get_register_name(ins, &dst->reg, src_name[0], &is_color);
1308             shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, dst_name);
1309         }
1310     }
1311 }
1312
1313 static void pshader_hw_cmp(const struct wined3d_shader_instruction *ins)
1314 {
1315     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1316     SHADER_BUFFER *buffer = ins->ctx->buffer;
1317     char dst_name[50];
1318     char src_name[3][50];
1319     BOOL is_color;
1320
1321     shader_arb_get_dst_param(ins, dst, dst_name);
1322
1323     /* Generate input register names (with modifiers) */
1324     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1325     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1326     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1327
1328     /* No modifiers are supported on CMP */
1329     shader_addline(buffer, "CMP %s, %s, %s, %s;\n", dst_name,
1330                    src_name[0], src_name[2], src_name[1]);
1331
1332     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
1333     {
1334         shader_arb_get_register_name(ins, &dst->reg, src_name[0], &is_color);
1335         shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name[0]);
1336     }
1337 }
1338
1339 /** Process the WINED3DSIO_DP2ADD instruction in ARB.
1340  * dst = dot2(src0, src1) + src2 */
1341 static void pshader_hw_dp2add(const struct wined3d_shader_instruction *ins)
1342 {
1343     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1344     SHADER_BUFFER *buffer = ins->ctx->buffer;
1345     char dst_name[50];
1346     char src_name[3][50];
1347     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1348
1349     shader_arb_get_dst_param(ins, dst, dst_name);
1350     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1351     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1352
1353     if(ctx->target_version >= NV3)
1354     {
1355         /* GL_NV_fragment_program2 has a 1:1 matching instruction */
1356         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1357         shader_addline(buffer, "DP2A%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1358                        dst_name, src_name[0], src_name[1], src_name[2]);
1359     }
1360     else if(ctx->target_version >= NV2)
1361     {
1362         /* dst.x = src2.?, src0.x, src1.x + src0.y * src1.y
1363          * dst.y = src2.?, src0.x, src1.z + src0.y * src1.w
1364          * dst.z = src2.?, src0.x, src1.x + src0.y * src1.y
1365          * dst.z = src2.?, src0.x, src1.z + src0.y * src1.w
1366          *
1367          * Make sure that src1.zw = src1.xy, then we get a classic dp2add
1368          *
1369          * .xyxy and other swizzles that we could get with this are not valid in
1370          * plain ARBfp, but luckily the NV extension grammar lifts this limitation.
1371          */
1372         struct wined3d_shader_src_param tmp_param = ins->src[1];
1373         DWORD swizzle = tmp_param.swizzle & 0xf; /* Selects .xy */
1374         tmp_param.swizzle = swizzle | (swizzle << 4); /* Creates .xyxy */
1375
1376         shader_arb_get_src_param(ins, &tmp_param, 1, src_name[1]);
1377
1378         shader_addline(buffer, "X2D%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1379                        dst_name, src_name[2], src_name[0], src_name[1]);
1380     }
1381     else
1382     {
1383         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1384         /* Emulate a DP2 with a DP3 and 0.0. Don't use the dest as temp register, it could be src[1] or src[2]
1385         * src_name[0] can be TA, but TA is a private temp for modifiers, so it is save to overwrite
1386         */
1387         shader_addline(buffer, "MOV TA, %s;\n", src_name[0]);
1388         shader_addline(buffer, "MOV TA.z, 0.0;\n");
1389         shader_addline(buffer, "DP3 TA, TA, %s;\n", src_name[1]);
1390         shader_addline(buffer, "ADD%s %s, TA, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[2]);
1391     }
1392 }
1393
1394 /* Map the opcode 1-to-1 to the GL code */
1395 static void shader_hw_map2gl(const struct wined3d_shader_instruction *ins)
1396 {
1397     SHADER_BUFFER *buffer = ins->ctx->buffer;
1398     const char *instruction;
1399     char arguments[256], dst_str[50];
1400     unsigned int i;
1401     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1402
1403     switch (ins->handler_idx)
1404     {
1405         case WINED3DSIH_ABS: instruction = "ABS"; break;
1406         case WINED3DSIH_ADD: instruction = "ADD"; break;
1407         case WINED3DSIH_CRS: instruction = "XPD"; break;
1408         case WINED3DSIH_DP3: instruction = "DP3"; break;
1409         case WINED3DSIH_DP4: instruction = "DP4"; break;
1410         case WINED3DSIH_DST: instruction = "DST"; break;
1411         case WINED3DSIH_EXP: instruction = "EX2"; break;
1412         case WINED3DSIH_EXPP: instruction = "EXP"; break;
1413         case WINED3DSIH_FRC: instruction = "FRC"; break;
1414         case WINED3DSIH_LIT: instruction = "LIT"; break;
1415         case WINED3DSIH_LOG: instruction = "LG2"; break;
1416         case WINED3DSIH_LOGP: instruction = "LOG"; break;
1417         case WINED3DSIH_LRP: instruction = "LRP"; break;
1418         case WINED3DSIH_MAD: instruction = "MAD"; break;
1419         case WINED3DSIH_MAX: instruction = "MAX"; break;
1420         case WINED3DSIH_MIN: instruction = "MIN"; break;
1421         case WINED3DSIH_MOV: instruction = "MOV"; break;
1422         case WINED3DSIH_MUL: instruction = "MUL"; break;
1423         case WINED3DSIH_POW: instruction = "POW"; break;
1424         case WINED3DSIH_SGE: instruction = "SGE"; break;
1425         case WINED3DSIH_SLT: instruction = "SLT"; break;
1426         case WINED3DSIH_SUB: instruction = "SUB"; break;
1427         case WINED3DSIH_MOVA:instruction = "ARR"; break;
1428         case WINED3DSIH_SGN: instruction = "SSG"; break;
1429         case WINED3DSIH_DSX: instruction = "DDX"; break;
1430         default: instruction = "";
1431             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1432             break;
1433     }
1434
1435     /* Note that shader_arb_add_dst_param() adds spaces. */
1436     arguments[0] = '\0';
1437     shader_arb_get_dst_param(ins, dst, dst_str);
1438     for (i = 0; i < ins->src_count; ++i)
1439     {
1440         char operand[100];
1441         strcat(arguments, ", ");
1442         shader_arb_get_src_param(ins, &ins->src[i], i, operand);
1443         strcat(arguments, operand);
1444     }
1445     shader_addline(buffer, "%s%s %s%s;\n", instruction, shader_arb_get_modifier(ins), dst_str, arguments);
1446 }
1447
1448 static void shader_hw_nop(const struct wined3d_shader_instruction *ins)
1449 {
1450     SHADER_BUFFER *buffer = ins->ctx->buffer;
1451     shader_addline(buffer, "NOP;\n");
1452 }
1453
1454 static void shader_hw_mov(const struct wined3d_shader_instruction *ins)
1455 {
1456     IWineD3DBaseShaderImpl *shader = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1457     BOOL pshader = shader_is_pshader_version(shader->baseShader.reg_maps.shader_version.type);
1458     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1459
1460     SHADER_BUFFER *buffer = ins->ctx->buffer;
1461     char src0_param[256];
1462
1463     if(ins->handler_idx == WINED3DSIH_MOVA) {
1464         struct wined3d_shader_src_param tmp_src = ins->src[0];
1465         char write_mask[6];
1466
1467         if(ctx->target_version >= NV2) {
1468             shader_hw_map2gl(ins);
1469             return;
1470         }
1471         tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1472         shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1473         shader_arb_get_write_mask(ins, &ins->dst[0], write_mask);
1474
1475         /* This implements the mova formula used in GLSL. The first two instructions
1476          * prepare the sign() part. Note that it is fine to have my_sign(0.0) = 1.0
1477          * in this case:
1478          * mova A0.x, 0.0
1479          *
1480          * A0.x = arl(floor(abs(0.0) + 0.5) * 1.0) = floor(0.5) = 0.0 since arl does a floor
1481          *
1482          * The ARL is performed when A0 is used - the requested component is read from A0_SHADOW into
1483          * A0.x. We can use the overwritten component of A0_shadow as temporary storage for the sign.
1484          */
1485         shader_addline(buffer, "SGE A0_SHADOW%s, %s, mova_const.y;\n", write_mask, src0_param);
1486         shader_addline(buffer, "MAD A0_SHADOW%s, A0_SHADOW, mova_const.z, -mova_const.w;\n", write_mask);
1487
1488         shader_addline(buffer, "ABS TA%s, %s;\n", write_mask, src0_param);
1489         shader_addline(buffer, "ADD TA%s, TA, mova_const.x;\n", write_mask);
1490         shader_addline(buffer, "FLR TA%s, TA;\n", write_mask);
1491         shader_addline(buffer, "MUL A0_SHADOW%s, TA, A0_SHADOW;\n", write_mask);
1492
1493         ((struct shader_arb_ctx_priv *)ins->ctx->backend_data)->addr_reg[0] = '\0';
1494     } else if (ins->ctx->reg_maps->shader_version.major == 1
1495           && !shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)
1496           && ins->dst[0].reg.type == WINED3DSPR_ADDR)
1497     {
1498         src0_param[0] = '\0';
1499         if (((IWineD3DVertexShaderImpl *)shader)->rel_offset)
1500         {
1501             shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1502             shader_addline(buffer, "ADD TA.x, %s, helper_const.z;\n", src0_param);
1503             shader_addline(buffer, "ARL A0.x, TA.x;\n");
1504         }
1505         else
1506         {
1507             /* Apple's ARB_vertex_program implementation does not accept an ARL source argument
1508              * with more than one component. Thus replicate the first source argument over all
1509              * 4 components. For example, .xyzw -> .x (or better: .xxxx), .zwxy -> .z, etc) */
1510             struct wined3d_shader_src_param tmp_src = ins->src[0];
1511             tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1512             shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1513             shader_addline(buffer, "ARL A0.x, %s;\n", src0_param);
1514         }
1515     }
1516     else if(ins->dst[0].reg.type == WINED3DSPR_COLOROUT && ins->dst[0].reg.idx == 0 && pshader)
1517     {
1518         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) shader;
1519         if(ctx->cur_ps_args->super.srgb_correction && ps->color0_mov)
1520         {
1521             shader_addline(buffer, "#mov handled in srgb write code\n");
1522             return;
1523         }
1524         shader_hw_map2gl(ins);
1525     }
1526     else
1527     {
1528         shader_hw_map2gl(ins);
1529     }
1530 }
1531
1532 static void pshader_hw_texkill(const struct wined3d_shader_instruction *ins)
1533 {
1534     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1535     SHADER_BUFFER *buffer = ins->ctx->buffer;
1536     char reg_dest[40];
1537
1538     /* No swizzles are allowed in d3d's texkill. PS 1.x ignores the 4th component as documented,
1539      * but >= 2.0 honors it(undocumented, but tested by the d3d9 testsuit)
1540      */
1541     shader_arb_get_dst_param(ins, dst, reg_dest);
1542
1543     if (ins->ctx->reg_maps->shader_version.major >= 2)
1544     {
1545         /* The arb backend doesn't claim ps 2.0 support, but try to eat what the app feeds to us */
1546         shader_arb_get_dst_param(ins, dst, reg_dest);
1547         shader_addline(buffer, "KIL %s;\n", reg_dest);
1548     } else {
1549         /* ARB fp doesn't like swizzles on the parameter of the KIL instruction. To mask the 4th component,
1550          * copy the register into our general purpose TMP variable, overwrite .w and pass TMP to KIL
1551          *
1552          * ps_1_3 shaders use the texcoord incarnation of the Tx register. ps_1_4 shaders can use the same,
1553          * or pass in any temporary register(in shader phase 2)
1554          */
1555         if(ins->ctx->reg_maps->shader_version.minor <= 3) {
1556             sprintf(reg_dest, "fragment.texcoord[%u]", dst->reg.idx);
1557         } else {
1558             shader_arb_get_dst_param(ins, dst, reg_dest);
1559         }
1560         shader_addline(buffer, "SWZ TA, %s, x, y, z, 1;\n", reg_dest);
1561         shader_addline(buffer, "KIL TA;\n");
1562     }
1563 }
1564
1565 static void pshader_hw_tex(const struct wined3d_shader_instruction *ins)
1566 {
1567     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1568     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1569     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1570     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1571             ins->ctx->reg_maps->shader_version.minor);
1572     struct wined3d_shader_src_param src;
1573
1574     char reg_dest[40];
1575     char reg_coord[40];
1576     DWORD reg_sampler_code;
1577     DWORD myflags = 0;
1578
1579     /* All versions have a destination register */
1580     shader_arb_get_dst_param(ins, dst, reg_dest);
1581
1582     /* 1.0-1.4: Use destination register number as texture code.
1583        2.0+: Use provided sampler number as texure code. */
1584     if (shader_version < WINED3D_SHADER_VERSION(2,0))
1585         reg_sampler_code = dst->reg.idx;
1586     else
1587         reg_sampler_code = ins->src[1].reg.idx;
1588
1589     /* 1.0-1.3: Use the texcoord varying.
1590        1.4+: Use provided coordinate source register. */
1591     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1592         sprintf(reg_coord, "fragment.texcoord[%u]", reg_sampler_code);
1593     else {
1594         /* TEX is the only instruction that can handle DW and DZ natively */
1595         src = ins->src[0];
1596         if(src.modifiers == WINED3DSPSM_DW) src.modifiers = WINED3DSPSM_NONE;
1597         if(src.modifiers == WINED3DSPSM_DZ) src.modifiers = WINED3DSPSM_NONE;
1598         shader_arb_get_src_param(ins, &src, 0, reg_coord);
1599     }
1600
1601     /* projection flag:
1602      * 1.1, 1.2, 1.3: Use WINED3DTSS_TEXTURETRANSFORMFLAGS
1603      * 1.4: Use WINED3DSPSM_DZ or WINED3DSPSM_DW on src[0]
1604      * 2.0+: Use WINED3DSI_TEXLD_PROJECT on the opcode
1605      */
1606     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1607     {
1608         DWORD flags = 0;
1609         if(reg_sampler_code < MAX_TEXTURES) {
1610             flags = deviceImpl->stateBlock->textureState[reg_sampler_code][WINED3DTSS_TEXTURETRANSFORMFLAGS];
1611         }
1612         if (flags & WINED3DTTFF_PROJECTED) {
1613             myflags |= TEX_PROJ;
1614         }
1615     }
1616     else if (shader_version < WINED3D_SHADER_VERSION(2,0))
1617     {
1618         DWORD src_mod = ins->src[0].modifiers;
1619         if (src_mod == WINED3DSPSM_DZ) {
1620             /* TXP cannot handle DZ natively, so move the z coordinate to .w. reg_coord is a read-only
1621              * varying register, so we need a temp reg
1622              */
1623             shader_addline(ins->ctx->buffer, "SWZ TA, %s, x, y, z, z;\n", reg_coord);
1624             strcpy(reg_coord, "TA");
1625             myflags |= TEX_PROJ;
1626         } else if(src_mod == WINED3DSPSM_DW) {
1627             myflags |= TEX_PROJ;
1628         }
1629     } else {
1630         if (ins->flags & WINED3DSI_TEXLD_PROJECT) myflags |= TEX_PROJ;
1631         if (ins->flags & WINED3DSI_TEXLD_BIAS) myflags |= TEX_BIAS;
1632     }
1633     shader_hw_sample(ins, reg_sampler_code, reg_dest, reg_coord, myflags, NULL, NULL);
1634 }
1635
1636 static void pshader_hw_texcoord(const struct wined3d_shader_instruction *ins)
1637 {
1638     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1639     SHADER_BUFFER *buffer = ins->ctx->buffer;
1640     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1641             ins->ctx->reg_maps->shader_version.minor);
1642     char dst_str[50];
1643
1644     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1645     {
1646         DWORD reg = dst->reg.idx;
1647
1648         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1649         shader_addline(buffer, "MOV_SAT %s, fragment.texcoord[%u];\n", dst_str, reg);
1650     } else {
1651         char reg_src[40];
1652
1653         shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src);
1654         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1655         shader_addline(buffer, "MOV %s, %s;\n", dst_str, reg_src);
1656    }
1657 }
1658
1659 static void pshader_hw_texreg2ar(const struct wined3d_shader_instruction *ins)
1660 {
1661      SHADER_BUFFER *buffer = ins->ctx->buffer;
1662      IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1663      IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1664      DWORD flags;
1665
1666      DWORD reg1 = ins->dst[0].reg.idx;
1667      char dst_str[50];
1668      char src_str[50];
1669
1670      /* Note that texreg2ar treats Tx as a temporary register, not as a varying */
1671      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1672      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1673      /* Move .x first in case src_str is "TA" */
1674      shader_addline(buffer, "MOV TA.y, %s.x;\n", src_str);
1675      shader_addline(buffer, "MOV TA.x, %s.w;\n", src_str);
1676      flags = reg1 < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg1][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1677      shader_hw_sample(ins, reg1, dst_str, "TA", flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1678 }
1679
1680 static void pshader_hw_texreg2gb(const struct wined3d_shader_instruction *ins)
1681 {
1682      SHADER_BUFFER *buffer = ins->ctx->buffer;
1683
1684      DWORD reg1 = ins->dst[0].reg.idx;
1685      char dst_str[50];
1686      char src_str[50];
1687
1688      /* Note that texreg2gb treats Tx as a temporary register, not as a varying */
1689      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1690      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1691      shader_addline(buffer, "MOV TA.x, %s.y;\n", src_str);
1692      shader_addline(buffer, "MOV TA.y, %s.z;\n", src_str);
1693      shader_hw_sample(ins, reg1, dst_str, "TA", 0, NULL, NULL);
1694 }
1695
1696 static void pshader_hw_texreg2rgb(const struct wined3d_shader_instruction *ins)
1697 {
1698     DWORD reg1 = ins->dst[0].reg.idx;
1699     char dst_str[50];
1700     char src_str[50];
1701
1702     /* Note that texreg2rg treats Tx as a temporary register, not as a varying */
1703     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1704     shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1705     shader_hw_sample(ins, reg1, dst_str, src_str, 0, NULL, NULL);
1706 }
1707
1708 static void pshader_hw_texbem(const struct wined3d_shader_instruction *ins)
1709 {
1710     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1711     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1712     SHADER_BUFFER *buffer = ins->ctx->buffer;
1713     char reg_coord[40], dst_reg[50], src_reg[50];
1714     DWORD reg_dest_code;
1715
1716     /* All versions have a destination register. The Tx where the texture coordinates come
1717      * from is the varying incarnation of the texture register
1718      */
1719     reg_dest_code = dst->reg.idx;
1720     shader_arb_get_dst_param(ins, &ins->dst[0], dst_reg);
1721     shader_arb_get_src_param(ins, &ins->src[0], 0, src_reg);
1722     sprintf(reg_coord, "fragment.texcoord[%u]", reg_dest_code);
1723
1724     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1725      * The Tx in which the perturbation map is stored is the tempreg incarnation of the texture register
1726      *
1727      * GL_NV_fragment_program_option could handle this in one instruction via X2D:
1728      * X2D TA.xy, fragment.texcoord, T%u, bumpenvmat%u.xzyw
1729      *
1730      * However, the NV extensions are never enabled for <= 2.0 shaders because of the performance penalty that
1731      * comes with it, and texbem is an 1.x only instruction. No 1.x instruction forces us to enable the NV
1732      * extension.
1733      */
1734     shader_addline(buffer, "SWZ TB, bumpenvmat%d, x, z, 0, 0;\n", reg_dest_code);
1735     shader_addline(buffer, "DP3 TA.x, TB, %s;\n", src_reg);
1736     shader_addline(buffer, "SWZ TB, bumpenvmat%d, y, w, 0, 0;\n", reg_dest_code);
1737     shader_addline(buffer, "DP3 TA.y, TB, %s;\n", src_reg);
1738
1739     /* with projective textures, texbem only divides the static texture coord, not the displacement,
1740      * so we can't let the GL handle this.
1741      */
1742     if (((IWineD3DDeviceImpl*) This->baseShader.device)->stateBlock->textureState[reg_dest_code][WINED3DTSS_TEXTURETRANSFORMFLAGS]
1743             & WINED3DTTFF_PROJECTED) {
1744         shader_addline(buffer, "RCP TB.w, %s.w;\n", reg_coord);
1745         shader_addline(buffer, "MUL TB.xy, %s, TB.w;\n", reg_coord);
1746         shader_addline(buffer, "ADD TA.xy, TA, TB;\n");
1747     } else {
1748         shader_addline(buffer, "ADD TA.xy, TA, %s;\n", reg_coord);
1749     }
1750
1751     shader_hw_sample(ins, reg_dest_code, dst_reg, "TA", 0, NULL, NULL);
1752
1753     if (ins->handler_idx == WINED3DSIH_TEXBEML)
1754     {
1755         /* No src swizzles are allowed, so this is ok */
1756         shader_addline(buffer, "MAD TA, %s.z, luminance%d.x, luminance%d.y;\n",
1757                        src_reg, reg_dest_code, reg_dest_code);
1758         shader_addline(buffer, "MUL %s, %s, TA;\n", dst_reg, dst_reg);
1759     }
1760 }
1761
1762 static void pshader_hw_texm3x2pad(const struct wined3d_shader_instruction *ins)
1763 {
1764     DWORD reg = ins->dst[0].reg.idx;
1765     SHADER_BUFFER *buffer = ins->ctx->buffer;
1766     char src0_name[50], dst_name[50];
1767     BOOL is_color;
1768     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1769
1770     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1771     /* The next instruction will be a texm3x2tex or texm3x2depth that writes to the uninitialized
1772      * T<reg+1> register. Use this register to store the calculated vector
1773      */
1774     tmp_reg.idx = reg + 1;
1775     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1776     shader_addline(buffer, "DP3 %s.x, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1777 }
1778
1779 static void pshader_hw_texm3x2tex(const struct wined3d_shader_instruction *ins)
1780 {
1781     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1782     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1783     DWORD flags;
1784     DWORD reg = ins->dst[0].reg.idx;
1785     SHADER_BUFFER *buffer = ins->ctx->buffer;
1786     char dst_str[50];
1787     char src0_name[50];
1788     char dst_reg[50];
1789     BOOL is_color;
1790
1791     /* We know that we're writing to the uninitialized T<reg> register, so use it for temporary storage */
1792     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1793
1794     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1795     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1796     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1797     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1798     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1799 }
1800
1801 static void pshader_hw_texm3x3pad(const struct wined3d_shader_instruction *ins)
1802 {
1803     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1804     DWORD reg = ins->dst[0].reg.idx;
1805     SHADER_BUFFER *buffer = ins->ctx->buffer;
1806     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1807     char src0_name[50], dst_name[50];
1808     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1809     BOOL is_color;
1810
1811     /* There are always 2 texm3x3pad instructions followed by one texm3x3[tex,vspec, ...] instruction, with
1812      * incrementing ins->dst[0].register_idx numbers. So the pad instruction already knows the final destination
1813      * register, and this register is uninitialized(otherwise the assembler complains that it is 'redeclared')
1814      */
1815     tmp_reg.idx = reg + 2 - current_state->current_row;
1816     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1817
1818     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1819     shader_addline(buffer, "DP3 %s.%c, fragment.texcoord[%u], %s;\n",
1820                    dst_name, 'x' + current_state->current_row, reg, src0_name);
1821     current_state->texcoord_w[current_state->current_row++] = reg;
1822 }
1823
1824 static void pshader_hw_texm3x3tex(const struct wined3d_shader_instruction *ins)
1825 {
1826     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1827     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1828     DWORD flags;
1829     DWORD reg = ins->dst[0].reg.idx;
1830     SHADER_BUFFER *buffer = ins->ctx->buffer;
1831     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1832     char dst_str[50];
1833     char src0_name[50], dst_name[50];
1834     BOOL is_color;
1835
1836     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
1837     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1838     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1839
1840     /* Sample the texture using the calculated coordinates */
1841     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1842     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1843     shader_hw_sample(ins, reg, dst_str, dst_name, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1844     current_state->current_row = 0;
1845 }
1846
1847 static void pshader_hw_texm3x3vspec(const struct wined3d_shader_instruction *ins)
1848 {
1849     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1850     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1851     DWORD flags;
1852     DWORD reg = ins->dst[0].reg.idx;
1853     SHADER_BUFFER *buffer = ins->ctx->buffer;
1854     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1855     char dst_str[50];
1856     char src0_name[50];
1857     char dst_reg[8];
1858     BOOL is_color;
1859
1860     /* Get the dst reg without writemask strings. We know this register is uninitialized, so we can use all
1861      * components for temporary data storage
1862      */
1863     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1864     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1865     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1866
1867     /* Construct the eye-ray vector from w coordinates */
1868     shader_addline(buffer, "MOV TB.x, fragment.texcoord[%u].w;\n", current_state->texcoord_w[0]);
1869     shader_addline(buffer, "MOV TB.y, fragment.texcoord[%u].w;\n", current_state->texcoord_w[1]);
1870     shader_addline(buffer, "MOV TB.z, fragment.texcoord[%u].w;\n", reg);
1871
1872     /* Calculate reflection vector
1873      */
1874     shader_addline(buffer, "DP3 %s.w, %s, TB;\n", dst_reg, dst_reg);
1875     /* The .w is ignored when sampling, so I can use TB.w to calculate dot(N, N) */
1876     shader_addline(buffer, "DP3 TB.w, %s, %s;\n", dst_reg, dst_reg);
1877     shader_addline(buffer, "RCP TB.w, TB.w;\n");
1878     shader_addline(buffer, "MUL %s.w, %s.w, TB.w;\n", dst_reg, dst_reg);
1879     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
1880     shader_addline(buffer, "MAD %s, coefmul.x, %s, -TB;\n", dst_reg, dst_reg);
1881
1882     /* Sample the texture using the calculated coordinates */
1883     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1884     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1885     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1886     current_state->current_row = 0;
1887 }
1888
1889 static void pshader_hw_texm3x3spec(const struct wined3d_shader_instruction *ins)
1890 {
1891     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1892     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1893     DWORD flags;
1894     DWORD reg = ins->dst[0].reg.idx;
1895     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1896     SHADER_BUFFER *buffer = ins->ctx->buffer;
1897     char dst_str[50];
1898     char src0_name[50];
1899     char src1_name[50];
1900     char dst_reg[8];
1901     BOOL is_color;
1902
1903     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1904     shader_arb_get_src_param(ins, &ins->src[0], 1, src1_name);
1905     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1906     /* Note: dst_reg.xy is input here, generated by two texm3x3pad instructions */
1907     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1908
1909     /* Calculate reflection vector.
1910      *
1911      *                   dot(N, E)
1912      * dst_reg.xyz = 2 * --------- * N - E
1913      *                   dot(N, N)
1914      *
1915      * Which normalizes the normal vector
1916      */
1917     shader_addline(buffer, "DP3 %s.w, %s, %s;\n", dst_reg, dst_reg, src1_name);
1918     shader_addline(buffer, "DP3 TC.w, %s, %s;\n", dst_reg, dst_reg);
1919     shader_addline(buffer, "RCP TC.w, TC.w;\n");
1920     shader_addline(buffer, "MUL %s.w, %s.w, TC.w;\n", dst_reg, dst_reg);
1921     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
1922     shader_addline(buffer, "MAD %s, coefmul.x, %s, -%s;\n", dst_reg, dst_reg, src1_name);
1923
1924     /* Sample the texture using the calculated coordinates */
1925     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1926     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1927     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1928     current_state->current_row = 0;
1929 }
1930
1931 static void pshader_hw_texdepth(const struct wined3d_shader_instruction *ins)
1932 {
1933     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1934     SHADER_BUFFER *buffer = ins->ctx->buffer;
1935     char dst_name[50];
1936
1937     /* texdepth has an implicit destination, the fragment depth value. It's only parameter,
1938      * which is essentially an input, is the destination register because it is the first
1939      * parameter. According to the msdn, this must be register r5, but let's keep it more flexible
1940      * here(writemasks/swizzles are not valid on texdepth)
1941      */
1942     shader_arb_get_dst_param(ins, dst, dst_name);
1943
1944     /* According to the msdn, the source register(must be r5) is unusable after
1945      * the texdepth instruction, so we're free to modify it
1946      */
1947     shader_addline(buffer, "MIN %s.y, %s.y, one.y;\n", dst_name, dst_name);
1948
1949     /* How to deal with the special case dst_name.g == 0? if r != 0, then
1950      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
1951      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
1952      */
1953     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
1954     shader_addline(buffer, "MUL TA.x, %s.x, %s.y;\n", dst_name, dst_name);
1955     shader_addline(buffer, "MIN TA.x, TA.x, one.x;\n");
1956     shader_addline(buffer, "MAX result.depth, TA.x, 0.0;\n");
1957 }
1958
1959 /** Process the WINED3DSIO_TEXDP3TEX instruction in ARB:
1960  * Take a 3-component dot product of the TexCoord[dstreg] and src,
1961  * then perform a 1D texture lookup from stage dstregnum, place into dst. */
1962 static void pshader_hw_texdp3tex(const struct wined3d_shader_instruction *ins)
1963 {
1964     SHADER_BUFFER *buffer = ins->ctx->buffer;
1965     DWORD sampler_idx = ins->dst[0].reg.idx;
1966     char src0[50];
1967     char dst_str[50];
1968
1969     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
1970     shader_addline(buffer, "MOV TB, 0.0;\n");
1971     shader_addline(buffer, "DP3 TB.x, fragment.texcoord[%u], %s;\n", sampler_idx, src0);
1972
1973     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1974     shader_hw_sample(ins, sampler_idx, dst_str, "TB", 0 /* Only one coord, can't be projected */, NULL, NULL);
1975 }
1976
1977 /** Process the WINED3DSIO_TEXDP3 instruction in ARB:
1978  * Take a 3-component dot product of the TexCoord[dstreg] and src. */
1979 static void pshader_hw_texdp3(const struct wined3d_shader_instruction *ins)
1980 {
1981     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1982     char src0[50];
1983     char dst_str[50];
1984     SHADER_BUFFER *buffer = ins->ctx->buffer;
1985
1986     /* Handle output register */
1987     shader_arb_get_dst_param(ins, dst, dst_str);
1988     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
1989     shader_addline(buffer, "DP3 %s, fragment.texcoord[%u], %s;\n", dst_str, dst->reg.idx, src0);
1990 }
1991
1992 /** Process the WINED3DSIO_TEXM3X3 instruction in ARB
1993  * Perform the 3rd row of a 3x3 matrix multiply */
1994 static void pshader_hw_texm3x3(const struct wined3d_shader_instruction *ins)
1995 {
1996     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1997     SHADER_BUFFER *buffer = ins->ctx->buffer;
1998     char dst_str[50], dst_name[50];
1999     char src0[50];
2000     BOOL is_color;
2001
2002     shader_arb_get_dst_param(ins, dst, dst_str);
2003     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2004     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2005     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2006     shader_addline(buffer, "MOV %s, %s;\n", dst_str, dst_name);
2007 }
2008
2009 /** Process the WINED3DSIO_TEXM3X2DEPTH instruction in ARB:
2010  * Last row of a 3x2 matrix multiply, use the result to calculate the depth:
2011  * Calculate tmp0.y = TexCoord[dstreg] . src.xyz;  (tmp0.x has already been calculated)
2012  * depth = (tmp0.y == 0.0) ? 1.0 : tmp0.x / tmp0.y
2013  */
2014 static void pshader_hw_texm3x2depth(const struct wined3d_shader_instruction *ins)
2015 {
2016     SHADER_BUFFER *buffer = ins->ctx->buffer;
2017     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2018     char src0[50], dst_name[50];
2019     BOOL is_color;
2020
2021     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2022     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2023     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2024
2025     /* How to deal with the special case dst_name.g == 0? if r != 0, then
2026      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
2027      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
2028      */
2029     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
2030     shader_addline(buffer, "MUL %s.x, %s.x, %s.y;\n", dst_name, dst_name, dst_name);
2031     shader_addline(buffer, "MIN %s.x, %s.x, one.x;\n", dst_name, dst_name);
2032     shader_addline(buffer, "MAX result.depth, %s.x, 0.0;\n", dst_name);
2033 }
2034
2035 /** Handles transforming all WINED3DSIO_M?x? opcodes for
2036     Vertex/Pixel shaders to ARB_vertex_program codes */
2037 static void shader_hw_mnxn(const struct wined3d_shader_instruction *ins)
2038 {
2039     int i;
2040     int nComponents = 0;
2041     struct wined3d_shader_dst_param tmp_dst = {{0}};
2042     struct wined3d_shader_src_param tmp_src[2] = {{{0}}};
2043     struct wined3d_shader_instruction tmp_ins;
2044
2045     memset(&tmp_ins, 0, sizeof(tmp_ins));
2046
2047     /* Set constants for the temporary argument */
2048     tmp_ins.ctx = ins->ctx;
2049     tmp_ins.dst_count = 1;
2050     tmp_ins.dst = &tmp_dst;
2051     tmp_ins.src_count = 2;
2052     tmp_ins.src = tmp_src;
2053
2054     switch(ins->handler_idx)
2055     {
2056         case WINED3DSIH_M4x4:
2057             nComponents = 4;
2058             tmp_ins.handler_idx = WINED3DSIH_DP4;
2059             break;
2060         case WINED3DSIH_M4x3:
2061             nComponents = 3;
2062             tmp_ins.handler_idx = WINED3DSIH_DP4;
2063             break;
2064         case WINED3DSIH_M3x4:
2065             nComponents = 4;
2066             tmp_ins.handler_idx = WINED3DSIH_DP3;
2067             break;
2068         case WINED3DSIH_M3x3:
2069             nComponents = 3;
2070             tmp_ins.handler_idx = WINED3DSIH_DP3;
2071             break;
2072         case WINED3DSIH_M3x2:
2073             nComponents = 2;
2074             tmp_ins.handler_idx = WINED3DSIH_DP3;
2075             break;
2076         default:
2077             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2078             break;
2079     }
2080
2081     tmp_dst = ins->dst[0];
2082     tmp_src[0] = ins->src[0];
2083     tmp_src[1] = ins->src[1];
2084     for (i = 0; i < nComponents; i++) {
2085         tmp_dst.write_mask = WINED3DSP_WRITEMASK_0 << i;
2086         shader_hw_map2gl(&tmp_ins);
2087         ++tmp_src[1].reg.idx;
2088     }
2089 }
2090
2091 static void shader_hw_rsq_rcp(const struct wined3d_shader_instruction *ins)
2092 {
2093     SHADER_BUFFER *buffer = ins->ctx->buffer;
2094     const char *instruction;
2095
2096     char dst[50];
2097     char src[50];
2098
2099     switch(ins->handler_idx)
2100     {
2101         case WINED3DSIH_RSQ: instruction = "RSQ"; break;
2102         case WINED3DSIH_RCP: instruction = "RCP"; break;
2103         default: instruction = "";
2104             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2105             break;
2106     }
2107
2108     shader_arb_get_dst_param(ins, &ins->dst[0], dst); /* Destination */
2109     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2110     if (ins->src[0].swizzle == WINED3DSP_NOSWIZZLE)
2111     {
2112         /* Dx sdk says .x is used if no swizzle is given, but our test shows that
2113          * .w is used
2114          */
2115         strcat(src, ".w");
2116     }
2117
2118     shader_addline(buffer, "%s%s %s, %s;\n", instruction, shader_arb_get_modifier(ins), dst, src);
2119 }
2120
2121 static void shader_hw_nrm(const struct wined3d_shader_instruction *ins)
2122 {
2123     SHADER_BUFFER *buffer = ins->ctx->buffer;
2124     char dst_name[50];
2125     char src_name[50];
2126     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2127     BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
2128
2129     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2130     shader_arb_get_src_param(ins, &ins->src[0], 1 /* Use TB */, src_name);
2131
2132     if(pshader && priv->target_version >= NV3)
2133     {
2134         shader_addline(buffer, "NRM%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
2135     }
2136     else
2137     {
2138         shader_addline(buffer, "DP3 TA, %s, %s;\n", src_name, src_name);
2139         shader_addline(buffer, "RSQ TA, TA.x;\n");
2140         /* dst.w = src[0].w * 1 / (src.x^2 + src.y^2 + src.z^2)^(1/2) according to msdn*/
2141         shader_addline(buffer, "MUL%s %s, %s, TA;\n", shader_arb_get_modifier(ins), dst_name,
2142                     src_name);
2143     }
2144 }
2145
2146 static void shader_hw_lrp(const struct wined3d_shader_instruction *ins)
2147 {
2148     SHADER_BUFFER *buffer = ins->ctx->buffer;
2149     char dst_name[50];
2150     char src_name[3][50];
2151
2152     /* ARB_fragment_program has a convenient LRP instruction */
2153     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
2154         shader_hw_map2gl(ins);
2155         return;
2156     }
2157
2158     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2159     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
2160     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
2161     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
2162
2163     shader_addline(buffer, "SUB TA, %s, %s;\n", src_name[1], src_name[2]);
2164     shader_addline(buffer, "MAD%s %s, %s, TA, %s;\n", shader_arb_get_modifier(ins),
2165                    dst_name, src_name[0], src_name[2]);
2166 }
2167
2168 static void shader_hw_sincos(const struct wined3d_shader_instruction *ins)
2169 {
2170     /* This instruction exists in ARB, but the d3d instruction takes two extra parameters which
2171      * must contain fixed constants. So we need a separate function to filter those constants and
2172      * can't use map2gl
2173      */
2174     SHADER_BUFFER *buffer = ins->ctx->buffer;
2175     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2176     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2177     char dst_name[50];
2178     char src_name0[50], src_name1[50], src_name2[50];
2179     BOOL is_color;
2180
2181     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2182     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
2183         shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2184         /* No modifiers are supported on SCS */
2185         shader_addline(buffer, "SCS %s, %s;\n", dst_name, src_name0);
2186
2187         if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
2188         {
2189             shader_arb_get_register_name(ins, &dst->reg, src_name0, &is_color);
2190             shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name0);
2191         }
2192     } else if(priv->target_version >= NV2) {
2193         shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
2194
2195         /* Sincos writemask must be .x, .y or .xy */
2196         if(dst->write_mask & WINED3DSP_WRITEMASK_0)
2197             shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
2198         if(dst->write_mask & WINED3DSP_WRITEMASK_1)
2199             shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
2200     } else {
2201         /* Approximate sine and cosine with a taylor series, as per math textbook. The application passes 8
2202          * helper constants(D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2.
2203          *
2204          * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
2205          * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
2206          *
2207          * The constants we get are:
2208          *
2209          *  +1   +1,     -1     -1     +1      +1      -1       -1
2210          *      ---- ,  ---- , ---- , ----- , ----- , ----- , ------
2211          *      1!*2    2!*4   3!*8   4!*16   5!*32   6!*64   7!*128
2212          *
2213          * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2):
2214          *
2215          * (x/2)^2 = x^2 / 4
2216          * (x/2)^3 = x^3 / 8
2217          * (x/2)^4 = x^4 / 16
2218          * (x/2)^5 = x^5 / 32
2219          * etc
2220          *
2221          * To get the final result:
2222          * sin(x) = 2 * sin(x/2) * cos(x/2)
2223          * cos(x) = cos(x/2)^2 - sin(x/2)^2
2224          * (from sin(x+y) and cos(x+y) rules)
2225          *
2226          * As per MSDN, dst.z is undefined after the operation, and so is
2227          * dst.x and dst.y if they're masked out by the writemask. Ie
2228          * sincos dst.y, src1, c0, c1
2229          * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler
2230          * vsa.exe also stops with an error if the dest register is the same register as the source
2231          * register. This means we can use dest.xyz as temporary storage. The assembler vsa.exe output also
2232          * indicates that sincos consumes 8 instruction slots in vs_2_0(and, strangely, in vs_3_0).
2233          */
2234         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2235         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2);
2236         shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
2237
2238         shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0);  /* x ^ 2 */
2239         shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0);           /* x ^ 3 */
2240         shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0);           /* x ^ 4 */
2241         shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0);           /* x ^ 5 */
2242         shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0);           /* x ^ 6 */
2243         shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0);           /* x ^ 7 */
2244
2245         /* sin(x/2)
2246          *
2247          * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to
2248          * properly merge that with MULs in the code above?
2249          * The swizzles .yz and xw however fit into the .yzxw swizzle added to ps_2_0. Maybe
2250          * we can merge the sine and cosine MAD rows to calculate them together.
2251          */
2252         shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */
2253         shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */
2254         shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */
2255         shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */
2256
2257         /* cos(x/2) */
2258         shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */
2259         shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */
2260         shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */
2261
2262         if(dst->write_mask & WINED3DSP_WRITEMASK_0) {
2263             /* cos x */
2264             shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n");
2265             shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name);
2266         }
2267         if(dst->write_mask & WINED3DSP_WRITEMASK_1) {
2268             /* sin x */
2269             shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name);
2270             shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name);
2271         }
2272     }
2273 }
2274
2275 /* GL locking is done by the caller */
2276 static void shader_hw_sgn(const struct wined3d_shader_instruction *ins)
2277 {
2278     SHADER_BUFFER *buffer = ins->ctx->buffer;
2279     char dst_name[50];
2280     char src_name[50];
2281     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
2282
2283     /* SGN is only valid in vertex shaders */
2284     if(ctx->target_version == NV2) {
2285         shader_hw_map2gl(ins);
2286         return;
2287     }
2288     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2289     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
2290
2291     FIXME("Emulated SGN untested\n");
2292     /* If SRC > 0.0, -SRC < SRC = TRUE, otherwise false.
2293      * if SRC < 0.0,  SRC < -SRC = TRUE. If neither is true, src = 0.0
2294      */
2295     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) {
2296         shader_addline(buffer, "SLT %s, -%s, %s;\n", dst_name, src_name, src_name);
2297     } else {
2298         shader_addline(buffer, "SLT TB, -%s, %s;\n", src_name, src_name);
2299         shader_addline(buffer, "SLT TC,  %s, -%s;\n", src_name, src_name);
2300         shader_addline(buffer, "ADD %s, TB, -TC;\n", dst_name);
2301     }
2302 }
2303
2304 static void shader_hw_dsy(const struct wined3d_shader_instruction *ins)
2305 {
2306     SHADER_BUFFER *buffer = ins->ctx->buffer;
2307     char src[50];
2308     char dst[50];
2309     char dst_name[50];
2310     BOOL is_color;
2311
2312     shader_arb_get_dst_param(ins, &ins->dst[0], dst);
2313     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2314     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2315
2316     shader_addline(buffer, "DDY %s, %s;\n", dst, src);
2317     shader_addline(buffer, "MUL%s %s, %s, ycorrection.y;\n", shader_arb_get_modifier(ins), dst, dst_name);
2318 }
2319
2320 static void shader_hw_loop(const struct wined3d_shader_instruction *ins)
2321 {
2322     SHADER_BUFFER *buffer = ins->ctx->buffer;
2323     char src_name[50];
2324     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2325
2326     /* src0 is aL */
2327     shader_arb_get_src_param(ins, &ins->src[1], 0, src_name);
2328
2329     if(vshader)
2330     {
2331         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2332         struct list *e = list_head(&priv->control_frames);
2333         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2334
2335         if(priv->loop_depth > 1) shader_addline(buffer, "PUSHA aL;\n");
2336         /* The constant loader makes sure to load -1 into iX.w */
2337         shader_addline(buffer, "ARLC aL, %s.xywz;\n", src_name);
2338         shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", control_frame->loop_no);
2339         shader_addline(buffer, "loop_%u_start:\n", control_frame->loop_no);
2340     }
2341     else
2342     {
2343         shader_addline(buffer, "LOOP %s;\n", src_name);
2344     }
2345 }
2346
2347 static void shader_hw_rep(const struct wined3d_shader_instruction *ins)
2348 {
2349     SHADER_BUFFER *buffer = ins->ctx->buffer;
2350     char src_name[50];
2351     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2352
2353     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
2354
2355     /* The constant loader makes sure to load -1 into iX.w */
2356     if(vshader)
2357     {
2358         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2359         struct list *e = list_head(&priv->control_frames);
2360         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2361
2362         if(priv->loop_depth > 1) shader_addline(buffer, "PUSHA aL;\n");
2363
2364         shader_addline(buffer, "ARLC aL, %s.xywz;\n", src_name);
2365         shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", control_frame->loop_no);
2366         shader_addline(buffer, "loop_%u_start:\n", control_frame->loop_no);
2367     }
2368     else
2369     {
2370         shader_addline(buffer, "REP %s;\n", src_name);
2371     }
2372 }
2373
2374 static void shader_hw_endloop(const struct wined3d_shader_instruction *ins)
2375 {
2376     SHADER_BUFFER *buffer = ins->ctx->buffer;
2377     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2378
2379     if(vshader)
2380     {
2381         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2382         struct list *e = list_head(&priv->control_frames);
2383         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2384
2385         shader_addline(buffer, "ARAC aL.xy, aL;\n");
2386         shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", control_frame->loop_no);
2387         shader_addline(buffer, "loop_%u_end:\n", control_frame->loop_no);
2388
2389         if(priv->loop_depth > 1) shader_addline(buffer, "POPA aL;\n");
2390     }
2391     else
2392     {
2393         shader_addline(buffer, "ENDLOOP;\n");
2394     }
2395 }
2396
2397 static void shader_hw_endrep(const struct wined3d_shader_instruction *ins)
2398 {
2399     SHADER_BUFFER *buffer = ins->ctx->buffer;
2400     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2401
2402     if(vshader)
2403     {
2404         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2405         struct list *e = list_head(&priv->control_frames);
2406         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2407
2408         shader_addline(buffer, "ARAC aL.xy, aL;\n");
2409         shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", control_frame->loop_no);
2410         shader_addline(buffer, "loop_%u_end:\n", control_frame->loop_no);
2411
2412         if(priv->loop_depth > 1) shader_addline(buffer, "POPA aL;\n");
2413     }
2414     else
2415     {
2416         shader_addline(buffer, "ENDREP;\n");
2417     }
2418 }
2419
2420 static const struct control_frame *find_last_loop(const struct shader_arb_ctx_priv *priv)
2421 {
2422     struct control_frame *control_frame;
2423
2424     LIST_FOR_EACH_ENTRY(control_frame, &priv->control_frames, struct control_frame, entry)
2425     {
2426         if(control_frame->type == LOOP || control_frame->type == REP) return control_frame;
2427     }
2428     ERR("Could not find loop for break\n");
2429     return NULL;
2430 }
2431
2432 static void shader_hw_break(const struct wined3d_shader_instruction *ins)
2433 {
2434     SHADER_BUFFER *buffer = ins->ctx->buffer;
2435     const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
2436     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2437
2438     if(vshader)
2439     {
2440         shader_addline(buffer, "BRA loop_%u_end;\n", control_frame->loop_no);
2441     }
2442     else
2443     {
2444         shader_addline(buffer, "BRK;\n");
2445     }
2446 }
2447
2448 static void shader_hw_breakc(const struct wined3d_shader_instruction *ins)
2449 {
2450     SHADER_BUFFER *buffer = ins->ctx->buffer;
2451     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2452     const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
2453     char src_name0[50];
2454     char src_name1[50];
2455     const char *comp;
2456
2457     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2458     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2459
2460     switch (ins->flags)
2461     {
2462         case COMPARISON_GT: comp = "GT"; break;
2463         case COMPARISON_EQ: comp = "EQ"; break;
2464         case COMPARISON_GE: comp = "GE"; break;
2465         case COMPARISON_LT: comp = "LT"; break;
2466         case COMPARISON_NE: comp = "NE"; break;
2467         case COMPARISON_LE: comp = "LE"; break;
2468         default:
2469             FIXME("Unrecognized comparison value: %u\n", ins->flags);
2470             comp = "(\?\?)";
2471     }
2472
2473     if(vshader)
2474     {
2475         /* SUBC CC, src0, src1" works only in pixel shaders, so use TA to throw
2476          * away the subtraction result
2477          */
2478         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2479         shader_addline(buffer, "BRA loop_%u_end (%s.x);\n", control_frame->loop_no, comp);
2480     }
2481     else
2482     {
2483         shader_addline(buffer, "SUBC CC, %s, %s;\n", src_name0, src_name1);
2484         shader_addline(buffer, "BRK (%s.x);\n", comp);
2485     }
2486 }
2487
2488 static void shader_hw_ifc(const struct wined3d_shader_instruction *ins)
2489 {
2490     SHADER_BUFFER *buffer = ins->ctx->buffer;
2491     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2492     struct list *e = list_head(&priv->control_frames);
2493     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2494     const char *comp;
2495     char src_name0[50];
2496     char src_name1[50];
2497     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2498
2499     /* Invert the flag. We jump to the else label if the condition is NOT true */
2500     switch(ins->flags)
2501     {
2502         case COMPARISON_GT: comp = "LE"; break;
2503         case COMPARISON_EQ: comp = "NE"; break;
2504         case COMPARISON_GE: comp = "LT"; break;
2505         case COMPARISON_LT: comp = "GE"; break;
2506         case COMPARISON_NE: comp = "EQ"; break;
2507         case COMPARISON_LE: comp = "GT"; break;
2508         default:
2509             FIXME("Unrecognized comparison value: %u\n", ins->flags);
2510             comp = "\?\?";
2511     }
2512
2513     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2514     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2515
2516     if(vshader)
2517     {
2518         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2519         shader_addline(buffer, "BRA ifc_%u_endif (%s.x);\n", control_frame->ifc_no, comp);
2520     }
2521     else
2522     {
2523         shader_addline(buffer, "SUBC CC, %s, %s;\n", src_name0, src_name1);
2524         shader_addline(buffer, "IF %s.x;\n", comp);
2525     }
2526 }
2527
2528 static void shader_hw_else(const struct wined3d_shader_instruction *ins)
2529 {
2530     SHADER_BUFFER *buffer = ins->ctx->buffer;
2531     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2532     struct list *e = list_head(&priv->control_frames);
2533     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2534     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2535
2536     if(vshader)
2537     {
2538         shader_addline(buffer, "BRA ifc_%u_endif;\n", control_frame->ifc_no);
2539         shader_addline(buffer, "ifc_%u_else:\n", control_frame->ifc_no);
2540         control_frame->had_else = TRUE;
2541     }
2542     else
2543     {
2544         shader_addline(buffer, "ELSE;\n");
2545     }
2546 }
2547
2548 static void shader_hw_endif(const struct wined3d_shader_instruction *ins)
2549 {
2550     SHADER_BUFFER *buffer = ins->ctx->buffer;
2551     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2552     struct list *e = list_head(&priv->control_frames);
2553     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2554     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2555
2556     if(vshader)
2557     {
2558         if(control_frame->had_else)
2559         {
2560             shader_addline(buffer, "ifc_%u_endif:\n", control_frame->ifc_no);
2561         }
2562         else
2563         {
2564             shader_addline(buffer, "#No else branch. else is endif\n");
2565             shader_addline(buffer, "ifc_%u_else:\n", control_frame->ifc_no);
2566         }
2567     }
2568     else
2569     {
2570         shader_addline(buffer, "ENDIF;\n");
2571     }
2572 }
2573
2574 static void shader_hw_texldd(const struct wined3d_shader_instruction *ins)
2575 {
2576     DWORD sampler_idx = ins->src[1].reg.idx;
2577     char reg_dest[40];
2578     char reg_src[3][40];
2579     DWORD flags = TEX_DERIV;
2580
2581     shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
2582     shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src[0]);
2583     shader_arb_get_src_param(ins, &ins->src[2], 1, reg_src[1]);
2584     shader_arb_get_src_param(ins, &ins->src[3], 2, reg_src[2]);
2585
2586     if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
2587     if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;
2588
2589     shader_hw_sample(ins, sampler_idx, reg_dest, reg_src[0], flags, reg_src[1], reg_src[2]);
2590 }
2591
2592 static void shader_hw_texldl(const struct wined3d_shader_instruction *ins)
2593 {
2594     DWORD sampler_idx = ins->src[1].reg.idx;
2595     char reg_dest[40];
2596     char reg_coord[40];
2597     DWORD flags = TEX_LOD;
2598
2599     shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
2600     shader_arb_get_src_param(ins, &ins->src[0], 0, reg_coord);
2601
2602     if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
2603     if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;
2604
2605     shader_hw_sample(ins, sampler_idx, reg_dest, reg_coord, flags, NULL, NULL);
2606 }
2607
2608 static GLuint create_arb_blt_vertex_program(const WineD3D_GL_Info *gl_info)
2609 {
2610     GLuint program_id = 0;
2611     const char *blt_vprogram =
2612         "!!ARBvp1.0\n"
2613         "PARAM c[1] = { { 1, 0.5 } };\n"
2614         "MOV result.position, vertex.position;\n"
2615         "MOV result.color, c[0].x;\n"
2616         "MOV result.texcoord[0], vertex.texcoord[0];\n"
2617         "END\n";
2618
2619     GL_EXTCALL(glGenProgramsARB(1, &program_id));
2620     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, program_id));
2621     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(blt_vprogram), blt_vprogram));
2622
2623     if (glGetError() == GL_INVALID_OPERATION) {
2624         GLint pos;
2625         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
2626         FIXME("Vertex program error at position %d: %s\n", pos,
2627             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2628     }
2629
2630     return program_id;
2631 }
2632
2633 /* GL locking is done by the caller */
2634 static GLuint create_arb_blt_fragment_program(const WineD3D_GL_Info *gl_info, enum tex_types tex_type)
2635 {
2636     GLuint program_id = 0;
2637     static const char * const blt_fprograms[tex_type_count] =
2638     {
2639         /* tex_1d */
2640         NULL,
2641         /* tex_2d */
2642         "!!ARBfp1.0\n"
2643         "TEMP R0;\n"
2644         "TEX R0.x, fragment.texcoord[0], texture[0], 2D;\n"
2645         "MOV result.depth.z, R0.x;\n"
2646         "END\n",
2647         /* tex_3d */
2648         NULL,
2649         /* tex_cube */
2650         "!!ARBfp1.0\n"
2651         "TEMP R0;\n"
2652         "TEX R0.x, fragment.texcoord[0], texture[0], CUBE;\n"
2653         "MOV result.depth.z, R0.x;\n"
2654         "END\n",
2655         /* tex_rect */
2656         "!!ARBfp1.0\n"
2657         "TEMP R0;\n"
2658         "TEX R0.x, fragment.texcoord[0], texture[0], RECT;\n"
2659         "MOV result.depth.z, R0.x;\n"
2660         "END\n",
2661     };
2662
2663     if (!blt_fprograms[tex_type])
2664     {
2665         FIXME("tex_type %#x not supported\n", tex_type);
2666         tex_type = tex_2d;
2667     }
2668
2669     GL_EXTCALL(glGenProgramsARB(1, &program_id));
2670     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, program_id));
2671     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(blt_fprograms[tex_type]), blt_fprograms[tex_type]));
2672
2673     if (glGetError() == GL_INVALID_OPERATION) {
2674         GLint pos;
2675         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
2676         FIXME("Fragment program error at position %d: %s\n", pos,
2677             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2678     }
2679
2680     return program_id;
2681 }
2682
2683 static void arbfp_add_sRGB_correction(SHADER_BUFFER *buffer, const char *fragcolor, const char *tmp1,
2684                                       const char *tmp2, const char *tmp3, const char *tmp4, BOOL condcode) {
2685     /* Perform sRGB write correction. See GLX_EXT_framebuffer_sRGB */
2686
2687     if(condcode)
2688     {
2689         /* Sigh. MOVC CC doesn't work, so use one of the temps as dummy dest */
2690         shader_addline(buffer, "SUBC %s, %s.x, srgb_consts1.y;\n", tmp1, fragcolor);
2691         /* Calculate the > 0.0031308 case */
2692         shader_addline(buffer, "POW %s.x (GE), %s.x, srgb_consts1.z;\n", fragcolor, fragcolor);
2693         shader_addline(buffer, "POW %s.y (GE), %s.y, srgb_consts1.z;\n", fragcolor, fragcolor);
2694         shader_addline(buffer, "POW %s.z (GE), %s.z, srgb_consts1.z;\n", fragcolor, fragcolor);
2695         shader_addline(buffer, "MUL %s.xyz (GE), %s, srgb_consts1.w;\n", fragcolor, fragcolor);
2696         shader_addline(buffer, "SUB %s.xyz (GE), %s, srgb_consts2.x;\n", fragcolor, fragcolor);
2697         /* Calculate the < case */
2698         shader_addline(buffer, "MUL %s.xyz (LT), srgb_consts1.x, %s;\n", fragcolor, fragcolor);
2699     }
2700     else
2701     {
2702         /* Calculate the > 0.0031308 case */
2703         shader_addline(buffer, "POW %s.x, %s.x, srgb_consts1.z;\n", tmp1, fragcolor);
2704         shader_addline(buffer, "POW %s.y, %s.y, srgb_consts1.z;\n", tmp1, fragcolor);
2705         shader_addline(buffer, "POW %s.z, %s.z, srgb_consts1.z;\n", tmp1, fragcolor);
2706         shader_addline(buffer, "MUL %s, %s, srgb_consts1.w;\n", tmp1, tmp1);
2707         shader_addline(buffer, "SUB %s, %s, srgb_consts2.x;\n", tmp1, tmp1);
2708         /* Calculate the < case */
2709         shader_addline(buffer, "MUL %s, srgb_consts1.x, %s;\n", tmp2, fragcolor);
2710         /* Get 1.0 / 0.0 masks for > 0.0031308 and < 0.0031308 */
2711         shader_addline(buffer, "SLT %s, srgb_consts1.y, %s;\n", tmp3, fragcolor);
2712         shader_addline(buffer, "SGE %s, srgb_consts1.y, %s;\n", tmp4, fragcolor);
2713         /* Store the components > 0.0031308 in the destination */
2714         shader_addline(buffer, "MUL %s.xyz, %s, %s;\n", fragcolor, tmp1, tmp3);
2715         /* Add the components that are < 0.0031308 */
2716         shader_addline(buffer, "MAD %s.xyz, %s, %s, %s;\n", fragcolor, tmp2, tmp4, fragcolor);
2717         /* Move everything into result.color at once. Nvidia hardware cannot handle partial
2718         * result.color writes(.rgb first, then .a), or handle overwriting already written
2719         * components. The assembler uses a temporary register in this case, which is usually
2720         * not allocated from one of our registers that were used earlier.
2721         */
2722     }
2723     shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
2724     /* [0.0;1.0] clamping. Not needed, this is done implicitly */
2725 }
2726
2727 static const DWORD *find_loop_control_values(IWineD3DBaseShaderImpl *This, DWORD idx)
2728 {
2729     const local_constant *constant;
2730
2731     LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsI, local_constant, entry)
2732     {
2733         if (constant->idx == idx)
2734         {
2735             return constant->value;
2736         }
2737     }
2738     return NULL;
2739 }
2740
2741 static void init_ps_input(const IWineD3DPixelShaderImpl *This, const struct arb_ps_compile_args *args,
2742                           struct shader_arb_ctx_priv *priv)
2743 {
2744     const char *texcoords[8] =
2745     {
2746         "fragment.texcoord[0]", "fragment.texcoord[1]", "fragment.texcoord[2]", "fragment.texcoord[3]",
2747         "fragment.texcoord[4]", "fragment.texcoord[5]", "fragment.texcoord[6]", "fragment.texcoord[7]"
2748     };
2749     unsigned int i;
2750     const struct wined3d_shader_signature_element *sig = This->input_signature;
2751     const char *semantic_name;
2752     DWORD semantic_idx;
2753
2754     switch(args->super.vp_mode)
2755     {
2756         case pretransformed:
2757         case fixedfunction:
2758             /* The pixelshader has to collect the varyings on its own. In any case properly load
2759              * color0 and color1. In the case of pretransformed vertices also load texcoords. Set
2760              * other attribs to 0.0.
2761              *
2762              * For fixedfunction this behavior is correct, according to the tests. For pretransformed
2763              * we'd either need a replacement shader that can load other attribs like BINORMAL, or
2764              * load the texcoord attrib pointers to match the pixel shader signature
2765              */
2766             for(i = 0; i < MAX_REG_INPUT; i++)
2767             {
2768                 semantic_name = sig[i].semantic_name;
2769                 semantic_idx = sig[i].semantic_idx;
2770                 if(semantic_name == NULL) continue;
2771
2772                 if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
2773                 {
2774                     if(semantic_idx == 0) priv->ps_input[i] = "fragment.color.primary";
2775                     else if(semantic_idx == 1) priv->ps_input[i] = "fragment.color.secondary";
2776                     else priv->ps_input[i] = "0.0";
2777                 }
2778                 else if(args->super.vp_mode == fixedfunction)
2779                 {
2780                     priv->ps_input[i] = "0.0";
2781                 }
2782                 else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
2783                 {
2784                     if(semantic_idx < 8) priv->ps_input[i] = texcoords[semantic_idx];
2785                     else priv->ps_input[i] = "0.0";
2786                 }
2787                 else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
2788                 {
2789                     if(semantic_idx == 0) priv->ps_input[i] = "fragment.fogcoord";
2790                     else priv->ps_input[i] = "0.0";
2791                 }
2792                 else
2793                 {
2794                     priv->ps_input[i] = "0.0";
2795                 }
2796
2797                 TRACE("v%u, semantic %s%u is %s\n", i, semantic_name, semantic_idx, priv->ps_input[i]);
2798             }
2799             break;
2800
2801         case vertexshader:
2802             /* That one is easy. The vertex shaders provide v0-v7 in fragment.texcoord and v8 and v9 in
2803              * fragment.color
2804              */
2805             for(i = 0; i < 8; i++)
2806             {
2807                 priv->ps_input[i] = texcoords[i];
2808             }
2809             priv->ps_input[8] = "fragment.color.primary";
2810             priv->ps_input[9] = "fragment.color.secondary";
2811             break;
2812     }
2813 }
2814
2815 /* GL locking is done by the caller */
2816 static GLuint shader_arb_generate_pshader(IWineD3DPixelShaderImpl *This,
2817         SHADER_BUFFER *buffer, const struct arb_ps_compile_args *args, struct arb_ps_compiled_shader *compiled)
2818 {
2819     const shader_reg_maps* reg_maps = &This->baseShader.reg_maps;
2820     CONST DWORD *function = This->baseShader.function;
2821     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)This->baseShader.device)->adapter->gl_info;
2822     const local_constant *lconst;
2823     GLuint retval;
2824     char fragcolor[16];
2825     DWORD *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This), next_local, cur;
2826     struct shader_arb_ctx_priv priv_ctx;
2827     BOOL dcl_tmp = args->super.srgb_correction, dcl_td = FALSE;
2828     BOOL want_nv_prog = FALSE;
2829
2830     char srgbtmp[4][4];
2831     unsigned int i, found = 0;
2832
2833     for(i = 0; i < This->baseShader.limits.temporary; i++) {
2834
2835         /* Don't overwrite the color source */
2836         if(This->color0_mov && i == This->color0_reg) continue;
2837         else if(reg_maps->shader_version.major < 2 && i == 0) continue;
2838
2839         if(reg_maps->temporary[i]) {
2840             sprintf(srgbtmp[found], "R%u", i);
2841             found++;
2842             if(found == 4) break;
2843         }
2844     }
2845
2846     switch(found) {
2847         case 4: dcl_tmp = FALSE; break;
2848         case 0:
2849             sprintf(srgbtmp[0], "TA");
2850             sprintf(srgbtmp[1], "TB");
2851             sprintf(srgbtmp[2], "TC");
2852             sprintf(srgbtmp[3], "TD");
2853             dcl_td = TRUE;
2854             break;
2855         case 1:
2856             sprintf(srgbtmp[1], "TA");
2857             sprintf(srgbtmp[2], "TB");
2858             sprintf(srgbtmp[3], "TC");
2859             break;
2860         case 2:
2861             sprintf(srgbtmp[2], "TA");
2862             sprintf(srgbtmp[3], "TB");
2863             break;
2864         case 3:
2865             sprintf(srgbtmp[3], "TA");
2866             break;
2867     }
2868
2869     /*  Create the hw ARB shader */
2870     memset(&priv_ctx, 0, sizeof(priv_ctx));
2871     priv_ctx.cur_ps_args = args;
2872     priv_ctx.compiled_fprog = compiled;
2873     init_ps_input(This, args, &priv_ctx);
2874     list_init(&priv_ctx.control_frames);
2875
2876     /* Avoid enabling NV_fragment_program* if we do not need it.
2877      *
2878      * Enabling GL_NV_fragment_program_option causes the driver to occupy a temporary register,
2879      * and it slows down the shader execution noticeably(about 5%). Usually our instruction emulation
2880      * is faster than what we gain from using higher native instructions. There are some things though
2881      * that cannot be emulated. In that case enable the extensions.
2882      * If the extension is enabled, instruction handlers that support both ways will use it.
2883      *
2884      * Testing shows no performance difference between OPTION NV_fragment_program2 and NV_fragment_program.
2885      * So enable the best we can get.
2886      */
2887     if(reg_maps->usesdsx || reg_maps->usesdsy || reg_maps->loop_depth > 0 || reg_maps->usestexldd ||
2888        reg_maps->usestexldl || reg_maps->usesfacing)
2889     {
2890         want_nv_prog = TRUE;
2891     }
2892
2893     shader_addline(buffer, "!!ARBfp1.0\n");
2894     if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM2)) {
2895         shader_addline(buffer, "OPTION NV_fragment_program2;\n");
2896         priv_ctx.target_version = NV3;
2897     } else if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION)) {
2898         shader_addline(buffer, "OPTION NV_fragment_program;\n");
2899         priv_ctx.target_version = NV2;
2900     } else {
2901         if(want_nv_prog)
2902         {
2903             /* This is an error - either we're advertising the wrong shader version, or aren't enforcing some
2904              * limits properly
2905              */
2906             ERR("The shader requires instructions that are not available in plain GL_ARB_fragment_program\n");
2907             ERR("Try GLSL\n");
2908         }
2909         priv_ctx.target_version = ARB;
2910     }
2911
2912     if (reg_maps->shader_version.major < 3)
2913     {
2914         switch(args->super.fog) {
2915             case FOG_OFF:
2916                 break;
2917             case FOG_LINEAR:
2918                 shader_addline(buffer, "OPTION ARB_fog_linear;\n");
2919                 break;
2920             case FOG_EXP:
2921                 shader_addline(buffer, "OPTION ARB_fog_exp;\n");
2922                 break;
2923             case FOG_EXP2:
2924                 shader_addline(buffer, "OPTION ARB_fog_exp2;\n");
2925                 break;
2926         }
2927     }
2928
2929     /* For now always declare the temps. At least the Nvidia assembler optimizes completely
2930      * unused temps away(but occupies them for the whole shader if they're used once). Always
2931      * declaring them avoids tricky bookkeeping work
2932      */
2933     shader_addline(buffer, "TEMP TA;\n");      /* Used for modifiers */
2934     shader_addline(buffer, "TEMP TB;\n");      /* Used for modifiers */
2935     shader_addline(buffer, "TEMP TC;\n");      /* Used for modifiers */
2936     if(dcl_td) shader_addline(buffer, "TEMP TD;\n"); /* Used for sRGB writing */
2937     shader_addline(buffer, "PARAM coefdiv = { 0.5, 0.25, 0.125, 0.0625 };\n");
2938     shader_addline(buffer, "PARAM coefmul = { 2, 4, 8, 16 };\n");
2939     shader_addline(buffer, "PARAM one = { 1.0, 1.0, 1.0, 1.0 };\n");
2940
2941     if (reg_maps->shader_version.major < 2)
2942     {
2943         strcpy(fragcolor, "R0");
2944     } else {
2945         if(args->super.srgb_correction) {
2946             if(This->color0_mov) {
2947                 sprintf(fragcolor, "R%u", This->color0_reg);
2948             } else {
2949                 shader_addline(buffer, "TEMP TMP_COLOR;\n");
2950                 strcpy(fragcolor, "TMP_COLOR");
2951             }
2952         } else {
2953             strcpy(fragcolor, "result.color");
2954         }
2955     }
2956
2957     if(args->super.srgb_correction) {
2958         shader_addline(buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
2959                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
2960         shader_addline(buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
2961                        srgb_sub_high, 0.0, 0.0, 0.0);
2962     }
2963
2964     /* Base Declarations */
2965     next_local = shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION,
2966             lconst_map, NULL, &priv_ctx);
2967
2968     for(i = 0; i < (sizeof(reg_maps->bumpmat) / sizeof(reg_maps->bumpmat[0])); i++) {
2969         if(!reg_maps->bumpmat[i]) continue;
2970
2971         cur = compiled->numbumpenvmatconsts;
2972         compiled->bumpenvmatconst[cur].const_num = WINED3D_CONST_NUM_UNUSED;
2973         compiled->bumpenvmatconst[cur].texunit = i;
2974         compiled->luminanceconst[cur].const_num = WINED3D_CONST_NUM_UNUSED;
2975         compiled->luminanceconst[cur].texunit = i;
2976
2977         /* We can fit the constants into the constant limit for sure because texbem, texbeml, bem and beml are only supported
2978          * in 1.x shaders, and GL_ARB_fragment_program has a constant limit of 24 constants. So in the worst case we're loading
2979          * 8 shader constants, 8 bump matrices and 8 luminance parameters and are perfectly fine. (No NP2 fixup on bumpmapped
2980          * textures due to conditional NP2 restrictions)
2981          *
2982          * Use local constants to load the bump env parameters, not program.env. This avoids collisions with d3d constants of
2983          * shaders in newer shader models. Since the bump env parameters have to share their space with NP2 fixup constants,
2984          * their location is shader dependent anyway and they cannot be loaded globally.
2985          */
2986         compiled->bumpenvmatconst[cur].const_num = next_local++;
2987         shader_addline(buffer, "PARAM bumpenvmat%d = program.local[%d];\n",
2988                        i, compiled->bumpenvmatconst[cur].const_num);
2989         compiled->numbumpenvmatconsts = cur + 1;
2990
2991         if(!reg_maps->luminanceparams[i]) continue;
2992
2993         compiled->luminanceconst[cur].const_num = next_local++;
2994         shader_addline(buffer, "PARAM luminance%d = program.local[%d];\n",
2995                        i, compiled->luminanceconst[cur].const_num);
2996     }
2997
2998     for(i = 0; i < MAX_CONST_I; i++)
2999     {
3000         compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
3001         if (reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
3002         {
3003             const DWORD *control_values = find_loop_control_values((IWineD3DBaseShaderImpl *) This, i);
3004
3005             if(control_values)
3006             {
3007                 shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
3008                                 control_values[0], control_values[1], control_values[2]);
3009             }
3010             else
3011             {
3012                 compiled->int_consts[i] = next_local;
3013                 compiled->num_int_consts++;
3014                 shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
3015             }
3016         }
3017     }
3018
3019     if(reg_maps->vpos || reg_maps->usesdsy)
3020     {
3021         compiled->ycorrection = next_local;
3022         shader_addline(buffer, "PARAM ycorrection = program.local[%u];\n", next_local++);
3023
3024         if(reg_maps->vpos)
3025         {
3026             shader_addline(buffer, "TEMP vpos;\n");
3027             /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
3028              * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
3029              * ycorrection.z: 1.0
3030              * ycorrection.w: 0.0
3031              */
3032             shader_addline(buffer, "MAD vpos, fragment.position, ycorrection.zyww, ycorrection.wxww;\n");
3033             shader_addline(buffer, "FLR vpos.xy, vpos;\n");
3034         }
3035     }
3036     else
3037     {
3038         compiled->ycorrection = WINED3D_CONST_NUM_UNUSED;
3039     }
3040
3041     /* Base Shader Body */
3042     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
3043
3044     if(args->super.srgb_correction) {
3045         arbfp_add_sRGB_correction(buffer, fragcolor, srgbtmp[0], srgbtmp[1], srgbtmp[2], srgbtmp[3],
3046                                   priv_ctx.target_version >= NV2);
3047     } else if(reg_maps->shader_version.major < 2) {
3048         shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
3049     }
3050     shader_addline(buffer, "END\n");
3051
3052     /* TODO: change to resource.glObjectHandle or something like that */
3053     GL_EXTCALL(glGenProgramsARB(1, &retval));
3054
3055     TRACE("Creating a hw pixel shader, prg=%d\n", retval);
3056     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, retval));
3057
3058     TRACE("Created hw pixel shader, prg=%d\n", retval);
3059     /* Create the program and check for errors */
3060     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3061                buffer->bsize, buffer->buffer));
3062
3063     if (glGetError() == GL_INVALID_OPERATION) {
3064         GLint errPos;
3065         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
3066         FIXME("HW PixelShader Error at position %d: %s\n",
3067               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3068         retval = 0;
3069     }
3070
3071     /* Load immediate constants */
3072     if(lconst_map) {
3073         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
3074             const float *value = (const float *)lconst->value;
3075             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, lconst_map[lconst->idx], value));
3076             checkGLcall("glProgramLocalParameter4fvARB");
3077         }
3078         HeapFree(GetProcessHeap(), 0, lconst_map);
3079     }
3080
3081     return retval;
3082 }
3083
3084 static int compare_sig(const struct wined3d_shader_signature_element *sig1, const struct wined3d_shader_signature_element *sig2)
3085 {
3086     unsigned int i;
3087     int ret;
3088
3089     for(i = 0; i < MAX_REG_INPUT; i++)
3090     {
3091         if(sig1[i].semantic_name == NULL || sig2[i].semantic_name == NULL)
3092         {
3093             /* Compare pointers, not contents. One string is NULL(element does not exist), the other one is not NULL */
3094             if(sig1[i].semantic_name != sig2[i].semantic_name) return sig1[i].semantic_name < sig2[i].semantic_name ? -1 : 1;
3095             continue;
3096         }
3097
3098         ret = strcmp(sig1[i].semantic_name, sig2[i].semantic_name);
3099         if(ret != 0) return ret;
3100         if(sig1[i].semantic_idx    != sig2[i].semantic_idx)    return sig1[i].semantic_idx    < sig2[i].semantic_idx    ? -1 : 1;
3101         if(sig1[i].sysval_semantic != sig2[i].sysval_semantic) return sig1[i].sysval_semantic < sig2[i].sysval_semantic ? -1 : 1;
3102         if(sig1[i].component_type  != sig2[i].component_type)  return sig1[i].sysval_semantic < sig2[i].component_type  ? -1 : 1;
3103         if(sig1[i].register_idx    != sig2[i].register_idx)    return sig1[i].register_idx    < sig2[i].register_idx    ? -1 : 1;
3104         if(sig1[i].mask            != sig2->mask)              return sig1[i].mask            < sig2[i].mask            ? -1 : 1;
3105     }
3106     return 0;
3107 }
3108
3109 static struct wined3d_shader_signature_element *clone_sig(const struct wined3d_shader_signature_element *sig)
3110 {
3111     struct wined3d_shader_signature_element *new;
3112     int i;
3113     char *name;
3114
3115     new = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*new) * MAX_REG_INPUT);
3116     for(i = 0; i < MAX_REG_INPUT; i++)
3117     {
3118         if(sig[i].semantic_name == NULL)
3119         {
3120             continue;
3121         }
3122
3123         new[i] = sig[i];
3124         /* Clone the semantic string */
3125         name = HeapAlloc(GetProcessHeap(), 0, strlen(sig[i].semantic_name) + 1);
3126         strcpy(name, sig[i].semantic_name);
3127         new[i].semantic_name = name;
3128     }
3129     return new;
3130 }
3131
3132 static DWORD find_input_signature(struct shader_arb_priv *priv, const struct wined3d_shader_signature_element *sig)
3133 {
3134     struct wine_rb_entry *entry = wine_rb_get(&priv->signature_tree, sig);
3135     struct ps_signature *found_sig;
3136
3137     if(entry != NULL)
3138     {
3139         found_sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
3140         TRACE("Found existing signature %u\n", found_sig->idx);
3141         return found_sig->idx;
3142     }
3143     found_sig = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*sig));
3144     found_sig->sig = clone_sig(sig);
3145     found_sig->idx = priv->ps_sig_number++;
3146     TRACE("New signature stored and assigned number %u\n", found_sig->idx);
3147     if(wine_rb_put(&priv->signature_tree, sig, &found_sig->entry) == -1)
3148     {
3149         ERR("Failed to insert program entry.\n");
3150     }
3151     return found_sig->idx;
3152 }
3153
3154 static void init_output_registers(IWineD3DVertexShaderImpl *shader, DWORD sig_num, struct shader_arb_ctx_priv *priv_ctx)
3155 {
3156     unsigned int i, j;
3157     static const char *texcoords[8] =
3158     {
3159         "result.texcoord[0]", "result.texcoord[1]", "result.texcoord[2]", "result.texcoord[3]",
3160         "result.texcoord[4]", "result.texcoord[5]", "result.texcoord[6]", "result.texcoord[7]"
3161     };
3162     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) shader->baseShader.device;
3163     const struct wined3d_shader_signature_element *sig;
3164     const char *semantic_name;
3165     DWORD semantic_idx, reg_idx;
3166
3167     /* Write generic input varyings 0 to 7 to result.texcoord[], varying 8 to result.color.primary
3168      * and varying 9 to result.color.secondary
3169      */
3170     const char *decl_idx_to_string[MAX_REG_INPUT] =
3171     {
3172         texcoords[0], texcoords[1], texcoords[2], texcoords[3],
3173         texcoords[4], texcoords[5], texcoords[6], texcoords[7],
3174         "result.color.primary", "result.color.secondary"
3175     };
3176
3177     if(sig_num == ~0)
3178     {
3179         TRACE("Pixel shader uses builtin varyings\n");
3180         /* Map builtins to builtins */
3181         for(i = 0; i < 8; i++)
3182         {
3183             priv_ctx->texcrd_output[i] = texcoords[i];
3184         }
3185         priv_ctx->color_output[0] = "result.color.primary";
3186         priv_ctx->color_output[1] = "result.color.secondary";
3187         priv_ctx->fog_output = "result.fogcoord";
3188
3189         /* Map declared regs to builtins. Use "TA" to /dev/null unread output */
3190         for(i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); i++)
3191         {
3192             semantic_name = shader->output_signature[i].semantic_name;
3193             if(semantic_name == NULL) continue;
3194
3195             if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_POSITION))
3196             {
3197                 TRACE("o%u is TMP_OUT\n", i);
3198                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "TMP_OUT";
3199                 else priv_ctx->vs_output[i] = "TA";
3200             }
3201             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_PSIZE))
3202             {
3203                 TRACE("o%u is result.pointsize\n", i);
3204                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "result.pointsize";
3205                 else priv_ctx->vs_output[i] = "TA";
3206             }
3207             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3208             {
3209                 TRACE("o%u is result.color.?, idx %u\n", i, shader->output_signature[i].semantic_idx);
3210                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "result.color.primary";
3211                 else if(shader->output_signature[i].semantic_idx == 1) priv_ctx->vs_output[i] = "result.color.secondary";
3212                 else priv_ctx->vs_output[i] = "TA";
3213             }
3214             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3215             {
3216                 TRACE("o%u is %s\n", i, texcoords[shader->output_signature[i].semantic_idx]);
3217                 if(shader->output_signature[i].semantic_idx >= 8) priv_ctx->vs_output[i] = "TA";
3218                 else priv_ctx->vs_output[i] = texcoords[shader->output_signature[i].semantic_idx];
3219             }
3220             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3221             {
3222                 TRACE("o%u is result.fogcoord\n", i);
3223                 if(shader->output_signature[i].semantic_idx > 0) priv_ctx->vs_output[i] = "TA";
3224                 else priv_ctx->vs_output[i] = "result.fogcoord";
3225             }
3226             else
3227             {
3228                 priv_ctx->vs_output[i] = "TA";
3229             }
3230         }
3231         return;
3232     }
3233
3234     /* Instead of searching for the signature in the signature list, read the one from the current pixel shader.
3235      * Its maybe not the shader where the signature came from, but it is the same signature and faster to find
3236      */
3237     sig = ((IWineD3DPixelShaderImpl *)device->stateBlock->pixelShader)->input_signature;
3238     TRACE("Pixel shader uses declared varyings\n");
3239
3240     /* Map builtin to declared. /dev/null the results by default to the TA temp reg */
3241     for(i = 0; i < 8; i++)
3242     {
3243         priv_ctx->texcrd_output[i] = "TA";
3244     }
3245     priv_ctx->color_output[0] = "TA";
3246     priv_ctx->color_output[1] = "TA";
3247     priv_ctx->fog_output = "TA";
3248
3249     for(i = 0; i < MAX_REG_INPUT; i++)
3250     {
3251         semantic_name = sig[i].semantic_name;
3252         semantic_idx = sig[i].semantic_idx;
3253         reg_idx = sig[i].register_idx;
3254         if(semantic_name == NULL) continue;
3255
3256         /* If a declared input register is not written by builtin arguments, don't write to it.
3257          * GL_NV_vertex_program makes sure the input defaults to 0.0, which is correct with D3D
3258          *
3259          * Don't care about POSITION and PSIZE here - this is a builtin vertex shader, position goes
3260          * to TMP_OUT in any case
3261          */
3262         if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3263         {
3264             if(semantic_idx < 8) priv_ctx->texcrd_output[semantic_idx] = decl_idx_to_string[reg_idx];
3265         }
3266         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3267         {
3268             if(semantic_idx < 2) priv_ctx->color_output[semantic_idx] = decl_idx_to_string[reg_idx];
3269         }
3270         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3271         {
3272             if(semantic_idx == 0) priv_ctx->fog_output = decl_idx_to_string[reg_idx];
3273         }
3274     }
3275
3276     /* Map declared to declared */
3277     for(i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); i++)
3278     {
3279         /* Write unread output to TA to throw them away */
3280         priv_ctx->vs_output[i] = "TA";
3281         semantic_name = shader->output_signature[i].semantic_name;
3282         if(semantic_name == NULL)
3283         {
3284             continue;
3285         }
3286
3287         if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_POSITION) &&
3288            shader->output_signature[i].semantic_idx == 0)
3289         {
3290             priv_ctx->vs_output[i] = "TMP_OUT";
3291             continue;
3292         }
3293         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_PSIZE) &&
3294            shader->output_signature[i].semantic_idx == 0)
3295         {
3296             priv_ctx->vs_output[i] = "result.pointsize";
3297             continue;
3298         }
3299
3300         for(j = 0; j < MAX_REG_INPUT; j++)
3301         {
3302             if(sig[j].semantic_name == NULL)
3303             {
3304                 continue;
3305             }
3306
3307             if(strcmp(sig[j].semantic_name, semantic_name) == 0 &&
3308                sig[j].semantic_idx == shader->output_signature[i].semantic_idx)
3309             {
3310                 priv_ctx->vs_output[i] = decl_idx_to_string[sig[j].register_idx];
3311             }
3312         }
3313     }
3314 }
3315
3316 /* GL locking is done by the caller */
3317 static GLuint shader_arb_generate_vshader(IWineD3DVertexShaderImpl *This,
3318         SHADER_BUFFER *buffer, const struct arb_vs_compile_args *args, struct arb_vs_compiled_shader *compiled)
3319 {
3320     const shader_reg_maps *reg_maps = &This->baseShader.reg_maps;
3321     CONST DWORD *function = This->baseShader.function;
3322     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)This->baseShader.device;
3323     const WineD3D_GL_Info *gl_info = &device->adapter->gl_info;
3324     const local_constant *lconst;
3325     GLuint ret;
3326     DWORD next_local, *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This);
3327     struct shader_arb_ctx_priv priv_ctx;
3328     unsigned int i;
3329     DWORD num_clipplanes = 0;
3330
3331     memset(&priv_ctx, 0, sizeof(priv_ctx));
3332     priv_ctx.cur_vs_args = args;
3333     list_init(&priv_ctx.control_frames);
3334     init_output_registers(This, args->ps_signature, &priv_ctx);
3335
3336     /*  Create the hw ARB shader */
3337     shader_addline(buffer, "!!ARBvp1.0\n");
3338
3339     /* Always enable the NV extension if available. Unlike fragment shaders, there is no
3340      * mesurable performance penalty, and we can always make use of it for clipplanes.
3341      */
3342     if(GL_SUPPORT(NV_VERTEX_PROGRAM3)) {
3343         shader_addline(buffer, "OPTION NV_vertex_program3;\n");
3344         priv_ctx.target_version = NV3;
3345         shader_addline(buffer, "ADDRESS aL;\n");
3346     } else if(GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) {
3347         shader_addline(buffer, "OPTION NV_vertex_program2;\n");
3348         priv_ctx.target_version = NV2;
3349         shader_addline(buffer, "ADDRESS aL;\n");
3350     } else {
3351         priv_ctx.target_version = ARB;
3352     }
3353
3354     shader_addline(buffer, "TEMP TMP_OUT;\n");
3355     if(need_helper_const(gl_info)) {
3356         shader_addline(buffer, "PARAM helper_const = { 2.0, -1.0, %d.0, 0.0 };\n", This->rel_offset);
3357     }
3358     if(need_mova_const((IWineD3DBaseShader *) This, gl_info)) {
3359         shader_addline(buffer, "PARAM mova_const = { 0.5, 0.0, 2.0, 1.0 };\n");
3360         shader_addline(buffer, "TEMP A0_SHADOW;\n");
3361     }
3362
3363     shader_addline(buffer, "TEMP TA;\n");
3364
3365     /* Base Declarations */
3366     next_local = shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION,
3367             lconst_map, &num_clipplanes, &priv_ctx);
3368
3369     for(i = 0; i < MAX_CONST_I; i++)
3370     {
3371         compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
3372         if(reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
3373         {
3374             const DWORD *control_values = find_loop_control_values((IWineD3DBaseShaderImpl *) This, i);
3375
3376             if(control_values)
3377             {
3378                 shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
3379                                 control_values[0], control_values[1], control_values[2]);
3380             }
3381             else
3382             {
3383                 compiled->int_consts[i] = next_local;
3384                 compiled->num_int_consts++;
3385                 shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
3386             }
3387         }
3388     }
3389
3390     /* We need a constant to fixup the final position */
3391     shader_addline(buffer, "PARAM posFixup = program.env[%d];\n", ARB_SHADER_PRIVCONST_POS);
3392
3393     /* Initialize output parameters. GL_ARB_vertex_program does not require special initialization values
3394      * for output parameters. D3D in theory does not do that either, but some applications depend on a
3395      * proper initialization of the secondary color, and programs using the fixed function pipeline without
3396      * a replacement shader depend on the texcoord.w being set properly.
3397      *
3398      * GL_NV_vertex_program defines that all output values are initialized to {0.0, 0.0, 0.0, 1.0}. This
3399      * assertion is in effect even when using GL_ARB_vertex_program without any NV specific additions. So
3400      * skip this if NV_vertex_program is supported. Otherwise, initialize the secondary color. For the tex-
3401      * coords, we have a flag in the opengl caps. Many cards do not require the texcoord being set, and
3402      * this can eat a number of instructions, so skip it unless this cap is set as well
3403      */
3404     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)) {
3405         shader_addline(buffer, "MOV result.color.secondary, -helper_const.wwwy;\n");
3406
3407         if((GLINFO_LOCATION).set_texcoord_w && !device->frag_pipe->ffp_proj_control) {
3408             int i;
3409             for(i = 0; i < min(8, MAX_REG_TEXCRD); i++) {
3410                 if(This->baseShader.reg_maps.texcoord_mask[i] != 0 &&
3411                 This->baseShader.reg_maps.texcoord_mask[i] != WINED3DSP_WRITEMASK_ALL) {
3412                     shader_addline(buffer, "MOV result.texcoord[%u].w, -helper_const.y;\n", i);
3413                 }
3414             }
3415         }
3416     }
3417
3418     /* Base Shader Body */
3419     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
3420
3421     /* The D3DRS_FOGTABLEMODE render state defines if the shader-generated fog coord is used
3422      * or if the fragment depth is used. If the fragment depth is used(FOGTABLEMODE != NONE),
3423      * the fog frag coord is thrown away. If the fog frag coord is used, but not written by
3424      * the shader, it is set to 0.0(fully fogged, since start = 1.0, end = 0.0)
3425      */
3426     if(args->super.fog_src == VS_FOG_Z) {
3427         shader_addline(buffer, "MOV result.fogcoord, TMP_OUT.z;\n");
3428     } else if (!reg_maps->fog) {
3429         /* posFixup.x is always 1.0, so we can savely use it */
3430         shader_addline(buffer, "ADD result.fogcoord, posFixup.x, -posFixup.x;\n");
3431     }
3432
3433     /* Write the final position.
3434      *
3435      * OpenGL coordinates specify the center of the pixel while d3d coords specify
3436      * the corner. The offsets are stored in z and w in posFixup. posFixup.y contains
3437      * 1.0 or -1.0 to turn the rendering upside down for offscreen rendering. PosFixup.x
3438      * contains 1.0 to allow a mad, but arb vs swizzles are too restricted for that.
3439      */
3440     shader_addline(buffer, "MUL TA, posFixup, TMP_OUT.w;\n");
3441     shader_addline(buffer, "ADD TMP_OUT.x, TMP_OUT.x, TA.z;\n");
3442     shader_addline(buffer, "MAD TMP_OUT.y, TMP_OUT.y, posFixup.y, TA.w;\n");
3443
3444     for(i = 0; i < num_clipplanes; i++)
3445     {
3446         shader_addline(buffer, "DP4 result.clip[%u].x, TMP_OUT, state.clip[%u].plane;\n", i, i);
3447     }
3448
3449     /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection in state.c
3450      * and the glsl equivalent
3451      */
3452     if(need_helper_const(gl_info)) {
3453         shader_addline(buffer, "MAD TMP_OUT.z, TMP_OUT.z, helper_const.x, -TMP_OUT.w;\n");
3454     } else {
3455         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, TMP_OUT.z;\n");
3456         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, -TMP_OUT.w;\n");
3457     }
3458
3459     shader_addline(buffer, "MOV result.position, TMP_OUT;\n");
3460
3461     shader_addline(buffer, "END\n");
3462
3463     /* TODO: change to resource.glObjectHandle or something like that */
3464     GL_EXTCALL(glGenProgramsARB(1, &ret));
3465
3466     TRACE("Creating a hw vertex shader, prg=%d\n", ret);
3467     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, ret));
3468
3469     TRACE("Created hw vertex shader, prg=%d\n", ret);
3470     /* Create the program and check for errors */
3471     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3472                buffer->bsize, buffer->buffer));
3473
3474     if (glGetError() == GL_INVALID_OPERATION) {
3475         GLint errPos;
3476         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
3477         FIXME("HW VertexShader Error at position %d: %s\n",
3478               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3479         ret = -1;
3480     } else {
3481         /* Load immediate constants */
3482         if(lconst_map) {
3483             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
3484                 const float *value = (const float *)lconst->value;
3485                 GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, lconst_map[lconst->idx], value));
3486             }
3487         }
3488     }
3489     HeapFree(GetProcessHeap(), 0, lconst_map);
3490
3491     return ret;
3492 }
3493
3494 /* GL locking is done by the caller */
3495 static struct arb_ps_compiled_shader *find_arb_pshader(IWineD3DPixelShaderImpl *shader, const struct arb_ps_compile_args *args)
3496 {
3497     UINT i;
3498     DWORD new_size;
3499     struct arb_ps_compiled_shader *new_array;
3500     SHADER_BUFFER buffer;
3501     struct arb_pshader_private *shader_data;
3502     GLuint ret;
3503
3504     if(!shader->backend_priv) {
3505         shader->backend_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
3506     }
3507     shader_data = shader->backend_priv;
3508
3509     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
3510      * so a linear search is more performant than a hashmap or a binary search
3511      * (cache coherency etc)
3512      */
3513     for(i = 0; i < shader_data->num_gl_shaders; i++) {
3514         if(memcmp(&shader_data->gl_shaders[i].args, args, sizeof(*args)) == 0) {
3515             return &shader_data->gl_shaders[i];
3516         }
3517     }
3518
3519     TRACE("No matching GL shader found, compiling a new shader\n");
3520     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
3521         if (shader_data->num_gl_shaders)
3522         {
3523             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
3524             new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
3525                                     new_size * sizeof(*shader_data->gl_shaders));
3526         } else {
3527             new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
3528             new_size = 1;
3529         }
3530
3531         if(!new_array) {
3532             ERR("Out of memory\n");
3533             return 0;
3534         }
3535         shader_data->gl_shaders = new_array;
3536         shader_data->shader_array_size = new_size;
3537     }
3538
3539     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
3540
3541     pixelshader_update_samplers(&shader->baseShader.reg_maps,
3542             ((IWineD3DDeviceImpl *)shader->baseShader.device)->stateBlock->textures);
3543
3544     shader_buffer_init(&buffer);
3545     ret = shader_arb_generate_pshader(shader, &buffer, args,
3546                                       &shader_data->gl_shaders[shader_data->num_gl_shaders]);
3547     shader_buffer_free(&buffer);
3548     shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
3549
3550     return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
3551 }
3552
3553 static inline BOOL vs_args_equal(const struct arb_vs_compile_args *stored, const struct arb_vs_compile_args *new,
3554                                  const DWORD use_map, BOOL skip_int) {
3555     if((stored->super.swizzle_map & use_map) != new->super.swizzle_map) return FALSE;
3556     if(stored->super.fog_src != new->super.fog_src) return FALSE;
3557     if(stored->bools != new->bools) return FALSE;
3558     if(stored->ps_signature != new->ps_signature) return FALSE;
3559     if(skip_int) return TRUE;
3560
3561     return memcmp(stored->loop_ctrl, new->loop_ctrl, sizeof(stored->loop_ctrl)) == 0;
3562 }
3563
3564 static struct arb_vs_compiled_shader *find_arb_vshader(IWineD3DVertexShaderImpl *shader, const struct arb_vs_compile_args *args)
3565 {
3566     UINT i;
3567     DWORD new_size;
3568     struct arb_vs_compiled_shader *new_array;
3569     DWORD use_map = ((IWineD3DDeviceImpl *)shader->baseShader.device)->strided_streams.use_map;
3570     SHADER_BUFFER buffer;
3571     struct arb_vshader_private *shader_data;
3572     GLuint ret;
3573     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
3574
3575     if(!shader->backend_priv) {
3576         shader->backend_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
3577     }
3578     shader_data = shader->backend_priv;
3579
3580     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
3581      * so a linear search is more performant than a hashmap or a binary search
3582      * (cache coherency etc)
3583      */
3584     for(i = 0; i < shader_data->num_gl_shaders; i++) {
3585         if(vs_args_equal(&shader_data->gl_shaders[i].args, args, use_map, GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION))) {
3586             return &shader_data->gl_shaders[i];
3587         }
3588     }
3589
3590     TRACE("No matching GL shader found, compiling a new shader\n");
3591
3592     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
3593         if (shader_data->num_gl_shaders)
3594         {
3595             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
3596             new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
3597                                     new_size * sizeof(*shader_data->gl_shaders));
3598         } else {
3599             new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
3600             new_size = 1;
3601         }
3602
3603         if(!new_array) {
3604             ERR("Out of memory\n");
3605             return 0;
3606         }
3607         shader_data->gl_shaders = new_array;
3608         shader_data->shader_array_size = new_size;
3609     }
3610
3611     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
3612
3613     shader_buffer_init(&buffer);
3614     ret = shader_arb_generate_vshader(shader, &buffer, args,
3615             &shader_data->gl_shaders[shader_data->num_gl_shaders]);
3616     shader_buffer_free(&buffer);
3617     shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
3618
3619     return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
3620 }
3621
3622 static inline void find_arb_ps_compile_args(IWineD3DPixelShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
3623         struct arb_ps_compile_args *args)
3624 {
3625     int i;
3626     WORD int_skip;
3627     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
3628     find_ps_compile_args(shader, stateblock, &args->super);
3629
3630     /* This forces all local boolean constants to 1 to make them stateblock independent */
3631     args->bools = shader->baseShader.reg_maps.local_bool_consts;
3632
3633     for(i = 0; i < MAX_CONST_B; i++)
3634     {
3635         if(stateblock->pixelShaderConstantB[i]) args->bools |= ( 1 << i);
3636     }
3637
3638     /* Skip if unused or local, or supported natively */
3639     int_skip = ~shader->baseShader.reg_maps.integer_constants | shader->baseShader.reg_maps.local_int_consts;
3640     if(int_skip == 0xffff || GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION))
3641     {
3642         memset(&args->loop_ctrl, 0, sizeof(args->loop_ctrl));
3643         return;
3644     }
3645
3646     for(i = 0; i < MAX_CONST_I; i++)
3647     {
3648         if(int_skip & (1 << i))
3649         {
3650             args->loop_ctrl[i][0] = 0;
3651             args->loop_ctrl[i][1] = 0;
3652             args->loop_ctrl[i][2] = 0;
3653         }
3654         else
3655         {
3656             args->loop_ctrl[i][0] = stateblock->pixelShaderConstantI[i * 4];
3657             args->loop_ctrl[i][1] = stateblock->pixelShaderConstantI[i * 4 + 1];
3658             args->loop_ctrl[i][2] = stateblock->pixelShaderConstantI[i * 4 + 2];
3659         }
3660     }
3661 }
3662
3663 static inline void find_arb_vs_compile_args(IWineD3DVertexShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
3664         struct arb_vs_compile_args *args)
3665 {
3666     int i;
3667     WORD int_skip;
3668     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
3669     find_vs_compile_args(shader, stateblock, &args->super);
3670
3671     /* This forces all local boolean constants to 1 to make them stateblock independent */
3672     args->bools = shader->baseShader.reg_maps.local_bool_consts;
3673
3674     if(use_ps(stateblock))
3675     {
3676         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) stateblock->pixelShader;
3677         struct arb_pshader_private *shader_priv = ps->backend_priv;
3678         args->ps_signature = shader_priv->input_signature_idx;
3679     }
3680     else args->ps_signature = ~0;
3681
3682     /* TODO: Figure out if it would be better to store bool constants as bitmasks in the stateblock */
3683     for(i = 0; i < MAX_CONST_B; i++)
3684     {
3685         if(stateblock->vertexShaderConstantB[i]) args->bools |= ( 1 << i);
3686     }
3687
3688     /* Skip if unused or local */
3689     int_skip = ~shader->baseShader.reg_maps.integer_constants | shader->baseShader.reg_maps.local_int_consts;
3690     if(int_skip == 0xffff || GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION))
3691     {
3692         memset(&args->loop_ctrl, 0, sizeof(args->loop_ctrl));
3693         return;
3694     }
3695
3696     for(i = 0; i < MAX_CONST_I; i++)
3697     {
3698         if(int_skip & (1 << i))
3699         {
3700             args->loop_ctrl[i][0] = 0;
3701             args->loop_ctrl[i][1] = 0;
3702             args->loop_ctrl[i][2] = 0;
3703         }
3704         else
3705         {
3706             args->loop_ctrl[i][0] = stateblock->vertexShaderConstantI[i * 4];
3707             args->loop_ctrl[i][1] = stateblock->vertexShaderConstantI[i * 4 + 1];
3708             args->loop_ctrl[i][2] = stateblock->vertexShaderConstantI[i * 4 + 2];
3709         }
3710     }
3711 }
3712
3713 /* GL locking is done by the caller */
3714 static void shader_arb_select(IWineD3DDevice *iface, BOOL usePS, BOOL useVS) {
3715     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
3716     struct shader_arb_priv *priv = This->shader_priv;
3717     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
3718
3719     /* Deal with pixel shaders first so the vertex shader arg function has the input signature ready */
3720     if (usePS) {
3721         struct arb_ps_compile_args compile_args;
3722         struct arb_ps_compiled_shader *compiled;
3723         struct arb_pshader_private *shader_priv;
3724         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) This->stateBlock->pixelShader;
3725
3726         TRACE("Using pixel shader %p\n", This->stateBlock->pixelShader);
3727         find_arb_ps_compile_args(ps, This->stateBlock, &compile_args);
3728         compiled = find_arb_pshader(ps, &compile_args);
3729         priv->current_fprogram_id = compiled->prgId;
3730         priv->compiled_fprog = compiled;
3731
3732         shader_priv = ps->backend_priv;
3733         if(!shader_priv->has_signature_idx)
3734         {
3735             if(ps->baseShader.reg_maps.shader_version.major < 3) shader_priv->input_signature_idx = ~0;
3736             else shader_priv->input_signature_idx = find_input_signature(priv, ps->input_signature);
3737
3738             shader_priv->has_signature_idx = TRUE;
3739             TRACE("Shader got assigned input signature index %u\n", shader_priv->input_signature_idx);
3740         }
3741
3742         /* Bind the fragment program */
3743         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
3744         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id);");
3745
3746         if(!priv->use_arbfp_fixed_func) {
3747             /* Enable OpenGL fragment programs */
3748             glEnable(GL_FRAGMENT_PROGRAM_ARB);
3749             checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
3750         }
3751         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
3752
3753         shader_arb_ps_local_constants(This);
3754     } else if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM) && !priv->use_arbfp_fixed_func) {
3755         /* Disable only if we're not using arbfp fixed function fragment processing. If this is used,
3756         * keep GL_FRAGMENT_PROGRAM_ARB enabled, and the fixed function pipeline will bind the fixed function
3757         * replacement shader
3758         */
3759         glDisable(GL_FRAGMENT_PROGRAM_ARB);
3760         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
3761         priv->current_fprogram_id = 0;
3762     }
3763
3764     if (useVS) {
3765         struct arb_vs_compile_args compile_args;
3766         struct arb_vs_compiled_shader *compiled;
3767
3768         TRACE("Using vertex shader %p\n", This->stateBlock->vertexShader);
3769         find_arb_vs_compile_args((IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader, This->stateBlock, &compile_args);
3770         compiled = find_arb_vshader((IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader, &compile_args);
3771         priv->current_vprogram_id = compiled->prgId;
3772         priv->compiled_vprog = compiled;
3773
3774         /* Bind the vertex program */
3775         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
3776         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id);");
3777
3778         /* Enable OpenGL vertex programs */
3779         glEnable(GL_VERTEX_PROGRAM_ARB);
3780         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
3781         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
3782         shader_arb_vs_local_constants(This);
3783     } else if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
3784         priv->current_vprogram_id = 0;
3785         glDisable(GL_VERTEX_PROGRAM_ARB);
3786         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
3787     }
3788 }
3789
3790 /* GL locking is done by the caller */
3791 static void shader_arb_select_depth_blt(IWineD3DDevice *iface, enum tex_types tex_type) {
3792     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
3793     struct shader_arb_priv *priv = This->shader_priv;
3794     GLuint *blt_fprogram = &priv->depth_blt_fprogram_id[tex_type];
3795     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
3796
3797     if (!priv->depth_blt_vprogram_id) priv->depth_blt_vprogram_id = create_arb_blt_vertex_program(gl_info);
3798     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->depth_blt_vprogram_id));
3799     glEnable(GL_VERTEX_PROGRAM_ARB);
3800
3801     if (!*blt_fprogram) *blt_fprogram = create_arb_blt_fragment_program(gl_info, tex_type);
3802     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, *blt_fprogram));
3803     glEnable(GL_FRAGMENT_PROGRAM_ARB);
3804 }
3805
3806 /* GL locking is done by the caller */
3807 static void shader_arb_deselect_depth_blt(IWineD3DDevice *iface) {
3808     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
3809     struct shader_arb_priv *priv = This->shader_priv;
3810     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
3811
3812     if (priv->current_vprogram_id) {
3813         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
3814         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexShader->prgId);");
3815
3816         glEnable(GL_VERTEX_PROGRAM_ARB);
3817         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
3818
3819         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
3820     } else {
3821         glDisable(GL_VERTEX_PROGRAM_ARB);
3822         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
3823     }
3824
3825     if (priv->current_fprogram_id) {
3826         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
3827         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, pixelShader->prgId);");
3828
3829         glEnable(GL_FRAGMENT_PROGRAM_ARB);
3830         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
3831
3832         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
3833     } else {
3834         glDisable(GL_FRAGMENT_PROGRAM_ARB);
3835         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
3836     }
3837 }
3838
3839 static void shader_arb_destroy(IWineD3DBaseShader *iface) {
3840     IWineD3DBaseShaderImpl *baseShader = (IWineD3DBaseShaderImpl *) iface;
3841     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)baseShader->baseShader.device;
3842     const WineD3D_GL_Info *gl_info = &device->adapter->gl_info;
3843
3844     ActivateContext(device, device->lastActiveRenderTarget, CTXUSAGE_RESOURCELOAD);
3845
3846     if (shader_is_pshader_version(baseShader->baseShader.reg_maps.shader_version.type))
3847     {
3848         IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *) iface;
3849         struct arb_pshader_private *shader_data = This->backend_priv;
3850         UINT i;
3851
3852         if(!shader_data) return; /* This can happen if a shader was never compiled */
3853         ENTER_GL();
3854         for(i = 0; i < shader_data->num_gl_shaders; i++) {
3855             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
3856             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
3857         }
3858         LEAVE_GL();
3859         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
3860         HeapFree(GetProcessHeap(), 0, shader_data);
3861         This->backend_priv = NULL;
3862     } else {
3863         IWineD3DVertexShaderImpl *This = (IWineD3DVertexShaderImpl *) iface;
3864         struct arb_vshader_private *shader_data = This->backend_priv;
3865         UINT i;
3866
3867         if(!shader_data) return; /* This can happen if a shader was never compiled */
3868         ENTER_GL();
3869         for(i = 0; i < shader_data->num_gl_shaders; i++) {
3870             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
3871             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
3872         }
3873         LEAVE_GL();
3874         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
3875         HeapFree(GetProcessHeap(), 0, shader_data);
3876         This->backend_priv = NULL;
3877     }
3878 }
3879
3880 static int sig_tree_compare(const void *key, const struct wine_rb_entry *entry)
3881 {
3882     struct ps_signature *e = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
3883     return compare_sig(key, e->sig);
3884 }
3885
/* Callback table passed to wine_rb_init() for the pixel shader input
 * signature tree: the three wined3d_rb_* memory helpers followed by the
 * signature comparison callback. */
struct wine_rb_functions sig_tree_functions =
{
    wined3d_rb_alloc,
    wined3d_rb_realloc,
    wined3d_rb_free,
    sig_tree_compare
};
3893
3894 static HRESULT shader_arb_alloc(IWineD3DDevice *iface) {
3895     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
3896     struct shader_arb_priv *priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*priv));
3897     if(wine_rb_init(&priv->signature_tree, &sig_tree_functions) == -1)
3898     {
3899         ERR("RB tree init failed\n");
3900         HeapFree(GetProcessHeap(), 0, priv);
3901         return E_OUTOFMEMORY;
3902     }
3903     This->shader_priv = priv;
3904     return WINED3D_OK;
3905 }
3906
3907 static void release_signature(struct wine_rb_entry *entry, void *context)
3908 {
3909     struct ps_signature *sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
3910     int i;
3911     for(i = 0; i < MAX_REG_INPUT; i++)
3912     {
3913         HeapFree(GetProcessHeap(), 0, (char *) sig->sig[i].semantic_name);
3914     }
3915     HeapFree(GetProcessHeap(), 0, sig->sig);
3916     HeapFree(GetProcessHeap(), 0, sig);
3917 }
3918
3919 static void shader_arb_free(IWineD3DDevice *iface) {
3920     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
3921     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
3922     struct shader_arb_priv *priv = This->shader_priv;
3923     int i;
3924
3925     ENTER_GL();
3926     if(priv->depth_blt_vprogram_id) {
3927         GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_vprogram_id));
3928     }
3929     for (i = 0; i < tex_type_count; ++i) {
3930         if (priv->depth_blt_fprogram_id[i]) {
3931             GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_fprogram_id[i]));
3932         }
3933     }
3934     LEAVE_GL();
3935
3936     wine_rb_destroy(&priv->signature_tree, release_signature, NULL);
3937     HeapFree(GetProcessHeap(), 0, This->shader_priv);
3938 }
3939
3940 static BOOL shader_arb_dirty_const(IWineD3DDevice *iface) {
3941     return TRUE;
3942 }
3943
3944 static void shader_arb_get_caps(WINED3DDEVTYPE devtype, const WineD3D_GL_Info *gl_info, struct shader_caps *pCaps)
3945 {
3946     /* We don't have an ARB fixed function pipeline yet, so let the none backend set its caps,
3947      * then overwrite the shader specific ones
3948      */
3949     none_shader_backend.shader_get_caps(devtype, gl_info, pCaps);
3950
3951     if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
3952         pCaps->VertexShaderVersion = WINED3DVS_VERSION(1,1);
3953         TRACE_(d3d_caps)("Hardware vertex shader version 1.1 enabled (ARB_PROGRAM)\n");
3954         pCaps->MaxVertexShaderConst = GL_LIMITS(vshader_constantsF) - 1;
3955     }
3956
3957     if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM)) {
3958         pCaps->PixelShaderVersion    = WINED3DPS_VERSION(1,4);
3959         pCaps->PixelShader1xMaxValue = 8.0;
3960         TRACE_(d3d_caps)("Hardware pixel shader version 1.4 enabled (ARB_PROGRAM)\n");
3961         pCaps->MaxPixelShaderConst = GL_LIMITS(pshader_constantsF);
3962     }
3963
3964     pCaps->VSClipping = GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
3965 }
3966
3967 static BOOL shader_arb_color_fixup_supported(struct color_fixup_desc fixup)
3968 {
3969     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
3970     {
3971         TRACE("Checking support for color_fixup:\n");
3972         dump_color_fixup_desc(fixup);
3973     }
3974
3975     /* We support everything except YUV conversions. */
3976     if (!is_yuv_fixup(fixup))
3977     {
3978         TRACE("[OK]\n");
3979         return TRUE;
3980     }
3981
3982     TRACE("[FAILED]\n");
3983     return FALSE;
3984 }
3985
3986 static void shader_arb_add_instruction_modifiers(const struct wined3d_shader_instruction *ins) {
3987     DWORD shift;
3988     char write_mask[20], regstr[50];
3989     SHADER_BUFFER *buffer = ins->ctx->buffer;
3990     BOOL is_color = FALSE;
3991     const struct wined3d_shader_dst_param *dst;
3992
3993     if (!ins->dst_count) return;
3994
3995     dst = &ins->dst[0];
3996     shift = dst->shift;
3997     if(shift == 0) return; /* Saturate alone is handled by the instructions */
3998
3999     shader_arb_get_write_mask(ins, dst, write_mask);
4000     shader_arb_get_register_name(ins, &dst->reg, regstr, &is_color);
4001
4002     /* Generate a line that does the output modifier computation
4003      * FIXME: _SAT vs shift? _SAT alone is already handled in the instructions, if this
4004      * maps problems in e.g. _d4_sat modify shader_arb_get_modifier
4005      */
4006     shader_addline(buffer, "MUL%s %s%s, %s, %s;\n", shader_arb_get_modifier(ins),
4007                    regstr, write_mask, regstr, shift_tab[shift]);
4008 }
4009
/*
 * Per-opcode code generators of the ARB backend. The table has
 * WINED3DSIH_TABLE_SIZE entries and each entry is labelled with the
 * WINED3DSIH_* value it belongs to, so the entries must stay in the order of
 * that enumeration (presumably the table is indexed by the instruction's
 * handler_idx - confirm against the lookup code). A NULL entry means this
 * table provides no generator function for that opcode.
 */
static const SHADER_HANDLER shader_arb_instruction_handler_table[WINED3DSIH_TABLE_SIZE] =
{
    /* WINED3DSIH_ABS           */ shader_hw_map2gl,
    /* WINED3DSIH_ADD           */ shader_hw_map2gl,
    /* WINED3DSIH_BEM           */ pshader_hw_bem,
    /* WINED3DSIH_BREAK         */ shader_hw_break,
    /* WINED3DSIH_BREAKC        */ shader_hw_breakc,
    /* WINED3DSIH_BREAKP        */ NULL,
    /* WINED3DSIH_CALL          */ NULL,
    /* WINED3DSIH_CALLNZ        */ NULL,
    /* WINED3DSIH_CMP           */ pshader_hw_cmp,
    /* WINED3DSIH_CND           */ pshader_hw_cnd,
    /* WINED3DSIH_CRS           */ shader_hw_map2gl,
    /* WINED3DSIH_DCL           */ NULL,
    /* WINED3DSIH_DEF           */ NULL,
    /* WINED3DSIH_DEFB          */ NULL,
    /* WINED3DSIH_DEFI          */ NULL,
    /* WINED3DSIH_DP2ADD        */ pshader_hw_dp2add,
    /* WINED3DSIH_DP3           */ shader_hw_map2gl,
    /* WINED3DSIH_DP4           */ shader_hw_map2gl,
    /* WINED3DSIH_DST           */ shader_hw_map2gl,
    /* WINED3DSIH_DSX           */ shader_hw_map2gl,
    /* WINED3DSIH_DSY           */ shader_hw_dsy,
    /* WINED3DSIH_ELSE          */ shader_hw_else,
    /* WINED3DSIH_ENDIF         */ shader_hw_endif,
    /* WINED3DSIH_ENDLOOP       */ shader_hw_endloop,
    /* WINED3DSIH_ENDREP        */ shader_hw_endrep,
    /* WINED3DSIH_EXP           */ shader_hw_map2gl,
    /* WINED3DSIH_EXPP          */ shader_hw_map2gl,
    /* WINED3DSIH_FRC           */ shader_hw_map2gl,
    /* WINED3DSIH_IF            */ NULL /* Hardcoded into the shader */,
    /* WINED3DSIH_IFC           */ shader_hw_ifc,
    /* WINED3DSIH_LABEL         */ NULL,
    /* WINED3DSIH_LIT           */ shader_hw_map2gl,
    /* WINED3DSIH_LOG           */ shader_hw_map2gl,
    /* WINED3DSIH_LOGP          */ shader_hw_map2gl,
    /* WINED3DSIH_LOOP          */ shader_hw_loop,
    /* WINED3DSIH_LRP           */ shader_hw_lrp,
    /* WINED3DSIH_M3x2          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x4          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x4          */ shader_hw_mnxn,
    /* WINED3DSIH_MAD           */ shader_hw_map2gl,
    /* WINED3DSIH_MAX           */ shader_hw_map2gl,
    /* WINED3DSIH_MIN           */ shader_hw_map2gl,
    /* WINED3DSIH_MOV           */ shader_hw_mov,
    /* WINED3DSIH_MOVA          */ shader_hw_mov,
    /* WINED3DSIH_MUL           */ shader_hw_map2gl,
    /* WINED3DSIH_NOP           */ shader_hw_nop,
    /* WINED3DSIH_NRM           */ shader_hw_nrm,
    /* WINED3DSIH_PHASE         */ NULL,
    /* WINED3DSIH_POW           */ shader_hw_map2gl,
    /* WINED3DSIH_RCP           */ shader_hw_rsq_rcp,
    /* WINED3DSIH_REP           */ shader_hw_rep,
    /* WINED3DSIH_RET           */ NULL,
    /* WINED3DSIH_RSQ           */ shader_hw_rsq_rcp,
    /* WINED3DSIH_SETP          */ NULL,
    /* WINED3DSIH_SGE           */ shader_hw_map2gl,
    /* WINED3DSIH_SGN           */ shader_hw_sgn,
    /* WINED3DSIH_SINCOS        */ shader_hw_sincos,
    /* WINED3DSIH_SLT           */ shader_hw_map2gl,
    /* WINED3DSIH_SUB           */ shader_hw_map2gl,
    /* WINED3DSIH_TEX           */ pshader_hw_tex,
    /* WINED3DSIH_TEXBEM        */ pshader_hw_texbem,
    /* WINED3DSIH_TEXBEML       */ pshader_hw_texbem,
    /* WINED3DSIH_TEXCOORD      */ pshader_hw_texcoord,
    /* WINED3DSIH_TEXDEPTH      */ pshader_hw_texdepth,
    /* WINED3DSIH_TEXDP3        */ pshader_hw_texdp3,
    /* WINED3DSIH_TEXDP3TEX     */ pshader_hw_texdp3tex,
    /* WINED3DSIH_TEXKILL       */ pshader_hw_texkill,
    /* WINED3DSIH_TEXLDD        */ shader_hw_texldd,
    /* WINED3DSIH_TEXLDL        */ shader_hw_texldl,
    /* WINED3DSIH_TEXM3x2DEPTH  */ pshader_hw_texm3x2depth,
    /* WINED3DSIH_TEXM3x2PAD    */ pshader_hw_texm3x2pad,
    /* WINED3DSIH_TEXM3x2TEX    */ pshader_hw_texm3x2tex,
    /* WINED3DSIH_TEXM3x3       */ pshader_hw_texm3x3,
    /* WINED3DSIH_TEXM3x3DIFF   */ NULL,
    /* WINED3DSIH_TEXM3x3PAD    */ pshader_hw_texm3x3pad,
    /* WINED3DSIH_TEXM3x3SPEC   */ pshader_hw_texm3x3spec,
    /* WINED3DSIH_TEXM3x3TEX    */ pshader_hw_texm3x3tex,
    /* WINED3DSIH_TEXM3x3VSPEC  */ pshader_hw_texm3x3vspec,
    /* WINED3DSIH_TEXREG2AR     */ pshader_hw_texreg2ar,
    /* WINED3DSIH_TEXREG2GB     */ pshader_hw_texreg2gb,
    /* WINED3DSIH_TEXREG2RGB    */ pshader_hw_texreg2rgb,
};
4096
4097 static inline BOOL get_bool_const(const struct wined3d_shader_instruction *ins, IWineD3DBaseShaderImpl *This, DWORD idx)
4098 {
4099     BOOL vshader = shader_is_vshader_version(This->baseShader.reg_maps.shader_version.type);
4100     WORD bools = 0;
4101     WORD flag = (1 << idx);
4102     const local_constant *constant;
4103     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4104
4105     if(This->baseShader.reg_maps.local_bool_consts & flag)
4106     {
4107         /* What good is a if(bool) with a hardcoded local constant? I don't know, but handle it */
4108         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsB, local_constant, entry)
4109         {
4110             if (constant->idx == idx)
4111             {
4112                 return constant->value[0];
4113             }
4114         }
4115         ERR("Local constant not found\n");
4116         return FALSE;
4117     }
4118     else
4119     {
4120         if(vshader) bools = priv->cur_vs_args->bools;
4121         else bools = priv->cur_ps_args->bools;
4122         return bools & flag;
4123     }
4124 }
4125
4126 static inline void get_int_const(const struct wined3d_shader_instruction *ins, IWineD3DBaseShaderImpl *This, DWORD idx, int *ret)
4127 {
4128     BOOL vshader = shader_is_vshader_version(This->baseShader.reg_maps.shader_version.type);
4129     WORD flag = (1 << idx);
4130     const local_constant *constant;
4131     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4132
4133     /* Integer constants can either be a local constant, or they can be stored in the shader
4134      * type specific compile args
4135      */
4136     if(This->baseShader.reg_maps.local_int_consts & flag)
4137     {
4138         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsI, local_constant, entry)
4139         {
4140             if (constant->idx == idx)
4141             {
4142                 ret[0] = constant->value[0];
4143                 ret[1] = constant->value[1];
4144                 /* Step / stride is signed */
4145                 ret[2] = (int) constant->value[2];
4146                 return;
4147             }
4148         }
4149         /* If this happens the flag was set incorrectly */
4150         ERR("Local constant not found\n");
4151         ret[0] = 0;
4152         ret[1] = 0;
4153         ret[2] = 0;
4154         return;
4155     }
4156     else
4157     {
4158         if(vshader)
4159         {
4160             /* Count and aL start value are unsigned */
4161             ret[0] = priv->cur_vs_args->loop_ctrl[idx][0];
4162             ret[1] = priv->cur_vs_args->loop_ctrl[idx][1];
4163             /* The step/stride is signed */
4164             ret[2] = ((char) priv->cur_vs_args->loop_ctrl[idx][2]);
4165         }
4166         else
4167         {
4168             ret[0] = priv->cur_ps_args->loop_ctrl[idx][0];
4169             ret[1] = priv->cur_ps_args->loop_ctrl[idx][1];
4170             ret[2] = ((char) priv->cur_ps_args->loop_ctrl[idx][2]);
4171         }
4172         return;
4173     }
4174 }
4175
4176 static void record_instruction(struct list *list, const struct wined3d_shader_instruction *ins)
4177 {
4178     unsigned int i;
4179     struct wined3d_shader_dst_param *dst_param = NULL;
4180     struct wined3d_shader_src_param *src_param = NULL, *rel_addr = NULL;
4181     struct recorded_instruction *rec = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*rec));
4182     if(!rec)
4183     {
4184         ERR("Out of memory\n");
4185         return;
4186     }
4187
4188     rec->ins = *ins;
4189     dst_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param));
4190     if(!dst_param) goto free;
4191     *dst_param = *ins->dst;
4192     if(ins->dst->reg.rel_addr)
4193     {
4194         rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param->reg.rel_addr));
4195         if(!rel_addr) goto free;
4196         *rel_addr = *ins->dst->reg.rel_addr;
4197         dst_param->reg.rel_addr = rel_addr;
4198     }
4199     rec->ins.dst = dst_param;
4200
4201     src_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*src_param) * ins->src_count);
4202     if(!src_param) goto free;
4203     for(i = 0; i < ins->src_count; i++)
4204     {
4205         src_param[i] = ins->src[i];
4206         if(ins->src[i].reg.rel_addr)
4207         {
4208             rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*rel_addr));
4209             if(!rel_addr) goto free;
4210             *rel_addr = *ins->src[i].reg.rel_addr;
4211             src_param[i].reg.rel_addr = rel_addr;
4212         }
4213     }
4214     rec->ins.src = src_param;
4215     list_add_tail(list, &rec->entry);
4216     return;
4217
4218 free:
4219     ERR("Out of memory\n");
4220     if(dst_param)
4221     {
4222         HeapFree(GetProcessHeap(), 0, (void *) dst_param->reg.rel_addr);
4223         HeapFree(GetProcessHeap(), 0, dst_param);
4224     }
4225     if(src_param)
4226     {
4227         for(i = 0; i < ins->src_count; i++)
4228         {
4229             HeapFree(GetProcessHeap(), 0, (void *) src_param[i].reg.rel_addr);
4230         }
4231         HeapFree(GetProcessHeap(), 0, src_param);
4232     }
4233     HeapFree(GetProcessHeap(), 0, rec);
4234 }
4235
4236 static void free_recorded_instruction(struct list *list)
4237 {
4238     struct recorded_instruction *rec_ins, *entry2;
4239     unsigned int i;
4240
4241     LIST_FOR_EACH_ENTRY_SAFE(rec_ins, entry2, list, struct recorded_instruction, entry)
4242     {
4243         list_remove(&rec_ins->entry);
4244         if(rec_ins->ins.dst)
4245         {
4246             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.dst->reg.rel_addr);
4247             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.dst);
4248         }
4249         if(rec_ins->ins.src)
4250         {
4251             for(i = 0; i < rec_ins->ins.src_count; i++)
4252             {
4253                 HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.src[i].reg.rel_addr);
4254             }
4255             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.src);
4256         }
4257         HeapFree(GetProcessHeap(), 0, rec_ins);
4258     }
4259 }
4260
4261 static void shader_arb_handle_instruction(const struct wined3d_shader_instruction *ins) {
4262     SHADER_HANDLER hw_fct;
4263     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4264     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
4265     struct control_frame *control_frame;
4266     SHADER_BUFFER *buffer = ins->ctx->buffer;
4267
4268     if(ins->handler_idx == WINED3DSIH_LOOP || ins->handler_idx == WINED3DSIH_REP)
4269     {
4270         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4271         list_add_head(&priv->control_frames, &control_frame->entry);
4272
4273         if(ins->handler_idx == WINED3DSIH_LOOP) control_frame->type = LOOP;
4274         if(ins->handler_idx == WINED3DSIH_REP) control_frame->type = REP;
4275
4276         if(priv->target_version >= NV2)
4277         {
4278             control_frame->loop_no = priv->num_loops++;
4279             priv->loop_depth++;
4280         }
4281         else
4282         {
4283             /* Don't bother recording when we're in a not used if branch */
4284             if(priv->muted)
4285             {
4286                 return;
4287             }
4288
4289             if(!priv->recording)
4290             {
4291                 int control_values[3];
4292                 get_int_const(ins, This, ins->src[0].reg.idx, control_values);
4293                 list_init(&priv->record);
4294                 priv->recording = TRUE;
4295                 control_frame->outer_loop = TRUE;
4296                 control_frame->loop_control[0] = control_values[0];
4297                 control_frame->loop_control[1] = control_values[1];
4298                 control_frame->loop_control[2] = control_values[2];
4299                 return; /* Instruction is handled */
4300             }
4301             /* Record this loop in the outer loop's recording */
4302         }
4303     }
4304     else if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
4305     {
4306         if(priv->target_version >= NV2)
4307         {
4308             /* Nothing to do. The control frame is popped after the HW instr handler */
4309         }
4310         else
4311         {
4312             struct list *e = list_head(&priv->control_frames);
4313             control_frame = LIST_ENTRY(e, struct control_frame, entry);
4314             list_remove(&control_frame->entry);
4315
4316             if(control_frame->outer_loop)
4317             {
4318                 int iteration, aL = 0;
4319                 struct list copy;
4320
4321                 /* Turn off recording before playback */
4322                 priv->recording = FALSE;
4323
4324                 /* Move the recorded instructions to a separate list and get them out of the private data
4325                  * structure. If there are nested loops, the shader_arb_handle_instruction below will
4326                  * be recorded again, thus priv->record might be overwritten
4327                  */
4328                 list_init(&copy);
4329                 list_move_tail(&copy, &priv->record);
4330                 list_init(&priv->record);
4331
4332                 if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4333                 {
4334                     shader_addline(buffer, "#unrolling loop: %d iterations, aL=%d, inc %d\n",
4335                                    control_frame->loop_control[0], control_frame->loop_control[1],
4336                                    control_frame->loop_control[2]);
4337                     aL = control_frame->loop_control[1];
4338                 }
4339                 else
4340                 {
4341                     shader_addline(buffer, "#unrolling rep: %d iterations\n", control_frame->loop_control[0]);
4342                 }
4343
4344                 for(iteration = 0; iteration < control_frame->loop_control[0]; iteration++)
4345                 {
4346                     struct recorded_instruction *rec_ins;
4347                     if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4348                     {
4349                         priv->aL = aL;
4350                         shader_addline(buffer, "#Iteration %d, aL=%d\n", iteration, aL);
4351                     }
4352                     else
4353                     {
4354                         shader_addline(buffer, "#Iteration %d\n", iteration);
4355                     }
4356
4357                     LIST_FOR_EACH_ENTRY(rec_ins, &copy, struct recorded_instruction, entry)
4358                     {
4359                         shader_arb_handle_instruction(&rec_ins->ins);
4360                     }
4361
4362                     if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4363                     {
4364                         aL += control_frame->loop_control[2];
4365                     }
4366                 }
4367                 shader_addline(buffer, "#end loop/rep\n");
4368
4369                 free_recorded_instruction(&copy);
4370                 HeapFree(GetProcessHeap(), 0, control_frame);
4371                 return; /* Instruction is handled */
4372             }
4373             else
4374             {
4375                 /* This is a nested loop. Proceed to the normal recording function */
4376                 HeapFree(GetProcessHeap(), 0, control_frame);
4377             }
4378         }
4379     }
4380
4381     if(priv->recording)
4382     {
4383         record_instruction(&priv->record, ins);
4384         return;
4385     }
4386
4387     /* boolean if */
4388     if(ins->handler_idx == WINED3DSIH_IF)
4389     {
4390         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4391         list_add_head(&priv->control_frames, &control_frame->entry);
4392         control_frame->type = IF;
4393
4394         if(!priv->muted && get_bool_const(ins, This, ins->src[0].reg.idx) == FALSE)
4395         {
4396             shader_addline(buffer, "#if(FALSE){\n");
4397             priv->muted = TRUE;
4398             control_frame->muting = TRUE;
4399         }
4400         else shader_addline(buffer, "#if(TRUE) {\n");
4401
4402         return; /* Instruction is handled */
4403     }
4404     else if(ins->handler_idx == WINED3DSIH_IFC)
4405     {
4406         /* IF(bool) and if_cond(a, b) use the same ELSE and ENDIF tokens */
4407         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4408         control_frame->type = IFC;
4409         control_frame->ifc_no = priv->num_ifcs++;
4410         list_add_head(&priv->control_frames, &control_frame->entry);
4411     }
4412     else if(ins->handler_idx == WINED3DSIH_ELSE)
4413     {
4414         struct list *e = list_head(&priv->control_frames);
4415         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4416
4417         if(control_frame->type == IF)
4418         {
4419             shader_addline(buffer, "#} else {\n");
4420             if(!priv->muted && !control_frame->muting)
4421             {
4422                 priv->muted = TRUE;
4423                 control_frame->muting = TRUE;
4424             }
4425             else if(control_frame->muting) priv->muted = FALSE;
4426             return; /* Instruction is handled. */
4427         }
4428         /* In case of an ifc, generate a HW shader instruction */
4429     }
4430     else if(ins->handler_idx == WINED3DSIH_ENDIF)
4431     {
4432         struct list *e = list_head(&priv->control_frames);
4433         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4434
4435         if(control_frame->type == IF)
4436         {
4437             shader_addline(buffer, "#} endif\n");
4438             if(control_frame->muting) priv->muted = FALSE;
4439             list_remove(&control_frame->entry);
4440             HeapFree(GetProcessHeap(), 0, control_frame);
4441             return; /* Instruction is handled */
4442         }
4443     }
4444
4445     if(priv->muted) return;
4446
4447     /* Select handler */
4448     hw_fct = shader_arb_instruction_handler_table[ins->handler_idx];
4449
4450     /* Unhandled opcode */
4451     if (!hw_fct)
4452     {
4453         FIXME("Backend can't handle opcode %#x\n", ins->handler_idx);
4454         return;
4455     }
4456     hw_fct(ins);
4457
4458     if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
4459     {
4460         struct list *e = list_head(&priv->control_frames);
4461         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4462         list_remove(&control_frame->entry);
4463         HeapFree(GetProcessHeap(), 0, control_frame);
4464         priv->loop_depth--;
4465     }
4466     else if(ins->handler_idx == WINED3DSIH_ENDIF)
4467     {
4468         /* Non-ifc ENDIFs don't reach that place because of the return in the if block above */
4469         struct list *e = list_head(&priv->control_frames);
4470         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4471         list_remove(&control_frame->entry);
4472         HeapFree(GetProcessHeap(), 0, control_frame);
4473     }
4474
4475
4476     shader_arb_add_instruction_modifiers(ins);
4477 }
4478
4479 const shader_backend_t arb_program_shader_backend = {
4480     shader_arb_handle_instruction,
4481     shader_arb_select,
4482     shader_arb_select_depth_blt,
4483     shader_arb_deselect_depth_blt,
4484     shader_arb_update_float_vertex_constants,
4485     shader_arb_update_float_pixel_constants,
4486     shader_arb_load_constants,
4487     shader_arb_load_np2fixup_constants,
4488     shader_arb_destroy,
4489     shader_arb_alloc,
4490     shader_arb_free,
4491     shader_arb_dirty_const,
4492     shader_arb_get_caps,
4493     shader_arb_color_fixup_supported,
4494 };
4495
/* ARB_fragment_program fixed function pipeline replacement definitions
 *
 * Indices of the program environment parameters used by the fixed function replacement:
 * the texture factor, the specular enable flag, then eight slots each for the per-stage
 * constants, the bump environment matrices and the luminance parameters. The macro argument
 * is parenthesized so that any expression can be passed as the stage index. */
#define ARB_FFP_CONST_TFACTOR           0
#define ARB_FFP_CONST_SPECULAR_ENABLE   ((ARB_FFP_CONST_TFACTOR) + 1)
#define ARB_FFP_CONST_CONSTANT(i)       ((ARB_FFP_CONST_SPECULAR_ENABLE) + 1 + (i))
#define ARB_FFP_CONST_BUMPMAT(i)        ((ARB_FFP_CONST_CONSTANT(7)) + 1 + (i))
#define ARB_FFP_CONST_LUMINANCE(i)      ((ARB_FFP_CONST_BUMPMAT(7)) + 1 + (i))
4502
4503 struct arbfp_ffp_desc
4504 {
4505     struct ffp_frag_desc parent;
4506     GLuint shader;
4507     unsigned int num_textures_used;
4508 };
4509
4510 static void arbfp_enable(IWineD3DDevice *iface, BOOL enable) {
4511     ENTER_GL();
4512     if(enable) {
4513         glEnable(GL_FRAGMENT_PROGRAM_ARB);
4514         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
4515     } else {
4516         glDisable(GL_FRAGMENT_PROGRAM_ARB);
4517         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
4518     }
4519     LEAVE_GL();
4520 }
4521
4522 static HRESULT arbfp_alloc(IWineD3DDevice *iface) {
4523     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
4524     struct shader_arb_priv *priv;
4525     /* Share private data between the shader backend and the pipeline replacement, if both
4526      * are the arb implementation. This is needed to figure out whether ARBfp should be disabled
4527      * if no pixel shader is bound or not
4528      */
4529     if(This->shader_backend == &arb_program_shader_backend) {
4530         This->fragment_priv = This->shader_priv;
4531     } else {
4532         This->fragment_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct shader_arb_priv));
4533         if(!This->fragment_priv) return E_OUTOFMEMORY;
4534     }
4535     priv = This->fragment_priv;
4536     if (wine_rb_init(&priv->fragment_shaders, &wined3d_ffp_frag_program_rb_functions) == -1)
4537     {
4538         ERR("Failed to initialize rbtree.\n");
4539         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
4540         return E_OUTOFMEMORY;
4541     }
4542     priv->use_arbfp_fixed_func = TRUE;
4543     return WINED3D_OK;
4544 }
4545
4546 static void arbfp_free_ffpshader(struct wine_rb_entry *entry, void *context)
4547 {
4548     const WineD3D_GL_Info *gl_info = context;
4549     struct arbfp_ffp_desc *entry_arb = WINE_RB_ENTRY_VALUE(entry, struct arbfp_ffp_desc, parent.entry);
4550
4551     ENTER_GL();
4552     GL_EXTCALL(glDeleteProgramsARB(1, &entry_arb->shader));
4553     checkGLcall("glDeleteProgramsARB(1, &entry_arb->shader)");
4554     HeapFree(GetProcessHeap(), 0, entry_arb);
4555     LEAVE_GL();
4556 }
4557
4558 static void arbfp_free(IWineD3DDevice *iface) {
4559     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
4560     struct shader_arb_priv *priv = This->fragment_priv;
4561
4562     wine_rb_destroy(&priv->fragment_shaders, arbfp_free_ffpshader, &This->adapter->gl_info);
4563     priv->use_arbfp_fixed_func = FALSE;
4564
4565     if(This->shader_backend != &arb_program_shader_backend) {
4566         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
4567     }
4568 }
4569
4570 static void arbfp_get_caps(WINED3DDEVTYPE devtype, const WineD3D_GL_Info *gl_info, struct fragment_caps *caps)
4571 {
4572     caps->TextureOpCaps =  WINED3DTEXOPCAPS_DISABLE                     |
4573                            WINED3DTEXOPCAPS_SELECTARG1                  |
4574                            WINED3DTEXOPCAPS_SELECTARG2                  |
4575                            WINED3DTEXOPCAPS_MODULATE4X                  |
4576                            WINED3DTEXOPCAPS_MODULATE2X                  |
4577                            WINED3DTEXOPCAPS_MODULATE                    |
4578                            WINED3DTEXOPCAPS_ADDSIGNED2X                 |
4579                            WINED3DTEXOPCAPS_ADDSIGNED                   |
4580                            WINED3DTEXOPCAPS_ADD                         |
4581                            WINED3DTEXOPCAPS_SUBTRACT                    |
4582                            WINED3DTEXOPCAPS_ADDSMOOTH                   |
4583                            WINED3DTEXOPCAPS_BLENDCURRENTALPHA           |
4584                            WINED3DTEXOPCAPS_BLENDFACTORALPHA            |
4585                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHA           |
4586                            WINED3DTEXOPCAPS_BLENDDIFFUSEALPHA           |
4587                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHAPM         |
4588                            WINED3DTEXOPCAPS_MODULATEALPHA_ADDCOLOR      |
4589                            WINED3DTEXOPCAPS_MODULATECOLOR_ADDALPHA      |
4590                            WINED3DTEXOPCAPS_MODULATEINVCOLOR_ADDALPHA   |
4591                            WINED3DTEXOPCAPS_MODULATEINVALPHA_ADDCOLOR   |
4592                            WINED3DTEXOPCAPS_DOTPRODUCT3                 |
4593                            WINED3DTEXOPCAPS_MULTIPLYADD                 |
4594                            WINED3DTEXOPCAPS_LERP                        |
4595                            WINED3DTEXOPCAPS_BUMPENVMAP                  |
4596                            WINED3DTEXOPCAPS_BUMPENVMAPLUMINANCE;
4597
4598     /* TODO: Implement WINED3DTEXOPCAPS_PREMODULATE */
4599
4600     caps->MaxTextureBlendStages   = 8;
4601     caps->MaxSimultaneousTextures = min(GL_LIMITS(fragment_samplers), 8);
4602
4603     caps->PrimitiveMiscCaps |= WINED3DPMISCCAPS_TSSARGTEMP;
4604 }
4605 #undef GLINFO_LOCATION
4606
4607 #define GLINFO_LOCATION stateblock->wineD3DDevice->adapter->gl_info
4608 static void state_texfactor_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4609     float col[4];
4610     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4611
4612     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
4613      * application provided constants
4614      */
4615     if(device->shader_backend == &arb_program_shader_backend) {
4616         if (use_ps(stateblock)) return;
4617
4618         device = stateblock->wineD3DDevice;
4619         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_TFACTOR] = 1;
4620         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_TFACTOR + 1);
4621     }
4622
4623     D3DCOLORTOGLFLOAT4(stateblock->renderState[WINED3DRS_TEXTUREFACTOR], col);
4624     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col));
4625     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col)");
4626
4627 }
4628
4629 static void state_arb_specularenable(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4630     float col[4];
4631     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4632
4633     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
4634      * application provided constants
4635      */
4636     if(device->shader_backend == &arb_program_shader_backend) {
4637         if (use_ps(stateblock)) return;
4638
4639         device = stateblock->wineD3DDevice;
4640         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_SPECULAR_ENABLE] = 1;
4641         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_SPECULAR_ENABLE + 1);
4642     }
4643
4644     if(stateblock->renderState[WINED3DRS_SPECULARENABLE]) {
4645         /* The specular color has no alpha */
4646         col[0] = 1.0; col[1] = 1.0;
4647         col[2] = 1.0; col[3] = 0.0;
4648     } else {
4649         col[0] = 0.0; col[1] = 0.0;
4650         col[2] = 0.0; col[3] = 0.0;
4651     }
4652     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col));
4653     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col)");
4654 }
4655
4656 static void set_bumpmat_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4657     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
4658     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4659     float mat[2][2];
4660
4661     if (use_ps(stateblock))
4662     {
4663         if(stage != 0 &&
4664            ((IWineD3DPixelShaderImpl *) stateblock->pixelShader)->baseShader.reg_maps.bumpmat[stage]) {
4665             /* The pixel shader has to know the bump env matrix. Do a constants update if it isn't scheduled
4666              * anyway
4667              */
4668             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
4669                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
4670             }
4671         }
4672
4673         if(device->shader_backend == &arb_program_shader_backend) {
4674             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
4675             return;
4676         }
4677     } else if(device->shader_backend == &arb_program_shader_backend) {
4678         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_BUMPMAT(stage)] = 1;
4679         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_BUMPMAT(stage) + 1);
4680     }
4681
4682     mat[0][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT00]);
4683     mat[0][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT01]);
4684     mat[1][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT10]);
4685     mat[1][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT11]);
4686
4687     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0]));
4688     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0])");
4689 }
4690
4691 static void tex_bumpenvlum_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4692     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
4693     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4694     float param[4];
4695
4696     if (use_ps(stateblock))
4697     {
4698         if(stage != 0 &&
4699            ((IWineD3DPixelShaderImpl *) stateblock->pixelShader)->baseShader.reg_maps.luminanceparams[stage]) {
4700             /* The pixel shader has to know the luminance offset. Do a constants update if it
4701              * isn't scheduled anyway
4702              */
4703             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
4704                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
4705             }
4706         }
4707
4708         if(device->shader_backend == &arb_program_shader_backend) {
4709             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
4710             return;
4711         }
4712     } else if(device->shader_backend == &arb_program_shader_backend) {
4713         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_LUMINANCE(stage)] = 1;
4714         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_LUMINANCE(stage) + 1);
4715     }
4716
4717     param[0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLSCALE]);
4718     param[1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLOFFSET]);
4719     param[2] = 0.0;
4720     param[3] = 0.0;
4721
4722     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param));
4723     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param)");
4724 }
4725
4726 static const char *get_argreg(SHADER_BUFFER *buffer, DWORD argnum, unsigned int stage, DWORD arg) {
4727     const char *ret;
4728
4729     if(arg == ARG_UNUSED) return "unused"; /* This is the marker for unused registers */
4730
4731     switch(arg & WINED3DTA_SELECTMASK) {
4732         case WINED3DTA_DIFFUSE:
4733             ret = "fragment.color.primary"; break;
4734
4735         case WINED3DTA_CURRENT:
4736             if(stage == 0) ret = "fragment.color.primary";
4737             else ret = "ret";
4738             break;
4739
4740         case WINED3DTA_TEXTURE:
4741             switch(stage) {
4742                 case 0: ret = "tex0"; break;
4743                 case 1: ret = "tex1"; break;
4744                 case 2: ret = "tex2"; break;
4745                 case 3: ret = "tex3"; break;
4746                 case 4: ret = "tex4"; break;
4747                 case 5: ret = "tex5"; break;
4748                 case 6: ret = "tex6"; break;
4749                 case 7: ret = "tex7"; break;
4750                 default: ret = "unknown texture";
4751             }
4752             break;
4753
4754         case WINED3DTA_TFACTOR:
4755             ret = "tfactor"; break;
4756
4757         case WINED3DTA_SPECULAR:
4758             ret = "fragment.color.secondary"; break;
4759
4760         case WINED3DTA_TEMP:
4761             ret = "tempreg"; break;
4762
4763         case WINED3DTA_CONSTANT:
4764             FIXME("Implement perstage constants\n");
4765             switch(stage) {
4766                 case 0: ret = "const0"; break;
4767                 case 1: ret = "const1"; break;
4768                 case 2: ret = "const2"; break;
4769                 case 3: ret = "const3"; break;
4770                 case 4: ret = "const4"; break;
4771                 case 5: ret = "const5"; break;
4772                 case 6: ret = "const6"; break;
4773                 case 7: ret = "const7"; break;
4774                 default: ret = "unknown constant";
4775             }
4776             break;
4777
4778         default:
4779             return "unknown";
4780     }
4781
4782     if(arg & WINED3DTA_COMPLEMENT) {
4783         shader_addline(buffer, "SUB arg%u, const.x, %s;\n", argnum, ret);
4784         if(argnum == 0) ret = "arg0";
4785         if(argnum == 1) ret = "arg1";
4786         if(argnum == 2) ret = "arg2";
4787     }
4788     if(arg & WINED3DTA_ALPHAREPLICATE) {
4789         shader_addline(buffer, "MOV arg%u, %s.w;\n", argnum, ret);
4790         if(argnum == 0) ret = "arg0";
4791         if(argnum == 1) ret = "arg1";
4792         if(argnum == 2) ret = "arg2";
4793     }
4794     return ret;
4795 }
4796
4797 static void gen_ffp_instr(SHADER_BUFFER *buffer, unsigned int stage, BOOL color, BOOL alpha,
4798                           DWORD dst, DWORD op, DWORD dw_arg0, DWORD dw_arg1, DWORD dw_arg2) {
4799     const char *dstmask, *dstreg, *arg0, *arg1, *arg2;
4800     unsigned int mul = 1;
4801     BOOL mul_final_dest = FALSE;
4802
4803     if(color && alpha) dstmask = "";
4804     else if(color) dstmask = ".xyz";
4805     else dstmask = ".w";
4806
4807     if(dst == tempreg) dstreg = "tempreg";
4808     else dstreg = "ret";
4809
4810     arg0 = get_argreg(buffer, 0, stage, dw_arg0);
4811     arg1 = get_argreg(buffer, 1, stage, dw_arg1);
4812     arg2 = get_argreg(buffer, 2, stage, dw_arg2);
4813
4814     switch(op) {
4815         case WINED3DTOP_DISABLE:
4816             if(stage == 0) shader_addline(buffer, "MOV %s%s, fragment.color.primary;\n", dstreg, dstmask);
4817             break;
4818
4819         case WINED3DTOP_SELECTARG2:
4820             arg1 = arg2;
4821         case WINED3DTOP_SELECTARG1:
4822             shader_addline(buffer, "MOV %s%s, %s;\n", dstreg, dstmask, arg1);
4823             break;
4824
4825         case WINED3DTOP_MODULATE4X:
4826             mul = 2;
4827         case WINED3DTOP_MODULATE2X:
4828             mul *= 2;
4829             if(strcmp(dstreg, "result.color") == 0) {
4830                 dstreg = "ret";
4831                 mul_final_dest = TRUE;
4832             }
4833         case WINED3DTOP_MODULATE:
4834             shader_addline(buffer, "MUL %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
4835             break;
4836
4837         case WINED3DTOP_ADDSIGNED2X:
4838             mul = 2;
4839             if(strcmp(dstreg, "result.color") == 0) {
4840                 dstreg = "ret";
4841                 mul_final_dest = TRUE;
4842             }
4843         case WINED3DTOP_ADDSIGNED:
4844             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
4845             arg2 = "arg2";
4846         case WINED3DTOP_ADD:
4847             shader_addline(buffer, "ADD_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
4848             break;
4849
4850         case WINED3DTOP_SUBTRACT:
4851             shader_addline(buffer, "SUB_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
4852             break;
4853
4854         case WINED3DTOP_ADDSMOOTH:
4855             shader_addline(buffer, "SUB arg1, const.x, %s;\n", arg1);
4856             shader_addline(buffer, "MAD_SAT %s%s, arg1, %s, %s;\n", dstreg, dstmask, arg2, arg1);
4857             break;
4858
4859         case WINED3DTOP_BLENDCURRENTALPHA:
4860             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_CURRENT);
4861             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
4862             break;
4863         case WINED3DTOP_BLENDFACTORALPHA:
4864             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TFACTOR);
4865             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
4866             break;
4867         case WINED3DTOP_BLENDTEXTUREALPHA:
4868             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
4869             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
4870             break;
4871         case WINED3DTOP_BLENDDIFFUSEALPHA:
4872             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_DIFFUSE);
4873             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
4874             break;
4875
4876         case WINED3DTOP_BLENDTEXTUREALPHAPM:
4877             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
4878             shader_addline(buffer, "SUB arg0.w, const.x, %s.w;\n", arg0);
4879             shader_addline(buffer, "MAD_SAT %s%s, %s, arg0.w, %s;\n", dstreg, dstmask, arg2, arg1);
4880             break;
4881
4882         /* D3DTOP_PREMODULATE ???? */
4883
4884         case WINED3DTOP_MODULATEINVALPHA_ADDCOLOR:
4885             shader_addline(buffer, "SUB arg0.w, const.x, %s;\n", arg1);
4886             shader_addline(buffer, "MAD_SAT %s%s, arg0.w, %s, %s;\n", dstreg, dstmask, arg2, arg1);
4887             break;
4888         case WINED3DTOP_MODULATEALPHA_ADDCOLOR:
4889             shader_addline(buffer, "MAD_SAT %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg1);
4890             break;
4891         case WINED3DTOP_MODULATEINVCOLOR_ADDALPHA:
4892             shader_addline(buffer, "SUB arg0, const.x, %s;\n", arg1);
4893             shader_addline(buffer, "MAD_SAT %s%s, arg0, %s, %s.w;\n", dstreg, dstmask, arg2, arg1);
4894             break;
4895         case WINED3DTOP_MODULATECOLOR_ADDALPHA:
4896             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s.w;\n", dstreg, dstmask, arg1, arg2, arg1);
4897             break;
4898
4899         case WINED3DTOP_DOTPRODUCT3:
4900             mul = 4;
4901             if(strcmp(dstreg, "result.color") == 0) {
4902                 dstreg = "ret";
4903                 mul_final_dest = TRUE;
4904             }
4905             shader_addline(buffer, "SUB arg1, %s, const.w;\n", arg1);
4906             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
4907             shader_addline(buffer, "DP3_SAT %s%s, arg1, arg2;\n", dstreg, dstmask);
4908             break;
4909
4910         case WINED3DTOP_MULTIPLYADD:
4911             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg0);
4912             break;
4913
4914         case WINED3DTOP_LERP:
4915             /* The msdn is not quite right here */
4916             shader_addline(buffer, "LRP %s%s, %s, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
4917             break;
4918
4919         case WINED3DTOP_BUMPENVMAP:
4920         case WINED3DTOP_BUMPENVMAPLUMINANCE:
4921             /* Those are handled in the first pass of the shader(generation pass 1 and 2) already */
4922             break;
4923
4924         default:
4925             FIXME("Unhandled texture op %08x\n", op);
4926     }
4927
4928     if(mul == 2) {
4929         shader_addline(buffer, "MUL_SAT %s%s, %s, const.y;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
4930     } else if(mul == 4) {
4931         shader_addline(buffer, "MUL_SAT %s%s, %s, const.z;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
4932     }
4933 }
4934
4935 /* The stateblock is passed for GLINFO_LOCATION */
4936 static GLuint gen_arbfp_ffp_shader(const struct ffp_frag_settings *settings, IWineD3DStateBlockImpl *stateblock)
4937 {
4938     unsigned int stage;
4939     SHADER_BUFFER buffer;
4940     BOOL tex_read[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
4941     BOOL bump_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
4942     BOOL luminance_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
4943     const char *textype;
4944     const char *instr, *sat;
4945     char colorcor_dst[8];
4946     GLuint ret;
4947     DWORD arg0, arg1, arg2;
4948     BOOL tempreg_used = FALSE, tfactor_used = FALSE;
4949     BOOL op_equal;
4950     const char *final_combiner_src = "ret";
4951
4952     /* Find out which textures are read */
4953     for(stage = 0; stage < MAX_TEXTURES; stage++) {
4954         if(settings->op[stage].cop == WINED3DTOP_DISABLE) break;
4955         arg0 = settings->op[stage].carg0 & WINED3DTA_SELECTMASK;
4956         arg1 = settings->op[stage].carg1 & WINED3DTA_SELECTMASK;
4957         arg2 = settings->op[stage].carg2 & WINED3DTA_SELECTMASK;
4958         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
4959         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
4960         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
4961
4962         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHA) tex_read[stage] = TRUE;
4963         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHAPM) tex_read[stage] = TRUE;
4964         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP) {
4965             bump_used[stage] = TRUE;
4966             tex_read[stage] = TRUE;
4967         }
4968         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
4969             bump_used[stage] = TRUE;
4970             tex_read[stage] = TRUE;
4971             luminance_used[stage] = TRUE;
4972         } else if(settings->op[stage].cop == WINED3DTOP_BLENDFACTORALPHA) {
4973             tfactor_used = TRUE;
4974         }
4975
4976         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
4977             tfactor_used = TRUE;
4978         }
4979
4980         if(settings->op[stage].dst == tempreg) tempreg_used = TRUE;
4981         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
4982             tempreg_used = TRUE;
4983         }
4984
4985         if(settings->op[stage].aop == WINED3DTOP_DISABLE) continue;
4986         arg0 = settings->op[stage].aarg0 & WINED3DTA_SELECTMASK;
4987         arg1 = settings->op[stage].aarg1 & WINED3DTA_SELECTMASK;
4988         arg2 = settings->op[stage].aarg2 & WINED3DTA_SELECTMASK;
4989         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
4990         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
4991         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
4992
4993         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
4994             tempreg_used = TRUE;
4995         }
4996         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
4997             tfactor_used = TRUE;
4998         }
4999     }
5000
5001     /* Shader header */
5002     shader_buffer_init(&buffer);
5003
5004     shader_addline(&buffer, "!!ARBfp1.0\n");
5005
5006     switch(settings->fog) {
5007         case FOG_OFF:                                                         break;
5008         case FOG_LINEAR: shader_addline(&buffer, "OPTION ARB_fog_linear;\n"); break;
5009         case FOG_EXP:    shader_addline(&buffer, "OPTION ARB_fog_exp;\n");    break;
5010         case FOG_EXP2:   shader_addline(&buffer, "OPTION ARB_fog_exp2;\n");   break;
5011         default: FIXME("Unexpected fog setting %d\n", settings->fog);
5012     }
5013
5014     shader_addline(&buffer, "PARAM const = {1, 2, 4, 0.5};\n");
5015     shader_addline(&buffer, "TEMP TMP;\n");
5016     shader_addline(&buffer, "TEMP ret;\n");
5017     if(tempreg_used || settings->sRGB_write) shader_addline(&buffer, "TEMP tempreg;\n");
5018     shader_addline(&buffer, "TEMP arg0;\n");
5019     shader_addline(&buffer, "TEMP arg1;\n");
5020     shader_addline(&buffer, "TEMP arg2;\n");
5021     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5022         if(!tex_read[stage]) continue;
5023         shader_addline(&buffer, "TEMP tex%u;\n", stage);
5024         if(!bump_used[stage]) continue;
5025         shader_addline(&buffer, "PARAM bumpmat%u = program.env[%u];\n", stage, ARB_FFP_CONST_BUMPMAT(stage));
5026         if(!luminance_used[stage]) continue;
5027         shader_addline(&buffer, "PARAM luminance%u = program.env[%u];\n", stage, ARB_FFP_CONST_LUMINANCE(stage));
5028     }
5029     if(tfactor_used) {
5030         shader_addline(&buffer, "PARAM tfactor = program.env[%u];\n", ARB_FFP_CONST_TFACTOR);
5031     }
5032         shader_addline(&buffer, "PARAM specular_enable = program.env[%u];\n", ARB_FFP_CONST_SPECULAR_ENABLE);
5033
5034     if(settings->sRGB_write) {
5035         shader_addline(&buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
5036                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
5037         shader_addline(&buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
5038                        srgb_sub_high, 0.0, 0.0, 0.0);
5039     }
5040
5041     /* Generate texture sampling instructions) */
5042     for(stage = 0; stage < MAX_TEXTURES && settings->op[stage].cop != WINED3DTOP_DISABLE; stage++) {
5043         if(!tex_read[stage]) continue;
5044
5045         switch(settings->op[stage].tex_type) {
5046             case tex_1d:                    textype = "1D";     break;
5047             case tex_2d:                    textype = "2D";     break;
5048             case tex_3d:                    textype = "3D";     break;
5049             case tex_cube:                  textype = "CUBE";   break;
5050             case tex_rect:                  textype = "RECT";   break;
5051             default: textype = "unexpected_textype";   break;
5052         }
5053
5054         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP ||
5055            settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5056             sat = "";
5057         } else {
5058             sat = "_SAT";
5059         }
5060
5061         if(settings->op[stage].projected == proj_none) {
5062             instr = "TEX";
5063         } else if(settings->op[stage].projected == proj_count4 ||
5064                   settings->op[stage].projected == proj_count3) {
5065             instr = "TXP";
5066         } else {
5067             FIXME("Unexpected projection mode %d\n", settings->op[stage].projected);
5068             instr = "TXP";
5069         }
5070
5071         if(stage > 0 &&
5072            (settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAP ||
5073             settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE)) {
5074             shader_addline(&buffer, "SWZ arg1, bumpmat%u, x, z, 0, 0;\n", stage - 1);
5075             shader_addline(&buffer, "DP3 ret.x, arg1, tex%u;\n", stage - 1);
5076             shader_addline(&buffer, "SWZ arg1, bumpmat%u, y, w, 0, 0;\n", stage - 1);
5077             shader_addline(&buffer, "DP3 ret.y, arg1, tex%u;\n", stage - 1);
5078
5079             /* with projective textures, texbem only divides the static texture coord, not the displacement,
5080              * so multiply the displacement with the dividing parameter before passing it to TXP
5081              */
5082             if (settings->op[stage].projected != proj_none) {
5083                 if(settings->op[stage].projected == proj_count4) {
5084                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].w;\n", stage);
5085                     shader_addline(&buffer, "MUL ret.xyz, ret, fragment.texcoord[%u].w, fragment.texcoord[%u];\n", stage, stage);
5086                 } else {
5087                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].z;\n", stage);
5088                     shader_addline(&buffer, "MAD ret.xyz, ret, fragment.texcoord[%u].z, fragment.texcoord[%u];\n", stage, stage);
5089                 }
5090             } else {
5091                 shader_addline(&buffer, "ADD ret, ret, fragment.texcoord[%u];\n", stage);
5092             }
5093
5094             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
5095                            instr, sat, stage, stage, textype);
5096             if(settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5097                 shader_addline(&buffer, "MAD_SAT ret.x, tex%u.z, luminance%u.x, luminance%u.y;\n",
5098                                stage - 1, stage - 1, stage - 1);
5099                 shader_addline(&buffer, "MUL tex%u, tex%u, ret.x;\n", stage, stage);
5100             }
5101         } else if(settings->op[stage].projected == proj_count3) {
5102             shader_addline(&buffer, "MOV ret, fragment.texcoord[%u];\n", stage);
5103             shader_addline(&buffer, "MOV ret.w, ret.z;\n");
5104             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
5105                             instr, sat, stage, stage, textype);
5106         } else {
5107             shader_addline(&buffer, "%s%s tex%u, fragment.texcoord[%u], texture[%u], %s;\n",
5108                             instr, sat, stage, stage, stage, textype);
5109         }
5110
5111         sprintf(colorcor_dst, "tex%u", stage);
5112         gen_color_correction(&buffer, colorcor_dst, WINED3DSP_WRITEMASK_ALL, "const.x", "const.y",
5113                 settings->op[stage].color_fixup);
5114     }
5115
5116     /* Generate the main shader */
5117     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5118         if(settings->op[stage].cop == WINED3DTOP_DISABLE) {
5119             if(stage == 0) {
5120                 final_combiner_src = "fragment.color.primary";
5121             }
5122             break;
5123         }
5124
5125         if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
5126            settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
5127             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg1;
5128         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
5129                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
5130             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg2;
5131         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
5132                   settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
5133             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg1;
5134         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
5135                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
5136             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg2;
5137         } else {
5138             op_equal = settings->op[stage].aop   == settings->op[stage].cop &&
5139                        settings->op[stage].carg0 == settings->op[stage].aarg0 &&
5140                        settings->op[stage].carg1 == settings->op[stage].aarg1 &&
5141                        settings->op[stage].carg2 == settings->op[stage].aarg2;
5142         }
5143
5144         if(settings->op[stage].aop == WINED3DTOP_DISABLE) {
5145             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
5146                           settings->op[stage].cop, settings->op[stage].carg0,
5147                           settings->op[stage].carg1, settings->op[stage].carg2);
5148             if(stage == 0) {
5149                 shader_addline(&buffer, "MOV ret.w, fragment.color.primary.w;\n");
5150             }
5151         } else if(op_equal) {
5152             gen_ffp_instr(&buffer, stage, TRUE, TRUE, settings->op[stage].dst,
5153                           settings->op[stage].cop, settings->op[stage].carg0,
5154                           settings->op[stage].carg1, settings->op[stage].carg2);
5155         } else {
5156             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
5157                           settings->op[stage].cop, settings->op[stage].carg0,
5158                           settings->op[stage].carg1, settings->op[stage].carg2);
5159             gen_ffp_instr(&buffer, stage, FALSE, TRUE, settings->op[stage].dst,
5160                           settings->op[stage].aop, settings->op[stage].aarg0,
5161                           settings->op[stage].aarg1, settings->op[stage].aarg2);
5162         }
5163     }
5164
5165     if(settings->sRGB_write) {
5166         shader_addline(&buffer, "MAD ret, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
5167         arbfp_add_sRGB_correction(&buffer, "ret", "arg0", "arg1", "arg2", "tempreg", FALSE);
5168     } else {
5169         shader_addline(&buffer, "MAD result.color, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
5170     }
5171
5172     /* Footer */
5173     shader_addline(&buffer, "END\n");
5174
5175     /* Generate the shader */
5176     GL_EXTCALL(glGenProgramsARB(1, &ret));
5177     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, ret));
5178     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(buffer.buffer), buffer.buffer));
5179
5180     if (glGetError() == GL_INVALID_OPERATION) {
5181         GLint pos;
5182         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
5183         FIXME("Fragment program error at position %d: %s\n", pos,
5184               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
5185     }
5186     shader_buffer_free(&buffer);
5187     return ret;
5188 }
5189
5190 static void fragment_prog_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
5191     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5192     struct shader_arb_priv *priv = device->fragment_priv;
5193     BOOL use_pshader = use_ps(stateblock);
5194     BOOL use_vshader = use_vs(stateblock);
5195     struct ffp_frag_settings settings;
5196     const struct arbfp_ffp_desc *desc;
5197     unsigned int i;
5198
5199     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
5200
5201     if(isStateDirty(context, STATE_RENDER(WINED3DRS_FOGENABLE))) {
5202         if(!use_pshader && device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
5203             /* Reload fixed function constants since they collide with the pixel shader constants */
5204             for(i = 0; i < MAX_TEXTURES; i++) {
5205                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
5206             }
5207             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
5208             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
5209         } else if(use_pshader && !isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
5210             device->shader_backend->shader_select((IWineD3DDevice *)stateblock->wineD3DDevice, use_pshader, use_vshader);
5211         }
5212         return;
5213     }
5214
5215     if(!use_pshader) {
5216         /* Find or create a shader implementing the fixed function pipeline settings, then activate it */
5217         gen_ffp_frag_op(stateblock, &settings, FALSE);
5218         desc = (const struct arbfp_ffp_desc *)find_ffp_frag_shader(&priv->fragment_shaders, &settings);
5219         if(!desc) {
5220             struct arbfp_ffp_desc *new_desc = HeapAlloc(GetProcessHeap(), 0, sizeof(*new_desc));
5221             if (!new_desc)
5222             {
5223                 ERR("Out of memory\n");
5224                 return;
5225             }
5226             new_desc->num_textures_used = 0;
5227             for(i = 0; i < GL_LIMITS(texture_stages); i++) {
5228                 if(settings.op[i].cop == WINED3DTOP_DISABLE) break;
5229                 new_desc->num_textures_used = i;
5230             }
5231
5232             memcpy(&new_desc->parent.settings, &settings, sizeof(settings));
5233             new_desc->shader = gen_arbfp_ffp_shader(&settings, stateblock);
5234             add_ffp_frag_shader(&priv->fragment_shaders, &new_desc->parent);
5235             TRACE("Allocated fixed function replacement shader descriptor %p\n", new_desc);
5236             desc = new_desc;
5237         }
5238
5239         /* Now activate the replacement program. GL_FRAGMENT_PROGRAM_ARB is already active(however, note the
5240          * comment above the shader_select call below). If e.g. GLSL is active, the shader_select call will
5241          * deactivate it.
5242          */
5243         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader));
5244         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader)");
5245         priv->current_fprogram_id = desc->shader;
5246
5247         if(device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
5248             /* Reload fixed function constants since they collide with the pixel shader constants */
5249             for(i = 0; i < MAX_TEXTURES; i++) {
5250                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
5251             }
5252             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
5253             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
5254         }
5255         context->last_was_pshader = FALSE;
5256     } else {
5257         context->last_was_pshader = TRUE;
5258     }
5259
5260     /* Finally, select the shader. If a pixel shader is used, it will be set and enabled by the shader backend.
5261      * If this shader backend is arbfp(most likely), then it will simply overwrite the last fixed function replace-
5262      * ment shader. If the shader backend is not ARB, it currently is important that the opengl implementation
5263      * type overwrites GL_ARB_fragment_program. This is currently the case with GLSL. If we really want to use
5264      * atifs or nvrc pixel shaders with arb fragment programs we'd have to disable GL_FRAGMENT_PROGRAM_ARB here
5265      *
5266      * Don't call shader_select if the vertex shader is dirty, because it will be called later on by the vertex
5267      * shader handler
5268      */
5269     if(!isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
5270         device->shader_backend->shader_select((IWineD3DDevice *)stateblock->wineD3DDevice, use_pshader, use_vshader);
5271
5272         if (!isStateDirty(context, STATE_VERTEXSHADERCONSTANT) && (use_vshader || use_pshader)) {
5273             device->StateTable[STATE_VERTEXSHADERCONSTANT].apply(STATE_VERTEXSHADERCONSTANT, stateblock, context);
5274         }
5275     }
5276     if(use_pshader) {
5277         device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
5278     }
5279 }
5280
5281 /* We can't link the fog states to the fragment state directly since the vertex pipeline links them
5282  * to FOGENABLE. A different linking in different pipeline parts can't be expressed in the combined
5283  * state table, so we need to handle that with a forwarding function. The other invisible side effect
5284  * is that changing the fog start and fog end(which links to FOGENABLE in vertex) results in the
5285  * fragment_prog_arbfp function being called because FOGENABLE is dirty, which calls this function here
5286  */
5287 static void state_arbfp_fog(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
5288     enum fogsource new_source;
5289
5290     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
5291
5292     if(!isStateDirty(context, STATE_PIXELSHADER)) {
5293         fragment_prog_arbfp(state, stateblock, context);
5294     }
5295
5296     if(!stateblock->renderState[WINED3DRS_FOGENABLE]) return;
5297
5298     if(stateblock->renderState[WINED3DRS_FOGTABLEMODE] == WINED3DFOG_NONE) {
5299         if(use_vs(stateblock)) {
5300             new_source = FOGSOURCE_VS;
5301         } else {
5302             if(stateblock->renderState[WINED3DRS_FOGVERTEXMODE] == WINED3DFOG_NONE || context->last_was_rhw) {
5303                 new_source = FOGSOURCE_COORD;
5304             } else {
5305                 new_source = FOGSOURCE_FFP;
5306             }
5307         }
5308     } else {
5309         new_source = FOGSOURCE_FFP;
5310     }
5311     if(new_source != context->fog_source) {
5312         context->fog_source = new_source;
5313         state_fogstartend(STATE_RENDER(WINED3DRS_FOGSTART), stateblock, context);
5314     }
5315 }
5316
5317 static void textransform(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
5318     if(!isStateDirty(context, STATE_PIXELSHADER)) {
5319         fragment_prog_arbfp(state, stateblock, context);
5320     }
5321 }
5322
5323 #undef GLINFO_LOCATION
5324
5325 static const struct StateEntryTemplate arbfp_fragmentstate_template[] = {
5326     {STATE_RENDER(WINED3DRS_TEXTUREFACTOR),               { STATE_RENDER(WINED3DRS_TEXTUREFACTOR),              state_texfactor_arbfp   }, WINED3D_GL_EXT_NONE             },
5327     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5328     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5329     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5330     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5331     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5332     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5333     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5334     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5335     {STATE_TEXTURESTAGE(0, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5336     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5337     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5338     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5339     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5340     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5341     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5342     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5343     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5344     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5345     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5346     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5347     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5348     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5349     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5350     {STATE_TEXTURESTAGE(1, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5351     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5352     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5353     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5354     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5355     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5356     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5357     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5358     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5359     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5360     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5361     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5362     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5363     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5364     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5365     {STATE_TEXTURESTAGE(2, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5366     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5367     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5368     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5369     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5370     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5371     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5372     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5373     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5374     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5375     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5376     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5377     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5378     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5379     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5380     {STATE_TEXTURESTAGE(3, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5381     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5382     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5383     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5384     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5385     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5386     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5387     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5388     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5389     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5390     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5391     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5392     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5393     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5394     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5395     {STATE_TEXTURESTAGE(4, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5396     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5397     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5398     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5399     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5400     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5401     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5402     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5403     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5404     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5405     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5406     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5407     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5408     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5409     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5410     {STATE_TEXTURESTAGE(5, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5411     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5412     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5413     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5414     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5415     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5416     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5417     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5418     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5419     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5420     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5421     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5422     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5423     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5424     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5425     {STATE_TEXTURESTAGE(6, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5426     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5427     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5428     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5429     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5430     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5431     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5432     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5433     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5434     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5435     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5436     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5437     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5438     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5439     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5440     {STATE_TEXTURESTAGE(7, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5441     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5442     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5443     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5444     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5445     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5446     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5447     {STATE_SAMPLER(0),                                    { STATE_SAMPLER(0),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5448     {STATE_SAMPLER(1),                                    { STATE_SAMPLER(1),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5449     {STATE_SAMPLER(2),                                    { STATE_SAMPLER(2),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5450     {STATE_SAMPLER(3),                                    { STATE_SAMPLER(3),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5451     {STATE_SAMPLER(4),                                    { STATE_SAMPLER(4),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5452     {STATE_SAMPLER(5),                                    { STATE_SAMPLER(5),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5453     {STATE_SAMPLER(6),                                    { STATE_SAMPLER(6),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5454     {STATE_SAMPLER(7),                                    { STATE_SAMPLER(7),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5455     {STATE_PIXELSHADER,                                   { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5456     {STATE_RENDER(WINED3DRS_FOGENABLE),                   { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
5457     {STATE_RENDER(WINED3DRS_FOGTABLEMODE),                { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
5458     {STATE_RENDER(WINED3DRS_FOGVERTEXMODE),               { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
5459     {STATE_RENDER(WINED3DRS_FOGSTART),                    { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
5460     {STATE_RENDER(WINED3DRS_FOGEND),                      { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
5461     {STATE_RENDER(WINED3DRS_SRGBWRITEENABLE),             { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5462     {STATE_RENDER(WINED3DRS_FOGCOLOR),                    { STATE_RENDER(WINED3DRS_FOGCOLOR),                   state_fogcolor          }, WINED3D_GL_EXT_NONE             },
5463     {STATE_RENDER(WINED3DRS_FOGDENSITY),                  { STATE_RENDER(WINED3DRS_FOGDENSITY),                 state_fogdensity        }, WINED3D_GL_EXT_NONE             },
5464     {STATE_TEXTURESTAGE(0,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(0, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5465     {STATE_TEXTURESTAGE(1,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(1, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5466     {STATE_TEXTURESTAGE(2,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(2, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5467     {STATE_TEXTURESTAGE(3,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(3, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5468     {STATE_TEXTURESTAGE(4,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(4, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5469     {STATE_TEXTURESTAGE(5,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(5, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5470     {STATE_TEXTURESTAGE(6,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(6, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5471     {STATE_TEXTURESTAGE(7,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(7, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5472     {STATE_RENDER(WINED3DRS_SPECULARENABLE),              { STATE_RENDER(WINED3DRS_SPECULARENABLE),             state_arb_specularenable}, WINED3D_GL_EXT_NONE             },
5473     {0 /* Terminate */,                                   { 0,                                                  0                       }, WINED3D_GL_EXT_NONE             },
5474 };
5475
5476 const struct fragment_pipeline arbfp_fragment_pipeline = {
5477     arbfp_enable,
5478     arbfp_get_caps,
5479     arbfp_alloc,
5480     arbfp_free,
5481     shader_arb_color_fixup_supported,
5482     arbfp_fragmentstate_template,
5483     TRUE /* We can disable projected textures */
5484 };
5485
5486 #define GLINFO_LOCATION device->adapter->gl_info
5487
5488 struct arbfp_blit_priv {
5489     GLenum yuy2_rect_shader, yuy2_2d_shader;
5490     GLenum uyvy_rect_shader, uyvy_2d_shader;
5491     GLenum yv12_rect_shader, yv12_2d_shader;
5492 };
5493
5494 static HRESULT arbfp_blit_alloc(IWineD3DDevice *iface) {
5495     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
5496     device->blit_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct arbfp_blit_priv));
5497     if(!device->blit_priv) {
5498         ERR("Out of memory\n");
5499         return E_OUTOFMEMORY;
5500     }
5501     return WINED3D_OK;
5502 }
5503 static void arbfp_blit_free(IWineD3DDevice *iface) {
5504     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
5505     struct arbfp_blit_priv *priv = device->blit_priv;
5506
5507     ENTER_GL();
5508     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_rect_shader));
5509     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_2d_shader));
5510     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_rect_shader));
5511     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_2d_shader));
5512     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_rect_shader));
5513     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_2d_shader));
5514     checkGLcall("Delete yuv programs\n");
5515     LEAVE_GL();
5516 }
5517
5518 static BOOL gen_planar_yuv_read(SHADER_BUFFER *buffer, enum yuv_fixup yuv_fixup, GLenum textype, char *luminance)
5519 {
5520     char chroma;
5521     const char *tex, *texinstr;
5522
5523     if (yuv_fixup == YUV_FIXUP_UYVY) {
5524         chroma = 'x';
5525         *luminance = 'w';
5526     } else {
5527         chroma = 'w';
5528         *luminance = 'x';
5529     }
5530     switch(textype) {
5531         case GL_TEXTURE_2D:             tex = "2D";     texinstr = "TXP"; break;
5532         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   texinstr = "TEX"; break;
5533         default:
5534             /* This is more tricky than just replacing the texture type - we have to navigate
5535              * properly in the texture to find the correct chroma values
5536              */
5537             FIXME("Implement yuv correction for non-2d, non-rect textures\n");
5538             return FALSE;
5539     }
5540
5541     /* First we have to read the chroma values. This means we need at least two pixels(no filtering),
5542      * or 4 pixels(with filtering). To get the unmodified chromas, we have to rid ourselves of the
5543      * filtering when we sample the texture.
5544      *
5545      * These are the rules for reading the chroma:
5546      *
5547      * Even pixel: Cr
5548      * Even pixel: U
5549      * Odd pixel: V
5550      *
5551      * So we have to get the sampling x position in non-normalized coordinates in integers
5552      */
5553     if(textype != GL_TEXTURE_RECTANGLE_ARB) {
5554         shader_addline(buffer, "MUL texcrd.xy, fragment.texcoord[0], size.x;\n");
5555         shader_addline(buffer, "MOV texcrd.w, size.x;\n");
5556     } else {
5557         shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
5558     }
5559     /* We must not allow filtering between pixel x and x+1, this would mix U and V
5560      * Vertical filtering is ok. However, bear in mind that the pixel center is at
5561      * 0.5, so add 0.5.
5562      */
5563     shader_addline(buffer, "FLR texcrd.x, texcrd.x;\n");
5564     shader_addline(buffer, "ADD texcrd.x, texcrd.x, coef.y;\n");
5565
5566     /* Divide the x coordinate by 0.5 and get the fraction. This gives 0.25 and 0.75 for the
5567      * even and odd pixels respectively
5568      */
5569     shader_addline(buffer, "MUL texcrd2, texcrd, coef.y;\n");
5570     shader_addline(buffer, "FRC texcrd2, texcrd2;\n");
5571
5572     /* Sample Pixel 1 */
5573     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
5574
5575     /* Put the value into either of the chroma values */
5576     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
5577     shader_addline(buffer, "MUL chroma.x, luminance.%c, temp.x;\n", chroma);
5578     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
5579     shader_addline(buffer, "MUL chroma.y, luminance.%c, temp.x;\n", chroma);
5580
5581     /* Sample pixel 2. If we read an even pixel(SLT above returned 1), sample
5582      * the pixel right to the current one. Otherwise, sample the left pixel.
5583      * Bias and scale the SLT result to -1;1 and add it to the texcrd.x.
5584      */
5585     shader_addline(buffer, "MAD temp.x, temp.x, coef.z, -coef.x;\n");
5586     shader_addline(buffer, "ADD texcrd.x, texcrd, temp.x;\n");
5587     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
5588
5589     /* Put the value into the other chroma */
5590     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
5591     shader_addline(buffer, "MAD chroma.y, luminance.%c, temp.x, chroma.y;\n", chroma);
5592     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
5593     shader_addline(buffer, "MAD chroma.x, luminance.%c, temp.x, chroma.x;\n", chroma);
5594
5595     /* TODO: If filtering is enabled, sample a 2nd pair of pixels left or right of
5596      * the current one and lerp the two U and V values
5597      */
5598
5599     /* This gives the correctly filtered luminance value */
5600     shader_addline(buffer, "TEX luminance, fragment.texcoord[0], texture[0], %s;\n", tex);
5601
5602     return TRUE;
5603 }
5604
5605 static BOOL gen_yv12_read(SHADER_BUFFER *buffer, GLenum textype, char *luminance)
5606 {
5607     const char *tex;
5608
5609     switch(textype) {
5610         case GL_TEXTURE_2D:             tex = "2D";     break;
5611         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   break;
5612         default:
5613             FIXME("Implement yv12 correction for non-2d, non-rect textures\n");
5614             return FALSE;
5615     }
5616
5617     /* YV12 surfaces contain a WxH sized luminance plane, followed by a (W/2)x(H/2)
5618      * V and a (W/2)x(H/2) U plane, each with 8 bit per pixel. So the effective
5619      * bitdepth is 12 bits per pixel. Since the U and V planes have only half the
5620      * pitch of the luminance plane, the packing into the gl texture is a bit
5621      * unfortunate. If the whole texture is interpreted as luminance data it looks
5622      * approximately like this:
5623      *
5624      *        +----------------------------------+----
5625      *        |                                  |
5626      *        |                                  |
5627      *        |                                  |
5628      *        |                                  |
5629      *        |                                  |   2
5630      *        |            LUMINANCE             |   -
5631      *        |                                  |   3
5632      *        |                                  |
5633      *        |                                  |
5634      *        |                                  |
5635      *        |                                  |
5636      *        +----------------+-----------------+----
5637      *        |                |                 |
5638      *        |  U even rows   |  U odd rows     |
5639      *        |                |                 |   1
5640      *        +----------------+------------------   -
5641      *        |                |                 |   3
5642      *        |  V even rows   |  V odd rows     |
5643      *        |                |                 |
5644      *        +----------------+-----------------+----
5645      *        |                |                 |
5646      *        |     0.5        |       0.5       |
5647      *
5648      * So it appears as if there are 4 chroma images, but in fact the odd rows
5649      * in the chroma images are in the same row as the even ones. So its is
5650      * kinda tricky to read
5651      *
5652      * When reading from rectangle textures, keep in mind that the input y coordinates
5653      * go from 0 to d3d_height, whereas the opengl texture height is 1.5 * d3d_height
5654      */
5655     shader_addline(buffer, "PARAM yv12_coef = {%f, %f, %f, %f};\n",
5656                    2.0 / 3.0, 1.0 / 6.0, (2.0 / 3.0) + (1.0 / 6.0), 1.0 / 3.0);
5657
5658     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
5659     /* the chroma planes have only half the width */
5660     shader_addline(buffer, "MUL texcrd.x, texcrd.x, coef.y;\n");
5661
5662     /* The first value is between 2/3 and 5/6th of the texture's height, so scale+bias
5663      * the coordinate. Also read the right side of the image when reading odd lines
5664      *
5665      * Don't forget to clamp the y values in into the range, otherwise we'll get filtering
5666      * bleeding
5667      */
5668     if(textype == GL_TEXTURE_2D) {
5669
5670         shader_addline(buffer, "RCP chroma.w, size.y;\n");
5671
5672         shader_addline(buffer, "MUL texcrd2.y, texcrd.y, size.y;\n");
5673
5674         shader_addline(buffer, "FLR texcrd2.y, texcrd2.y;\n");
5675         shader_addline(buffer, "MAD texcrd.y, texcrd.y, yv12_coef.y, yv12_coef.x;\n");
5676
5677         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
5678         shader_addline(buffer, "ADD texcrd2.x, texcrd2.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
5679         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
5680         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
5681         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
5682
5683         /* clamp, keep the half pixel origin in mind */
5684         shader_addline(buffer, "MAD temp.y, coef.y, chroma.w, yv12_coef.x;\n");
5685         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
5686         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.z;\n");
5687         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
5688     } else {
5689         /* Read from [size - size+size/4] */
5690         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
5691         shader_addline(buffer, "MAD texcrd.y, texcrd.y, coef.w, size.y;\n");
5692
5693         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
5694         shader_addline(buffer, "ADD texcrd2.x, texcrd.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
5695         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
5696         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
5697         shader_addline(buffer, "MUL texcrd2.x, texcrd2.x, size.x;\n");
5698         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
5699
5700         /* Make sure to read exactly from the pixel center */
5701         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
5702         shader_addline(buffer, "ADD texcrd.y, texcrd.y, coef.y;\n");
5703
5704         /* Clamp */
5705         shader_addline(buffer, "MAD temp.y, size.y, coef.w, size.y;\n");
5706         shader_addline(buffer, "ADD temp.y, temp.y, -coef.y;\n");
5707         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
5708         shader_addline(buffer, "ADD temp.y, size.y, -coef.y;\n");
5709         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
5710     }
5711     /* Read the texture, put the result into the output register */
5712     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
5713     shader_addline(buffer, "MOV chroma.x, temp.w;\n");
5714
5715     /* The other chroma value is 1/6th of the texture lower, from 5/6th to 6/6th
5716      * No need to clamp because we're just reusing the already clamped value from above
5717      */
5718     if(textype == GL_TEXTURE_2D) {
5719         shader_addline(buffer, "ADD texcrd.y, texcrd.y, yv12_coef.y;\n");
5720     } else {
5721         shader_addline(buffer, "MAD texcrd.y, size.y, coef.w, texcrd.y;\n");
5722     }
5723     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
5724     shader_addline(buffer, "MOV chroma.y, temp.w;\n");
5725
5726     /* Sample the luminance value. It is in the top 2/3rd of the texture, so scale the y coordinate.
5727      * Clamp the y coordinate to prevent the chroma values from bleeding into the sampled luminance
5728      * values due to filtering
5729      */
5730     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
5731     if(textype == GL_TEXTURE_2D) {
5732         /* Multiply the y coordinate by 2/3 and clamp it */
5733         shader_addline(buffer, "MUL texcrd.y, texcrd.y, yv12_coef.x;\n");
5734         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.x;\n");
5735         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
5736         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
5737     } else {
5738         /* Reading from texture_rectangles is pretty straightforward, just use the unmodified
5739          * texture coordinate. It is still a good idea to clamp it though, since the opengl texture
5740          * is bigger
5741          */
5742         shader_addline(buffer, "ADD temp.x, size.y, -coef.y;\n");
5743         shader_addline(buffer, "MIN texcrd.y, texcrd.y, size.x;\n");
5744         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
5745     }
5746     *luminance = 'a';
5747
5748     return TRUE;
5749 }
5750
5751 static GLuint gen_yuv_shader(IWineD3DDeviceImpl *device, enum yuv_fixup yuv_fixup, GLenum textype)
5752 {
5753     GLenum shader;
5754     SHADER_BUFFER buffer;
5755     char luminance_component;
5756     struct arbfp_blit_priv *priv = device->blit_priv;
5757
5758     /* Shader header */
5759     shader_buffer_init(&buffer);
5760
5761     ENTER_GL();
5762     GL_EXTCALL(glGenProgramsARB(1, &shader));
5763     checkGLcall("GL_EXTCALL(glGenProgramsARB(1, &shader))");
5764     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
5765     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
5766     LEAVE_GL();
5767     if(!shader) {
5768         shader_buffer_free(&buffer);
5769         return 0;
5770     }
5771
5772     /* The YUY2 and UYVY formats contain two pixels packed into a 32 bit macropixel,
5773      * giving effectively 16 bit per pixel. The color consists of a luminance(Y) and
5774      * two chroma(U and V) values. Each macropixel has two luminance values, one for
5775      * each single pixel it contains, and one U and one V value shared between both
5776      * pixels.
5777      *
5778      * The data is loaded into an A8L8 texture. With YUY2, the luminance component
5779      * contains the luminance and alpha the chroma. With UYVY it is vice versa. Thus
5780      * take the format into account when generating the read swizzles
5781      *
5782      * Reading the Y value is straightforward - just sample the texture. The hardware
5783      * takes care of filtering in the horizontal and vertical direction.
5784      *
5785      * Reading the U and V values is harder. We have to avoid filtering horizontally,
5786      * because that would mix the U and V values of one pixel or two adjacent pixels.
5787      * Thus floor the texture coordinate and add 0.5 to get an unfiltered read,
5788      * regardless of the filtering setting. Vertical filtering works automatically
5789      * though - the U and V values of two rows are mixed nicely.
5790      *
5791      * Appart of avoiding filtering issues, the code has to know which value it just
5792      * read, and where it can find the other one. To determine this, it checks if
5793      * it sampled an even or odd pixel, and shifts the 2nd read accordingly.
5794      *
5795      * Handling horizontal filtering of U and V values requires reading a 2nd pair
5796      * of pixels, extracting U and V and mixing them. This is not implemented yet.
5797      *
5798      * An alternative implementation idea is to load the texture as A8R8G8B8 texture,
5799      * with width / 2. This way one read gives all 3 values, finding U and V is easy
5800      * in an unfiltered situation. Finding the luminance on the other hand requires
5801      * finding out if it is an odd or even pixel. The real drawback of this approach
5802      * is filtering. This would have to be emulated completely in the shader, reading
5803      * up two 2 packed pixels in up to 2 rows and interpolating both horizontally and
5804      * vertically. Beyond that it would require adjustments to the texture handling
5805      * code to deal with the width scaling
5806      */
5807     shader_addline(&buffer, "!!ARBfp1.0\n");
5808     shader_addline(&buffer, "TEMP luminance;\n");
5809     shader_addline(&buffer, "TEMP temp;\n");
5810     shader_addline(&buffer, "TEMP chroma;\n");
5811     shader_addline(&buffer, "TEMP texcrd;\n");
5812     shader_addline(&buffer, "TEMP texcrd2;\n");
5813     shader_addline(&buffer, "PARAM coef = {1.0, 0.5, 2.0, 0.25};\n");
5814     shader_addline(&buffer, "PARAM yuv_coef = {1.403, 0.344, 0.714, 1.770};\n");
5815     shader_addline(&buffer, "PARAM size = program.local[0];\n");
5816
5817     switch (yuv_fixup)
5818     {
5819         case YUV_FIXUP_UYVY:
5820         case YUV_FIXUP_YUY2:
5821             if (!gen_planar_yuv_read(&buffer, yuv_fixup, textype, &luminance_component))
5822             {
5823                 shader_buffer_free(&buffer);
5824                 return 0;
5825             }
5826             break;
5827
5828         case YUV_FIXUP_YV12:
5829             if (!gen_yv12_read(&buffer, textype, &luminance_component))
5830             {
5831                 shader_buffer_free(&buffer);
5832                 return 0;
5833             }
5834             break;
5835
5836         default:
5837             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
5838             shader_buffer_free(&buffer);
5839             return 0;
5840     }
5841
5842     /* Calculate the final result. Formula is taken from
5843      * http://www.fourcc.org/fccyvrgb.php. Note that the chroma
5844      * ranges from -0.5 to 0.5
5845      */
5846     shader_addline(&buffer, "SUB chroma.xy, chroma, coef.y;\n");
5847
5848     shader_addline(&buffer, "MAD result.color.x, chroma.x, yuv_coef.x, luminance.%c;\n", luminance_component);
5849     shader_addline(&buffer, "MAD temp.x, -chroma.y, yuv_coef.y, luminance.%c;\n", luminance_component);
5850     shader_addline(&buffer, "MAD result.color.y, -chroma.x, yuv_coef.z, temp.x;\n");
5851     shader_addline(&buffer, "MAD result.color.z, chroma.y, yuv_coef.w, luminance.%c;\n", luminance_component);
5852     shader_addline(&buffer, "END\n");
5853
5854     ENTER_GL();
5855     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(buffer.buffer), buffer.buffer));
5856
5857     if (glGetError() == GL_INVALID_OPERATION) {
5858         GLint pos;
5859         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
5860         FIXME("Fragment program error at position %d: %s\n", pos,
5861               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
5862     }
5863     shader_buffer_free(&buffer);
5864     LEAVE_GL();
5865
5866     switch (yuv_fixup)
5867     {
5868         case YUV_FIXUP_YUY2:
5869             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yuy2_rect_shader = shader;
5870             else priv->yuy2_2d_shader = shader;
5871             break;
5872
5873         case YUV_FIXUP_UYVY:
5874             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->uyvy_rect_shader = shader;
5875             else priv->uyvy_2d_shader = shader;
5876             break;
5877
5878         case YUV_FIXUP_YV12:
5879             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yv12_rect_shader = shader;
5880             else priv->yv12_2d_shader = shader;
5881             break;
5882     }
5883
5884     return shader;
5885 }
5886
5887 static HRESULT arbfp_blit_set(IWineD3DDevice *iface, const struct GlPixelFormatDesc *format_desc,
5888         GLenum textype, UINT width, UINT height)
5889 {
5890     GLenum shader;
5891     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
5892     float size[4] = {width, height, 1, 1};
5893     struct arbfp_blit_priv *priv = device->blit_priv;
5894     enum yuv_fixup yuv_fixup;
5895
5896     if (!is_yuv_fixup(format_desc->color_fixup))
5897     {
5898         TRACE("Fixup:\n");
5899         dump_color_fixup_desc(format_desc->color_fixup);
5900         /* Don't bother setting up a shader for unconverted formats */
5901         ENTER_GL();
5902         glEnable(textype);
5903         checkGLcall("glEnable(textype)");
5904         LEAVE_GL();
5905         return WINED3D_OK;
5906     }
5907
5908     yuv_fixup = get_yuv_fixup(format_desc->color_fixup);
5909
5910     switch(yuv_fixup)
5911     {
5912         case YUV_FIXUP_YUY2:
5913             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yuy2_rect_shader : priv->yuy2_2d_shader;
5914             break;
5915
5916         case YUV_FIXUP_UYVY:
5917             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->uyvy_rect_shader : priv->uyvy_2d_shader;
5918             break;
5919
5920         case YUV_FIXUP_YV12:
5921             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yv12_rect_shader : priv->yv12_2d_shader;
5922             break;
5923
5924         default:
5925             FIXME("Unsupported YUV fixup %#x, not setting a shader\n", yuv_fixup);
5926             ENTER_GL();
5927             glEnable(textype);
5928             checkGLcall("glEnable(textype)");
5929             LEAVE_GL();
5930             return E_NOTIMPL;
5931     }
5932
5933     if (!shader) shader = gen_yuv_shader(device, yuv_fixup, textype);
5934
5935     ENTER_GL();
5936     glEnable(GL_FRAGMENT_PROGRAM_ARB);
5937     checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
5938     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
5939     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
5940     GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, size));
5941     checkGLcall("glProgramLocalParameter4fvARB");
5942     LEAVE_GL();
5943
5944     return WINED3D_OK;
5945 }
5946
5947 static void arbfp_blit_unset(IWineD3DDevice *iface) {
5948     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
5949
5950     ENTER_GL();
5951     glDisable(GL_FRAGMENT_PROGRAM_ARB);
5952     checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
5953     glDisable(GL_TEXTURE_2D);
5954     checkGLcall("glDisable(GL_TEXTURE_2D)");
5955     if(GL_SUPPORT(ARB_TEXTURE_CUBE_MAP)) {
5956         glDisable(GL_TEXTURE_CUBE_MAP_ARB);
5957         checkGLcall("glDisable(GL_TEXTURE_CUBE_MAP_ARB)");
5958     }
5959     if(GL_SUPPORT(ARB_TEXTURE_RECTANGLE)) {
5960         glDisable(GL_TEXTURE_RECTANGLE_ARB);
5961         checkGLcall("glDisable(GL_TEXTURE_RECTANGLE_ARB)");
5962     }
5963     LEAVE_GL();
5964 }
5965
5966 static BOOL arbfp_blit_color_fixup_supported(struct color_fixup_desc fixup)
5967 {
5968     enum yuv_fixup yuv_fixup;
5969
5970     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
5971     {
5972         TRACE("Checking support for fixup:\n");
5973         dump_color_fixup_desc(fixup);
5974     }
5975
5976     if (is_identity_fixup(fixup))
5977     {
5978         TRACE("[OK]\n");
5979         return TRUE;
5980     }
5981
5982     /* We only support YUV conversions. */
5983     if (!is_yuv_fixup(fixup))
5984     {
5985         TRACE("[FAILED]\n");
5986         return FALSE;
5987     }
5988
5989     yuv_fixup = get_yuv_fixup(fixup);
5990     switch(yuv_fixup)
5991     {
5992         case YUV_FIXUP_YUY2:
5993         case YUV_FIXUP_UYVY:
5994         case YUV_FIXUP_YV12:
5995             TRACE("[OK]\n");
5996             return TRUE;
5997
5998         default:
5999             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
6000             TRACE("[FAILED]\n");
6001             return FALSE;
6002     }
6003 }
6004
6005 const struct blit_shader arbfp_blit = {
6006     arbfp_blit_alloc,
6007     arbfp_blit_free,
6008     arbfp_blit_set,
6009     arbfp_blit_unset,
6010     arbfp_blit_color_fixup_supported,
6011 };
6012
6013 #undef GLINFO_LOCATION