wined3d: Add a proper enum value for "no extension".
[wine] / dlls / wined3d / arb_program_shader.c
1 /*
2  * Pixel and vertex shaders implementation using ARB_vertex_program
3  * and ARB_fragment_program GL extensions.
4  *
5  * Copyright 2002-2003 Jason Edmeades
6  * Copyright 2002-2003 Raphael Junqueira
7  * Copyright 2004 Christian Costa
8  * Copyright 2005 Oliver Stieber
9  * Copyright 2006 Ivan Gyurdiev
10  * Copyright 2006 Jason Green
11  * Copyright 2006 Henri Verbeet
12  * Copyright 2007-2008 Stefan Dösinger for CodeWeavers
13  * Copyright 2009 Henri Verbeet for CodeWeavers
14  *
15  * This library is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU Lesser General Public
17  * License as published by the Free Software Foundation; either
18  * version 2.1 of the License, or (at your option) any later version.
19  *
20  * This library is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23  * Lesser General Public License for more details.
24  *
25  * You should have received a copy of the GNU Lesser General Public
26  * License along with this library; if not, write to the Free Software
27  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
28  */
29
30 #include "config.h"
31
32 #include <math.h>
33 #include <stdio.h>
34
35 #include "wined3d_private.h"
36
37 WINE_DEFAULT_DEBUG_CHANNEL(d3d_shader);
38 WINE_DECLARE_DEBUG_CHANNEL(d3d_constants);
39 WINE_DECLARE_DEBUG_CHANNEL(d3d_caps);
40 WINE_DECLARE_DEBUG_CHANNEL(d3d);
41
42 #define GLINFO_LOCATION      (*gl_info)
43
44 /* GL locking for state handlers is done by the caller. */
45 static BOOL need_mova_const(IWineD3DBaseShader *shader, const WineD3D_GL_Info *gl_info) {
46     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *) shader;
47     if(!This->baseShader.reg_maps.usesmova) return FALSE;
48     return !GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
49 }
50
51 static BOOL need_helper_const(const WineD3D_GL_Info *gl_info) {
52     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)   || /* Need to init colors */
53        gl_info->arb_vs_offset_limit     || /* Have to init texcoords */
54        gl_info->set_texcoord_w) {          /* Load the immval offset */
55         return TRUE;
56     }
57     return FALSE;
58 }
59
60 static unsigned int reserved_vs_const(IWineD3DBaseShader *shader, const WineD3D_GL_Info *gl_info) {
61     unsigned int ret = 1;
62     /* We use one PARAM for the pos fixup, and in some cases one to load
63      * some immediate values into the shader
64      */
65     if(need_helper_const(gl_info)) ret++;
66     if(need_mova_const(shader, gl_info)) ret++;
67     return ret;
68 }
69
/* Internally used shader constants. They are uploaded through program.env,
 * starting at the last available vertex shader constant slot
 * (GL_LIMITS(vshader_constantsF) - 1).
 *
 * Both macros are fully parenthesized so they can be used inside larger
 * expressions without operator precedence surprises.
 */
#define ARB_SHADER_PRIVCONST_BASE (GL_LIMITS(vshader_constantsF) - 1)
#define ARB_SHADER_PRIVCONST_POS (ARB_SHADER_PRIVCONST_BASE + 0)
75
76 /* ARB_program_shader private data */
77 struct shader_arb_priv {
78     GLuint                  current_vprogram_id;
79     GLuint                  current_fprogram_id;
80     GLuint                  depth_blt_vprogram_id;
81     GLuint                  depth_blt_fprogram_id[tex_type_count];
82     BOOL                    use_arbfp_fixed_func;
83     struct wine_rb_tree     fragment_shaders;
84 };
85
86 struct if_frame {
87     struct list entry;
88     BOOL ifc;
89     BOOL muting;
90 };
91
92 struct shader_arb_ctx_priv {
93     char addr_reg[20];
94     enum {
95         /* plain GL_ARB_vertex_program or GL_ARB_fragment_program */
96         ARB,
97         /* GL_NV_vertex_progam2_option or GL_NV_fragment_program_option */
98         NV2,
99         /* GL_NV_vertex_program3 or GL_NV_fragment_program2 */
100         NV3
101     } target_version;
102
103     const struct arb_vs_compile_args    *cur_vs_args;
104     const struct arb_ps_compile_args    *cur_ps_args;
105     struct list if_frames;
106     BOOL muted;
107 };
108
109 struct arb_ps_compile_args {
110     struct ps_compile_args          super;
111     DWORD                           bools; /* WORD is enough, use DWORD for alignment */
112 };
113
114 struct arb_ps_compiled_shader {
115     struct arb_ps_compile_args      args;
116     GLuint                          prgId;
117 };
118
119 struct arb_pshader_private {
120     struct arb_ps_compiled_shader   *gl_shaders;
121     UINT                            num_gl_shaders, shader_array_size;
122 };
123
124 struct arb_vs_compile_args {
125     struct vs_compile_args          super;
126     DWORD                           bools; /* WORD is enough, use DWORD for alignment */
127 };
128
129 struct arb_vs_compiled_shader {
130     struct arb_vs_compile_args      args;
131     GLuint                          prgId;
132 };
133
134 struct arb_vshader_private {
135     struct arb_vs_compiled_shader   *gl_shaders;
136     UINT                            num_gl_shaders, shader_array_size;
137 };
138
139 /********************************************************
140  * ARB_[vertex/fragment]_program helper functions follow
141  ********************************************************/
142
143 /** 
144  * Loads floating point constants into the currently set ARB_vertex/fragment_program.
145  * When constant_list == NULL, it will load all the constants.
146  *  
147  * @target_type should be either GL_VERTEX_PROGRAM_ARB (for vertex shaders)
148  *  or GL_FRAGMENT_PROGRAM_ARB (for pixel shaders)
149  */
150 /* GL locking is done by the caller */
151 static unsigned int shader_arb_load_constantsF(IWineD3DBaseShaderImpl* This, const WineD3D_GL_Info *gl_info,
152         GLuint target_type, unsigned int max_constants, const float *constants, char *dirty_consts)
153 {
154     local_constant* lconst;
155     DWORD i, j;
156     unsigned int ret;
157
158     if (TRACE_ON(d3d_shader)) {
159         for(i = 0; i < max_constants; i++) {
160             if(!dirty_consts[i]) continue;
161             TRACE_(d3d_constants)("Loading constants %i: %f, %f, %f, %f\n", i,
162                         constants[i * 4 + 0], constants[i * 4 + 1],
163                         constants[i * 4 + 2], constants[i * 4 + 3]);
164         }
165     }
166     /* In 1.X pixel shaders constants are implicitly clamped in the range [-1;1] */
167     if (target_type == GL_FRAGMENT_PROGRAM_ARB && This->baseShader.reg_maps.shader_version.major == 1)
168     {
169         float lcl_const[4];
170         for(i = 0; i < max_constants; i++) {
171             if(!dirty_consts[i]) continue;
172             dirty_consts[i] = 0;
173
174             j = 4 * i;
175             if(constants[j + 0] > 1.0) lcl_const[0] = 1.0;
176             else if(constants[j + 0] < -1.0) lcl_const[0] = -1.0;
177             else lcl_const[0] = constants[j + 0];
178
179             if(constants[j + 1] > 1.0) lcl_const[1] = 1.0;
180             else if(constants[j + 1] < -1.0) lcl_const[1] = -1.0;
181             else lcl_const[1] = constants[j + 1];
182
183             if(constants[j + 2] > 1.0) lcl_const[2] = 1.0;
184             else if(constants[j + 2] < -1.0) lcl_const[2] = -1.0;
185             else lcl_const[2] = constants[j + 2];
186
187             if(constants[j + 3] > 1.0) lcl_const[3] = 1.0;
188             else if(constants[j + 3] < -1.0) lcl_const[3] = -1.0;
189             else lcl_const[3] = constants[j + 3];
190
191             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, lcl_const));
192         }
193     } else {
194         if(GL_SUPPORT(EXT_GPU_PROGRAM_PARAMETERS)) {
195             /* TODO: Benchmark if we're better of with finding the dirty constants ourselves,
196              * or just reloading *all* constants at once
197              *
198             GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, 0, max_constants, constants));
199              */
200             for(i = 0; i < max_constants; i++) {
201                 if(!dirty_consts[i]) continue;
202
203                 /* Find the next block of dirty constants */
204                 dirty_consts[i] = 0;
205                 j = i;
206                 for(i++; (i < max_constants) && dirty_consts[i]; i++) {
207                     dirty_consts[i] = 0;
208                 }
209
210                 GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, j, i - j, constants + (j * 4)));
211             }
212         } else {
213             for(i = 0; i < max_constants; i++) {
214                 if(dirty_consts[i]) {
215                     dirty_consts[i] = 0;
216                     GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, constants + (i * 4)));
217                 }
218             }
219         }
220     }
221     checkGLcall("glProgramEnvParameter4fvARB()");
222
223     /* Load immediate constants */
224     if(This->baseShader.load_local_constsF) {
225         if (TRACE_ON(d3d_shader)) {
226             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
227                 GLfloat* values = (GLfloat*)lconst->value;
228                 TRACE_(d3d_constants)("Loading local constants %i: %f, %f, %f, %f\n", lconst->idx,
229                         values[0], values[1], values[2], values[3]);
230             }
231         }
232         /* Immediate constants are clamped for 1.X shaders at loading times */
233         ret = 0;
234         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
235             dirty_consts[lconst->idx] = 1; /* Dirtify so the non-immediate constant overwrites it next time */
236             ret = max(ret, lconst->idx + 1);
237             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, lconst->idx, (GLfloat*)lconst->value));
238         }
239         checkGLcall("glProgramEnvParameter4fvARB()");
240         return ret; /* The loaded immediate constants need reloading for the next shader */
241     } else {
242         return 0; /* No constants are dirty now */
243     }
244 }
245
246 /**
247  * Loads the texture dimensions for NP2 fixup into the currently set ARB_[vertex/fragment]_programs.
248  */
249 static void shader_arb_load_np2fixup_constants(
250     IWineD3DDevice* device,
251     char usePixelShader,
252     char useVertexShader) {
253     /* not implemented */
254 }
255
256 static inline void shader_arb_ps_local_constants(IWineD3DDeviceImpl* deviceImpl)
257 {
258     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
259     IWineD3DBaseShaderImpl* pshader = (IWineD3DBaseShaderImpl*) stateBlock->pixelShader;
260     IWineD3DPixelShaderImpl *psi = (IWineD3DPixelShaderImpl *) pshader;
261     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
262     unsigned char i;
263
264     for(i = 0; i < psi->numbumpenvmatconsts; i++)
265     {
266         /* The state manager takes care that this function is always called if the bump env matrix changes */
267         const float *data = (const float *)&stateBlock->textureState[(int) psi->bumpenvmatconst[i].texunit][WINED3DTSS_BUMPENVMAT00];
268         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, psi->bumpenvmatconst[i].const_num, data));
269
270         if (psi->luminanceconst[i].const_num != WINED3D_CONST_NUM_UNUSED)
271         {
272             /* WINED3DTSS_BUMPENVLSCALE and WINED3DTSS_BUMPENVLOFFSET are next to each other.
273              * point gl to the scale, and load 4 floats. x = scale, y = offset, z and w are junk, we
274              * don't care about them. The pointers are valid for sure because the stateblock is bigger.
275              * (they're WINED3DTSS_TEXTURETRANSFORMFLAGS and WINED3DTSS_ADDRESSW, so most likely 0 or NaN
276             */
277             const float *scale = (const float *)&stateBlock->textureState[(int) psi->luminanceconst[i].texunit][WINED3DTSS_BUMPENVLSCALE];
278             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, psi->luminanceconst[i].const_num, scale));
279         }
280     }
281 }
282 /**
283  * Loads the app-supplied constants into the currently set ARB_[vertex/fragment]_programs.
284  * 
285  * We only support float constants in ARB at the moment, so don't 
286  * worry about the Integers or Booleans
287  */
288 /* GL locking is done by the caller (state handler) */
289 static void shader_arb_load_constants(
290     IWineD3DDevice* device,
291     char usePixelShader,
292     char useVertexShader) {
293    
294     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) device; 
295     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
296     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
297
298     if (useVertexShader) {
299         IWineD3DBaseShaderImpl* vshader = (IWineD3DBaseShaderImpl*) stateBlock->vertexShader;
300
301         /* Load DirectX 9 float constants for vertex shader */
302         deviceImpl->highest_dirty_vs_const = shader_arb_load_constantsF(
303                 vshader, gl_info, GL_VERTEX_PROGRAM_ARB,
304                 deviceImpl->highest_dirty_vs_const,
305                 stateBlock->vertexShaderConstantF,
306                 deviceImpl->activeContext->vshader_const_dirty);
307
308         /* Upload the position fixup */
309         GL_EXTCALL(glProgramEnvParameter4fvARB(GL_VERTEX_PROGRAM_ARB, ARB_SHADER_PRIVCONST_POS, deviceImpl->posFixup));
310     }
311
312     if (usePixelShader) {
313         IWineD3DBaseShaderImpl* pshader = (IWineD3DBaseShaderImpl*) stateBlock->pixelShader;
314
315         /* Load DirectX 9 float constants for pixel shader */
316         deviceImpl->highest_dirty_ps_const = shader_arb_load_constantsF(
317                 pshader, gl_info, GL_FRAGMENT_PROGRAM_ARB,
318                 deviceImpl->highest_dirty_ps_const,
319                 stateBlock->pixelShaderConstantF,
320                 deviceImpl->activeContext->pshader_const_dirty);
321         shader_arb_ps_local_constants(deviceImpl);
322     }
323 }
324
325 static void shader_arb_update_float_vertex_constants(IWineD3DDevice *iface, UINT start, UINT count)
326 {
327     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
328
329     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
330      * context. On a context switch the old context will be fully dirtified */
331     memset(This->activeContext->vshader_const_dirty + start, 1,
332             sizeof(*This->activeContext->vshader_const_dirty) * count);
333     This->highest_dirty_vs_const = max(This->highest_dirty_vs_const, start + count + 1);
334 }
335
336 static void shader_arb_update_float_pixel_constants(IWineD3DDevice *iface, UINT start, UINT count)
337 {
338     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
339
340     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
341      * context. On a context switch the old context will be fully dirtified */
342     memset(This->activeContext->pshader_const_dirty + start, 1,
343             sizeof(*This->activeContext->pshader_const_dirty) * count);
344     This->highest_dirty_ps_const = max(This->highest_dirty_ps_const, start + count + 1);
345 }
346
347 static DWORD *local_const_mapping(IWineD3DBaseShaderImpl *This)
348 {
349     DWORD *ret;
350     DWORD idx = 0;
351     const local_constant *lconst;
352
353     if(This->baseShader.load_local_constsF || list_empty(&This->baseShader.constantsF)) return NULL;
354
355     ret = HeapAlloc(GetProcessHeap(), 0, sizeof(DWORD) * This->baseShader.limits.constant_float);
356     if(!ret) {
357         ERR("Out of memory\n");
358         return NULL;
359     }
360
361     LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
362         ret[lconst->idx] = idx++;
363     }
364     return ret;
365 }
366
367 /* Generate the variable & register declarations for the ARB_vertex_program output target */
368 static void shader_generate_arb_declarations(IWineD3DBaseShader *iface, const shader_reg_maps *reg_maps,
369         SHADER_BUFFER *buffer, const WineD3D_GL_Info *gl_info, DWORD *lconst_map)
370 {
371     IWineD3DBaseShaderImpl* This = (IWineD3DBaseShaderImpl*) iface;
372     DWORD i, cur, next_local = 0;
373     char pshader = shader_is_pshader_version(reg_maps->shader_version.type);
374     unsigned max_constantsF;
375     const local_constant *lconst;
376
377     /* In pixel shaders, all private constants are program local, we don't need anything
378      * from program.env. Thus we can advertise the full set of constants in pixel shaders.
379      * If we need a private constant the GL implementation will squeeze it in somewhere
380      *
381      * With vertex shaders we need the posFixup and on some GL implementations 4 helper
382      * immediate values. The posFixup is loaded using program.env for now, so always
383      * subtract one from the number of constants. If the shader uses indirect addressing,
384      * account for the helper const too because we have to declare all availabke d3d constants
385      * and don't know which are actually used.
386      */
387     if(pshader) {
388         max_constantsF = GL_LIMITS(pshader_constantsF);
389     } else {
390         if(This->baseShader.reg_maps.usesrelconstF) {
391             max_constantsF = GL_LIMITS(vshader_constantsF) - reserved_vs_const(iface, gl_info);
392             if(GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) max_constantsF -= GL_LIMITS(clipplanes);
393         } else {
394             max_constantsF = GL_LIMITS(vshader_constantsF) - 1;
395         }
396     }
397
398     for(i = 0; i < This->baseShader.limits.temporary; i++) {
399         if (reg_maps->temporary[i])
400             shader_addline(buffer, "TEMP R%u;\n", i);
401     }
402
403     for (i = 0; i < This->baseShader.limits.address; i++) {
404         if (reg_maps->address[i])
405             shader_addline(buffer, "ADDRESS A%d;\n", i);
406     }
407
408     if(pshader && reg_maps->shader_version.major == 1 && reg_maps->shader_version.minor <= 3) {
409         for(i = 0; i < This->baseShader.limits.texcoord; i++) {
410             if (reg_maps->texcoord[i] && pshader)
411                 shader_addline(buffer,"TEMP T%u;\n", i);
412         }
413     }
414
415     /* Load local constants using the program-local space,
416      * this avoids reloading them each time the shader is used
417      */
418     if(lconst_map) {
419         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
420             shader_addline(buffer, "PARAM C%u = program.local[%u];\n", lconst->idx,
421                            lconst_map[lconst->idx]);
422             next_local = max(next_local, lconst_map[lconst->idx] + 1);
423         }
424     }
425
426     /* we use the array-based constants array if the local constants are marked for loading,
427      * because then we use indirect addressing, or when the local constant list is empty,
428      * because then we don't know if we're using indirect addressing or not. If we're hardcoding
429      * local constants do not declare the loaded constants as an array because ARB compilers usually
430      * do not optimize unused constants away
431      */
432     if(This->baseShader.reg_maps.usesrelconstF) {
433         /* Need to PARAM the environment parameters (constants) so we can use relative addressing */
434         shader_addline(buffer, "PARAM C[%d] = { program.env[0..%d] };\n",
435                     max_constantsF, max_constantsF - 1);
436     } else {
437         for(i = 0; i < max_constantsF; i++) {
438             DWORD idx, mask;
439             idx = i >> 5;
440             mask = 1 << (i & 0x1f);
441             if(!shader_constant_is_local(This, i) && (This->baseShader.reg_maps.constf[idx] & mask)) {
442                 shader_addline(buffer, "PARAM C%d = program.env[%d];\n",i, i);
443             }
444         }
445     }
446
447     for(i = 0; i < (sizeof(reg_maps->bumpmat) / sizeof(reg_maps->bumpmat[0])); i++) {
448         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) This;
449         if(!reg_maps->bumpmat[i]) continue;
450
451         cur = ps->numbumpenvmatconsts;
452         ps->bumpenvmatconst[cur].const_num = -1;
453         ps->bumpenvmatconst[cur].texunit = i;
454         ps->luminanceconst[cur].const_num = -1;
455         ps->luminanceconst[cur].texunit = i;
456
457         /* We can fit the constants into the constant limit for sure because texbem, texbeml, bem and beml are only supported
458          * in 1.x shaders, and GL_ARB_fragment_program has a constant limit of 24 constants. So in the worst case we're loading
459          * 8 shader constants, 8 bump matrices and 8 luminance parameters and are perfectly fine. (No NP2 fixup on bumpmapped
460          * textures due to conditional NP2 restrictions)
461          *
462          * Use local constants to load the bump env parameters, not program.env. This avoids collisions with d3d constants of
463          * shaders in newer shader models. Since the bump env parameters have to share their space with NP2 fixup constants,
464          * their location is shader dependent anyway and they cannot be loaded globally.
465          */
466         ps->bumpenvmatconst[cur].const_num = next_local++;
467         shader_addline(buffer, "PARAM bumpenvmat%d = program.local[%d];\n",
468                        i, ps->bumpenvmatconst[cur].const_num);
469         ps->numbumpenvmatconsts = cur + 1;
470
471         if(!reg_maps->luminanceparams[i]) continue;
472
473         ((IWineD3DPixelShaderImpl *)This)->luminanceconst[cur].const_num = next_local++;
474         shader_addline(buffer, "PARAM luminance%d = program.local[%d];\n",
475                         i, ps->luminanceconst[cur].const_num);
476     }
477
478 }
479
/* Operand names indexed by the 4 bit shift value. Values 1-4 select a
 * component of "coefmul", values 12-15 a component of "coefdiv"; every
 * other value maps to "dummy". */
static const char * const shift_tab[] =
{
    /*  0: none */ "dummy",
    /*  1: x2   */ "coefmul.x",
    /*  2: x4   */ "coefmul.y",
    /*  3: x8   */ "coefmul.z",
    /*  4: x16  */ "coefmul.w",
    /*  5: x32  */ "dummy",
    /*  6: x64  */ "dummy",
    /*  7: x128 */ "dummy",
    /*  8: d256 */ "dummy",
    /*  9: d128 */ "dummy",
    /* 10: d64  */ "dummy",
    /* 11: d32  */ "dummy",
    /* 12: d16  */ "coefdiv.w",
    /* 13: d8   */ "coefdiv.z",
    /* 14: d4   */ "coefdiv.y",
    /* 15: d2   */ "coefdiv.x",
};
498
499 static void shader_arb_get_write_mask(const struct wined3d_shader_instruction *ins,
500         const struct wined3d_shader_dst_param *dst, char *write_mask)
501 {
502     char *ptr = write_mask;
503
504     if (dst->write_mask != WINED3DSP_WRITEMASK_ALL)
505     {
506         *ptr++ = '.';
507         if (dst->write_mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
508         if (dst->write_mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
509         if (dst->write_mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
510         if (dst->write_mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
511     }
512
513     *ptr = '\0';
514 }
515
516 static void shader_arb_get_swizzle(const struct wined3d_shader_src_param *param, BOOL fixup, char *swizzle_str)
517 {
518     /* For registers of type WINED3DDECLTYPE_D3DCOLOR, data is stored as "bgra",
519      * but addressed as "rgba". To fix this we need to swap the register's x
520      * and z components. */
521     const char *swizzle_chars = fixup ? "zyxw" : "xyzw";
522     char *ptr = swizzle_str;
523
524     /* swizzle bits fields: wwzzyyxx */
525     DWORD swizzle = param->swizzle;
526     DWORD swizzle_x = swizzle & 0x03;
527     DWORD swizzle_y = (swizzle >> 2) & 0x03;
528     DWORD swizzle_z = (swizzle >> 4) & 0x03;
529     DWORD swizzle_w = (swizzle >> 6) & 0x03;
530
531     /* If the swizzle is the default swizzle (ie, "xyzw"), we don't need to
532      * generate a swizzle string. Unless we need to our own swizzling. */
533     if (swizzle != WINED3DSP_NOSWIZZLE || fixup)
534     {
535         *ptr++ = '.';
536         if (swizzle_x == swizzle_y && swizzle_x == swizzle_z && swizzle_x == swizzle_w) {
537             *ptr++ = swizzle_chars[swizzle_x];
538         } else {
539             *ptr++ = swizzle_chars[swizzle_x];
540             *ptr++ = swizzle_chars[swizzle_y];
541             *ptr++ = swizzle_chars[swizzle_z];
542             *ptr++ = swizzle_chars[swizzle_w];
543         }
544     }
545
546     *ptr = '\0';
547 }
548
549 static void shader_arb_request_a0(const struct wined3d_shader_instruction *ins, const char *src)
550 {
551     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
552     SHADER_BUFFER *buffer = ins->ctx->buffer;
553
554     if(strcmp(priv->addr_reg, src) == 0) return;
555
556     strcpy(priv->addr_reg, src);
557     shader_addline(buffer, "ARL A0.x, %s;\n", src);
558 }
559
560 static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
561         const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr);
562
563 static void shader_arb_get_register_name(const struct wined3d_shader_instruction *ins,
564         const struct wined3d_shader_register *reg, char *register_name, BOOL *is_color)
565 {
566     /* oPos, oFog and oPts in D3D */
567     static const char * const rastout_reg_names[] = {"TMP_OUT", "result.fogcoord", "result.pointsize"};
568     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
569     BOOL pshader = shader_is_pshader_version(This->baseShader.reg_maps.shader_version.type);
570     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
571
572     *is_color = FALSE;
573
574     switch (reg->type)
575     {
576         case WINED3DSPR_TEMP:
577             sprintf(register_name, "R%u", reg->idx);
578             break;
579
580         case WINED3DSPR_INPUT:
581             if (pshader)
582             {
583                 if (reg->idx == 0) strcpy(register_name, "fragment.color.primary");
584                 else strcpy(register_name, "fragment.color.secondary");
585             }
586             else
587             {
588                 if (ctx->cur_vs_args->super.swizzle_map & (1 << reg->idx)) *is_color = TRUE;
589                 sprintf(register_name, "vertex.attrib[%u]", reg->idx);
590             }
591             break;
592
593         case WINED3DSPR_CONST:
594             if (!pshader && reg->rel_addr)
595             {
596                 char rel_reg[50];
597                 UINT rel_offset = ((IWineD3DVertexShaderImpl *)This)->rel_offset;
598                 if(This->baseShader.reg_maps.shader_version.major < 2) {
599                     sprintf(rel_reg, "A0.x");
600                 } else {
601                     shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
602                     if(ctx->target_version == ARB) {
603                         shader_arb_request_a0(ins, rel_reg);
604                         sprintf(rel_reg, "A0.x");
605                     }
606                 }
607                 if (reg->idx >= rel_offset)
608                     sprintf(register_name, "C[%s + %u]", rel_reg, reg->idx - rel_offset);
609                 else
610                     sprintf(register_name, "C[%s - %u]", rel_reg, -reg->idx + rel_offset);
611             }
612             else
613             {
614                 if (This->baseShader.reg_maps.usesrelconstF)
615                     sprintf(register_name, "C[%u]", reg->idx);
616                 else
617                     sprintf(register_name, "C%u", reg->idx);
618             }
619             break;
620
621         case WINED3DSPR_TEXTURE: /* case WINED3DSPR_ADDR: */
622             if (pshader) {
623                 if(This->baseShader.reg_maps.shader_version.major == 1 &&
624                    This->baseShader.reg_maps.shader_version.minor <= 3) {
625                     /* In ps <= 1.3, Tx is a temporary register as destination to all instructions,
626                      * and as source to most instructions. For some instructions it is the texcoord
627                      * input. Those instructions know about the special use
628                      */
629                     sprintf(register_name, "T%u", reg->idx);
630                 } else {
631                     /* in ps 1.4 and 2.x Tx is always a (read-only) varying */
632                     sprintf(register_name, "fragment.texcoord[%u]", reg->idx);
633                 }
634             }
635             else
636             {
637                 if(This->baseShader.reg_maps.shader_version.major == 1 || ctx->target_version >= NV2)
638                 {
639                     sprintf(register_name, "A%u", reg->idx);
640                 }
641                 else
642                 {
643                     sprintf(register_name, "A%u_SHADOW", reg->idx);
644                 }
645             }
646             break;
647
648         case WINED3DSPR_COLOROUT:
649             if (reg->idx == 0)
650             {
651                 if(ctx->cur_ps_args->super.srgb_correction)
652                 {
653                     strcpy(register_name, "TMP_COLOR");
654                 }
655                 else
656                 {
657                     strcpy(register_name, "result.color");
658                 }
659             }
660             else
661             {
662                 /* TODO: See GL_ARB_draw_buffers */
663                 FIXME("Unsupported write to render target %u\n", reg->idx);
664                 sprintf(register_name, "unsupported_register");
665             }
666             break;
667
668         case WINED3DSPR_RASTOUT:
669             sprintf(register_name, "%s", rastout_reg_names[reg->idx]);
670             break;
671
672         case WINED3DSPR_DEPTHOUT:
673             strcpy(register_name, "result.depth");
674             break;
675
676         case WINED3DSPR_ATTROUT:
677             if (pshader) sprintf(register_name, "oD[%u]", reg->idx);
678             else if (reg->idx == 0) strcpy(register_name, "result.color.primary");
679             else strcpy(register_name, "result.color.secondary");
680             break;
681
682         case WINED3DSPR_TEXCRDOUT:
683             if (pshader) sprintf(register_name, "oT[%u]", reg->idx);
684             else sprintf(register_name, "result.texcoord[%u]", reg->idx);
685             break;
686
687         default:
688             FIXME("Unhandled register type %#x[%u]\n", reg->type, reg->idx);
689             sprintf(register_name, "unrecognized_register[%u]", reg->idx);
690             break;
691     }
692 }
693
694 static void shader_arb_get_dst_param(const struct wined3d_shader_instruction *ins,
695         const struct wined3d_shader_dst_param *wined3d_dst, char *str)
696 {
697     char register_name[255];
698     char write_mask[6];
699     BOOL is_color;
700
701     shader_arb_get_register_name(ins, &wined3d_dst->reg, register_name, &is_color);
702     strcpy(str, register_name);
703
704     shader_arb_get_write_mask(ins, wined3d_dst, write_mask);
705     strcat(str, write_mask);
706 }
707
708 static const char *shader_arb_get_fixup_swizzle(enum fixup_channel_source channel_source)
709 {
710     switch(channel_source)
711     {
712         case CHANNEL_SOURCE_ZERO: return "0";
713         case CHANNEL_SOURCE_ONE: return "1";
714         case CHANNEL_SOURCE_X: return "x";
715         case CHANNEL_SOURCE_Y: return "y";
716         case CHANNEL_SOURCE_Z: return "z";
717         case CHANNEL_SOURCE_W: return "w";
718         default:
719             FIXME("Unhandled channel source %#x\n", channel_source);
720             return "undefined";
721     }
722 }
723
724 static void gen_color_correction(SHADER_BUFFER *buffer, const char *reg, DWORD dst_mask,
725                                  const char *one, const char *two, struct color_fixup_desc fixup)
726 {
727     DWORD mask;
728
729     if (is_yuv_fixup(fixup))
730     {
731         enum yuv_fixup yuv_fixup = get_yuv_fixup(fixup);
732         FIXME("YUV fixup (%#x) not supported\n", yuv_fixup);
733         return;
734     }
735
736     mask = 0;
737     if (fixup.x_source != CHANNEL_SOURCE_X) mask |= WINED3DSP_WRITEMASK_0;
738     if (fixup.y_source != CHANNEL_SOURCE_Y) mask |= WINED3DSP_WRITEMASK_1;
739     if (fixup.z_source != CHANNEL_SOURCE_Z) mask |= WINED3DSP_WRITEMASK_2;
740     if (fixup.w_source != CHANNEL_SOURCE_W) mask |= WINED3DSP_WRITEMASK_3;
741     mask &= dst_mask;
742
743     if (mask)
744     {
745         shader_addline(buffer, "SWZ %s, %s, %s, %s, %s, %s;\n", reg, reg,
746                 shader_arb_get_fixup_swizzle(fixup.x_source), shader_arb_get_fixup_swizzle(fixup.y_source),
747                 shader_arb_get_fixup_swizzle(fixup.z_source), shader_arb_get_fixup_swizzle(fixup.w_source));
748     }
749
750     mask = 0;
751     if (fixup.x_sign_fixup) mask |= WINED3DSP_WRITEMASK_0;
752     if (fixup.y_sign_fixup) mask |= WINED3DSP_WRITEMASK_1;
753     if (fixup.z_sign_fixup) mask |= WINED3DSP_WRITEMASK_2;
754     if (fixup.w_sign_fixup) mask |= WINED3DSP_WRITEMASK_3;
755     mask &= dst_mask;
756
757     if (mask)
758     {
759         char reg_mask[6];
760         char *ptr = reg_mask;
761
762         if (mask != WINED3DSP_WRITEMASK_ALL)
763         {
764             *ptr++ = '.';
765             if (mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
766             if (mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
767             if (mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
768             if (mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
769         }
770         *ptr = '\0';
771
772         shader_addline(buffer, "MAD %s%s, %s, %s, -%s;\n", reg, reg_mask, reg, two, one);
773     }
774 }
775
776 static void shader_hw_sample(const struct wined3d_shader_instruction *ins, DWORD sampler_idx,
777         const char *dst_str, const char *coord_reg, BOOL projected, BOOL bias)
778 {
779     SHADER_BUFFER *buffer = ins->ctx->buffer;
780     DWORD sampler_type = ins->ctx->reg_maps->sampler_type[sampler_idx];
781     const char *tex_type;
782     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
783     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) This->baseShader.device;
784     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
785
786     switch(sampler_type) {
787         case WINED3DSTT_1D:
788             tex_type = "1D";
789             break;
790
791         case WINED3DSTT_2D:
792             if(device->stateBlock->textures[sampler_idx] &&
793                IWineD3DBaseTexture_GetTextureDimensions(device->stateBlock->textures[sampler_idx]) == GL_TEXTURE_RECTANGLE_ARB) {
794                 tex_type = "RECT";
795             } else {
796                 tex_type = "2D";
797             }
798             if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
799             {
800                 if(priv->cur_ps_args->super.np2_fixup & (1 << sampler_idx))
801                 {
802                     FIXME("NP2 texcoord fixup is currently not implemented in ARB mode (use GLSL instead).\n");
803                 }
804             }
805             break;
806
807         case WINED3DSTT_VOLUME:
808             tex_type = "3D";
809             break;
810
811         case WINED3DSTT_CUBE:
812             tex_type = "CUBE";
813             break;
814
815         default:
816             ERR("Unexpected texture type %d\n", sampler_type);
817             tex_type = "";
818     }
819
820     if (bias) {
821         /* Shouldn't be possible, but let's check for it */
822         if(projected) FIXME("Biased and Projected texture sampling\n");
823         /* TXB takes the 4th component of the source vector automatically, as d3d. Nothing more to do */
824         shader_addline(buffer, "TXB %s, %s, texture[%u], %s;\n", dst_str, coord_reg, sampler_idx, tex_type);
825     } else if (projected) {
826         shader_addline(buffer, "TXP %s, %s, texture[%u], %s;\n", dst_str, coord_reg, sampler_idx, tex_type);
827     } else {
828         shader_addline(buffer, "TEX %s, %s, texture[%u], %s;\n", dst_str, coord_reg, sampler_idx, tex_type);
829     }
830
831     if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
832     {
833         gen_color_correction(buffer, dst_str, ins->dst[0].write_mask,
834                 "one", "coefmul.x", priv->cur_ps_args->super.color_fixup[sampler_idx]);
835     }
836 }
837
838 static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
839         const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr)
840 {
841     /* Generate a line that does the input modifier computation and return the input register to use */
842     BOOL is_color = FALSE;
843     char regstr[256];
844     char swzstr[20];
845     int insert_line;
846     SHADER_BUFFER *buffer = ins->ctx->buffer;
847     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
848
849     /* Assume a new line will be added */
850     insert_line = 1;
851
852     /* Get register name */
853     shader_arb_get_register_name(ins, &src->reg, regstr, &is_color);
854     shader_arb_get_swizzle(src, is_color, swzstr);
855
856     switch (src->modifiers)
857     {
858     case WINED3DSPSM_NONE:
859         sprintf(outregstr, "%s%s", regstr, swzstr);
860         insert_line = 0;
861         break;
862     case WINED3DSPSM_NEG:
863         sprintf(outregstr, "-%s%s", regstr, swzstr);
864         insert_line = 0;
865         break;
866     case WINED3DSPSM_BIAS:
867         shader_addline(buffer, "ADD T%c, %s, -coefdiv.x;\n", 'A' + tmpreg, regstr);
868         break;
869     case WINED3DSPSM_BIASNEG:
870         shader_addline(buffer, "ADD T%c, -%s, coefdiv.x;\n", 'A' + tmpreg, regstr);
871         break;
872     case WINED3DSPSM_SIGN:
873         shader_addline(buffer, "MAD T%c, %s, coefmul.x, -one.x;\n", 'A' + tmpreg, regstr);
874         break;
875     case WINED3DSPSM_SIGNNEG:
876         shader_addline(buffer, "MAD T%c, %s, -coefmul.x, one.x;\n", 'A' + tmpreg, regstr);
877         break;
878     case WINED3DSPSM_COMP:
879         shader_addline(buffer, "SUB T%c, one.x, %s;\n", 'A' + tmpreg, regstr);
880         break;
881     case WINED3DSPSM_X2:
882         shader_addline(buffer, "ADD T%c, %s, %s;\n", 'A' + tmpreg, regstr, regstr);
883         break;
884     case WINED3DSPSM_X2NEG:
885         shader_addline(buffer, "ADD T%c, -%s, -%s;\n", 'A' + tmpreg, regstr, regstr);
886         break;
887     case WINED3DSPSM_DZ:
888         shader_addline(buffer, "RCP T%c, %s.z;\n", 'A' + tmpreg, regstr);
889         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
890         break;
891     case WINED3DSPSM_DW:
892         shader_addline(buffer, "RCP T%c, %s.w;\n", 'A' + tmpreg, regstr);
893         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
894         break;
895     case WINED3DSPSM_ABS:
896         if(ctx->target_version >= NV2) {
897             sprintf(outregstr, "|%s%s|", regstr, swzstr);
898             insert_line = 0;
899         } else {
900             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
901         }
902         break;
903     case WINED3DSPSM_ABSNEG:
904         if(ctx->target_version >= NV2) {
905             sprintf(outregstr, "-|%s%s|", regstr, swzstr);
906         } else {
907             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
908             sprintf(outregstr, "-T%c%s", 'A' + tmpreg, swzstr);
909         }
910         insert_line = 0;
911         break;
912     default:
913         sprintf(outregstr, "%s%s", regstr, swzstr);
914         insert_line = 0;
915     }
916
917     /* Return modified or original register, with swizzle */
918     if (insert_line)
919         sprintf(outregstr, "T%c%s", 'A' + tmpreg, swzstr);
920 }
921
922 static const char *shader_arb_get_modifier(const struct wined3d_shader_instruction *ins)
923 {
924     DWORD mod;
925     const char *ret = "";
926     if (!ins->dst_count) return "";
927
928     mod = ins->dst[0].modifiers;
929     if(mod & WINED3DSPDM_SATURATE) {
930         ret = "_SAT";
931         mod &= ~WINED3DSPDM_SATURATE;
932     }
933     if(mod & WINED3DSPDM_PARTIALPRECISION) {
934         FIXME("Unhandled modifier WINED3DSPDM_PARTIALPRECISION\n");
935         mod &= ~WINED3DSPDM_PARTIALPRECISION;
936     }
937     if(mod & WINED3DSPDM_MSAMPCENTROID) {
938         FIXME("Unhandled modifier WINED3DSPDM_MSAMPCENTROID\n");
939         mod &= ~WINED3DSPDM_MSAMPCENTROID;
940     }
941     if(mod) {
942         FIXME("Unknown modifiers 0x%08x\n", mod);
943     }
944     return ret;
945 }
946
947 static void pshader_hw_bem(const struct wined3d_shader_instruction *ins)
948 {
949     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
950     SHADER_BUFFER *buffer = ins->ctx->buffer;
951     char dst_name[50];
952     char src_name[2][50];
953     DWORD sampler_code = dst->reg.idx;
954
955     shader_arb_get_dst_param(ins, dst, dst_name);
956
957     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
958      *
959      * Keep in mind that src_name[1] can be "TB" and src_name[0] can be "TA" because modifiers like _x2 are valid
960      * with bem. So delay loading the first parameter until after the perturbation calculation which needs two
961      * temps is done.
962      */
963     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
964     shader_addline(buffer, "SWZ TA, bumpenvmat%d, x, z, 0, 0;\n", sampler_code);
965     shader_addline(buffer, "DP3 TC.r, TA, %s;\n", src_name[1]);
966     shader_addline(buffer, "SWZ TA, bumpenvmat%d, y, w, 0, 0;\n", sampler_code);
967     shader_addline(buffer, "DP3 TC.g, TA, %s;\n", src_name[1]);
968
969     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
970     shader_addline(buffer, "ADD %s, %s, TC;\n", dst_name, src_name[0]);
971 }
972
973 static void pshader_hw_cnd(const struct wined3d_shader_instruction *ins)
974 {
975     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
976     SHADER_BUFFER *buffer = ins->ctx->buffer;
977     char dst_name[50];
978     char src_name[3][50];
979     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
980             ins->ctx->reg_maps->shader_version.minor);
981
982     shader_arb_get_dst_param(ins, dst, dst_name);
983     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
984
985     /* The coissue flag changes the semantic of the cnd instruction in <= 1.3 shaders */
986     if (shader_version <= WINED3D_SHADER_VERSION(1, 3) && ins->coissue)
987     {
988         shader_addline(buffer, "MOV%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[1]);
989     } else {
990         shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
991         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
992         shader_addline(buffer, "ADD TA, -%s, coefdiv.x;\n", src_name[0]);
993         shader_addline(buffer, "CMP%s %s, TA, %s, %s;\n",
994                        shader_arb_get_modifier(ins), dst_name, src_name[1], src_name[2]);
995     }
996 }
997
998 static void pshader_hw_cmp(const struct wined3d_shader_instruction *ins)
999 {
1000     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1001     SHADER_BUFFER *buffer = ins->ctx->buffer;
1002     char dst_name[50];
1003     char src_name[3][50];
1004
1005     shader_arb_get_dst_param(ins, dst, dst_name);
1006
1007     /* Generate input register names (with modifiers) */
1008     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1009     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1010     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1011
1012     shader_addline(buffer, "CMP%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins), dst_name,
1013                    src_name[0], src_name[2], src_name[1]);
1014 }
1015
1016 /** Process the WINED3DSIO_DP2ADD instruction in ARB.
1017  * dst = dot2(src0, src1) + src2 */
1018 static void pshader_hw_dp2add(const struct wined3d_shader_instruction *ins)
1019 {
1020     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1021     SHADER_BUFFER *buffer = ins->ctx->buffer;
1022     char dst_name[50];
1023     char src_name[3][50];
1024     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1025
1026     shader_arb_get_dst_param(ins, dst, dst_name);
1027     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1028     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1029
1030     if(ctx->target_version >= NV3)
1031     {
1032         /* GL_NV_fragment_program2 has a 1:1 matching instruction */
1033         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1034         shader_addline(buffer, "DP2A%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1035                        dst_name, src_name[0], src_name[1], src_name[2]);
1036     }
1037     else if(ctx->target_version >= NV2)
1038     {
1039         /* dst.x = src2.?, src0.x, src1.x + src0.y * src1.y
1040          * dst.y = src2.?, src0.x, src1.z + src0.y * src1.w
1041          * dst.z = src2.?, src0.x, src1.x + src0.y * src1.y
1042          * dst.z = src2.?, src0.x, src1.z + src0.y * src1.w
1043          *
1044          * Make sure that src1.zw = src1.xy, then we get a classic dp2add
1045          *
1046          * .xyxy and other swizzles that we could get with this are not valid in
1047          * plain ARBfp, but luckily the NV extension grammar lifts this limitation.
1048          */
1049         struct wined3d_shader_src_param tmp_param = ins->src[1];
1050         DWORD swizzle = tmp_param.swizzle & 0xf; /* Selects .xy */
1051         tmp_param.swizzle = swizzle | (swizzle << 4); /* Creates .xyxy */
1052
1053         shader_arb_get_src_param(ins, &tmp_param, 1, src_name[1]);
1054
1055         shader_addline(buffer, "X2D%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1056                        dst_name, src_name[2], src_name[0], src_name[1]);
1057     }
1058     else
1059     {
1060         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1061         /* Emulate a DP2 with a DP3 and 0.0. Don't use the dest as temp register, it could be src[1] or src[2]
1062         * src_name[0] can be TA, but TA is a private temp for modifiers, so it is save to overwrite
1063         */
1064         shader_addline(buffer, "MOV TA, %s;\n", src_name[0]);
1065         shader_addline(buffer, "MOV TA.z, 0.0;\n");
1066         shader_addline(buffer, "DP3 TA, TA, %s;\n", src_name[1]);
1067         shader_addline(buffer, "ADD%s %s, TA, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[2]);
1068     }
1069 }
1070
1071 /* Map the opcode 1-to-1 to the GL code */
1072 static void shader_hw_map2gl(const struct wined3d_shader_instruction *ins)
1073 {
1074     SHADER_BUFFER *buffer = ins->ctx->buffer;
1075     const char *instruction;
1076     char arguments[256], dst_str[50];
1077     unsigned int i;
1078     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1079
1080     switch (ins->handler_idx)
1081     {
1082         case WINED3DSIH_ABS: instruction = "ABS"; break;
1083         case WINED3DSIH_ADD: instruction = "ADD"; break;
1084         case WINED3DSIH_CRS: instruction = "XPD"; break;
1085         case WINED3DSIH_DP3: instruction = "DP3"; break;
1086         case WINED3DSIH_DP4: instruction = "DP4"; break;
1087         case WINED3DSIH_DST: instruction = "DST"; break;
1088         case WINED3DSIH_EXP: instruction = "EX2"; break;
1089         case WINED3DSIH_EXPP: instruction = "EXP"; break;
1090         case WINED3DSIH_FRC: instruction = "FRC"; break;
1091         case WINED3DSIH_LIT: instruction = "LIT"; break;
1092         case WINED3DSIH_LOG: instruction = "LG2"; break;
1093         case WINED3DSIH_LOGP: instruction = "LOG"; break;
1094         case WINED3DSIH_LRP: instruction = "LRP"; break;
1095         case WINED3DSIH_MAD: instruction = "MAD"; break;
1096         case WINED3DSIH_MAX: instruction = "MAX"; break;
1097         case WINED3DSIH_MIN: instruction = "MIN"; break;
1098         case WINED3DSIH_MOV: instruction = "MOV"; break;
1099         case WINED3DSIH_MUL: instruction = "MUL"; break;
1100         case WINED3DSIH_POW: instruction = "POW"; break;
1101         case WINED3DSIH_SGE: instruction = "SGE"; break;
1102         case WINED3DSIH_SLT: instruction = "SLT"; break;
1103         case WINED3DSIH_SUB: instruction = "SUB"; break;
1104         case WINED3DSIH_MOVA:instruction = "ARR"; break;
1105         case WINED3DSIH_SGN: instruction = "SSG"; break;
1106         case WINED3DSIH_DSX: instruction = "DDX"; break;
1107         default: instruction = "";
1108             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1109             break;
1110     }
1111
1112     /* Note that shader_arb_add_dst_param() adds spaces. */
1113     arguments[0] = '\0';
1114     shader_arb_get_dst_param(ins, dst, dst_str);
1115     for (i = 0; i < ins->src_count; ++i)
1116     {
1117         char operand[100];
1118         strcat(arguments, ", ");
1119         shader_arb_get_src_param(ins, &ins->src[i], i, operand);
1120         strcat(arguments, operand);
1121     }
1122     shader_addline(buffer, "%s%s %s%s;\n", instruction, shader_arb_get_modifier(ins), dst_str, arguments);
1123 }
1124
1125 static void shader_hw_nop(const struct wined3d_shader_instruction *ins)
1126 {
1127     SHADER_BUFFER *buffer = ins->ctx->buffer;
1128     shader_addline(buffer, "NOP;\n");
1129 }
1130
1131 static void shader_hw_mov(const struct wined3d_shader_instruction *ins)
1132 {
1133     IWineD3DBaseShaderImpl *shader = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1134     BOOL pshader = shader_is_pshader_version(shader->baseShader.reg_maps.shader_version.type);
1135     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1136
1137     SHADER_BUFFER *buffer = ins->ctx->buffer;
1138     char src0_param[256];
1139
1140     if(ins->handler_idx == WINED3DSIH_MOVA) {
1141         struct wined3d_shader_src_param tmp_src = ins->src[0];
1142         char write_mask[6];
1143
1144         if(ctx->target_version >= NV2) {
1145             shader_hw_map2gl(ins);
1146             return;
1147         }
1148         tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1149         shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1150         shader_arb_get_write_mask(ins, &ins->dst[0], write_mask);
1151
1152         /* This implements the mova formula used in GLSL. The first two instructions
1153          * prepare the sign() part. Note that it is fine to have my_sign(0.0) = 1.0
1154          * in this case:
1155          * mova A0.x, 0.0
1156          *
1157          * A0.x = arl(floor(abs(0.0) + 0.5) * 1.0) = floor(0.5) = 0.0 since arl does a floor
1158          *
1159          * The ARL is performed when A0 is used - the requested component is read from A0_SHADOW into
1160          * A0.x. We can use the overwritten component of A0_shadow as temporary storage for the sign.
1161          */
1162         shader_addline(buffer, "SGE A0_SHADOW%s, %s, mova_const.y;\n", write_mask, src0_param);
1163         shader_addline(buffer, "MAD A0_SHADOW%s, A0_SHADOW, mova_const.z, -mova_const.w;\n", write_mask);
1164
1165         shader_addline(buffer, "ABS TA%s, %s;\n", write_mask, src0_param);
1166         shader_addline(buffer, "ADD TA%s, TA, mova_const.x;\n", write_mask);
1167         shader_addline(buffer, "FLR TA%s, TA;\n", write_mask);
1168         shader_addline(buffer, "MUL A0_SHADOW%s, TA, A0_SHADOW;\n", write_mask);
1169
1170         ((struct shader_arb_ctx_priv *)ins->ctx->backend_data)->addr_reg[0] = '\0';
1171     } else if (ins->ctx->reg_maps->shader_version.major == 1
1172           && !shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)
1173           && ins->dst[0].reg.type == WINED3DSPR_ADDR)
1174     {
1175         src0_param[0] = '\0';
1176         if (((IWineD3DVertexShaderImpl *)shader)->rel_offset)
1177         {
1178             shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1179             shader_addline(buffer, "ADD TA.x, %s, helper_const.z;\n", src0_param);
1180             shader_addline(buffer, "ARL A0.x, TA.x;\n");
1181         }
1182         else
1183         {
1184             /* Apple's ARB_vertex_program implementation does not accept an ARL source argument
1185              * with more than one component. Thus replicate the first source argument over all
1186              * 4 components. For example, .xyzw -> .x (or better: .xxxx), .zwxy -> .z, etc) */
1187             struct wined3d_shader_src_param tmp_src = ins->src[0];
1188             tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1189             shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1190             shader_addline(buffer, "ARL A0.x, %s;\n", src0_param);
1191         }
1192     }
1193     else if(ins->dst[0].reg.type == WINED3DSPR_COLOROUT && ins->dst[0].reg.idx == 0 && pshader)
1194     {
1195         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) shader;
1196         if(ctx->cur_ps_args->super.srgb_correction && ps->color0_mov)
1197         {
1198             shader_addline(buffer, "#mov handled in srgb write code\n");
1199             return;
1200         }
1201         shader_hw_map2gl(ins);
1202     }
1203     else
1204     {
1205         shader_hw_map2gl(ins);
1206     }
1207 }
1208
1209 static void pshader_hw_texkill(const struct wined3d_shader_instruction *ins)
1210 {
1211     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1212     SHADER_BUFFER *buffer = ins->ctx->buffer;
1213     char reg_dest[40];
1214
1215     /* No swizzles are allowed in d3d's texkill. PS 1.x ignores the 4th component as documented,
1216      * but >= 2.0 honors it(undocumented, but tested by the d3d9 testsuit)
1217      */
1218     shader_arb_get_dst_param(ins, dst, reg_dest);
1219
1220     if (ins->ctx->reg_maps->shader_version.major >= 2)
1221     {
1222         /* The arb backend doesn't claim ps 2.0 support, but try to eat what the app feeds to us */
1223         shader_arb_get_dst_param(ins, dst, reg_dest);
1224         shader_addline(buffer, "KIL %s;\n", reg_dest);
1225     } else {
1226         /* ARB fp doesn't like swizzles on the parameter of the KIL instruction. To mask the 4th component,
1227          * copy the register into our general purpose TMP variable, overwrite .w and pass TMP to KIL
1228          *
1229          * ps_1_3 shaders use the texcoord incarnation of the Tx register. ps_1_4 shaders can use the same,
1230          * or pass in any temporary register(in shader phase 2)
1231          */
1232         if(ins->ctx->reg_maps->shader_version.minor <= 3) {
1233             sprintf(reg_dest, "fragment.texcoord[%u]", dst->reg.idx);
1234         } else {
1235             shader_arb_get_dst_param(ins, dst, reg_dest);
1236         }
1237         shader_addline(buffer, "SWZ TA, %s, x, y, z, 1;\n", reg_dest);
1238         shader_addline(buffer, "KIL TA;\n");
1239     }
1240 }
1241
1242 static void pshader_hw_tex(const struct wined3d_shader_instruction *ins)
1243 {
1244     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1245     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1246     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1247     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1248             ins->ctx->reg_maps->shader_version.minor);
1249     BOOL projected = FALSE, bias = FALSE;
1250     struct wined3d_shader_src_param src;
1251
1252     char reg_dest[40];
1253     char reg_coord[40];
1254     DWORD reg_sampler_code;
1255
1256     /* All versions have a destination register */
1257     shader_arb_get_dst_param(ins, dst, reg_dest);
1258
1259     /* 1.0-1.4: Use destination register number as texture code.
1260        2.0+: Use provided sampler number as texure code. */
1261     if (shader_version < WINED3D_SHADER_VERSION(2,0))
1262         reg_sampler_code = dst->reg.idx;
1263     else
1264         reg_sampler_code = ins->src[1].reg.idx;
1265
1266     /* 1.0-1.3: Use the texcoord varying.
1267        1.4+: Use provided coordinate source register. */
1268     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1269         sprintf(reg_coord, "fragment.texcoord[%u]", reg_sampler_code);
1270     else {
1271         /* TEX is the only instruction that can handle DW and DZ natively */
1272         src = ins->src[0];
1273         if(src.modifiers == WINED3DSPSM_DW) src.modifiers = WINED3DSPSM_NONE;
1274         if(src.modifiers == WINED3DSPSM_DZ) src.modifiers = WINED3DSPSM_NONE;
1275         shader_arb_get_src_param(ins, &src, 0, reg_coord);
1276     }
1277
1278     /* projection flag:
1279      * 1.1, 1.2, 1.3: Use WINED3DTSS_TEXTURETRANSFORMFLAGS
1280      * 1.4: Use WINED3DSPSM_DZ or WINED3DSPSM_DW on src[0]
1281      * 2.0+: Use WINED3DSI_TEXLD_PROJECT on the opcode
1282      */
1283     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1284     {
1285         DWORD flags = 0;
1286         if(reg_sampler_code < MAX_TEXTURES) {
1287             flags = deviceImpl->stateBlock->textureState[reg_sampler_code][WINED3DTSS_TEXTURETRANSFORMFLAGS];
1288         }
1289         if (flags & WINED3DTTFF_PROJECTED) {
1290             projected = TRUE;
1291         }
1292     }
1293     else if (shader_version < WINED3D_SHADER_VERSION(2,0))
1294     {
1295         DWORD src_mod = ins->src[0].modifiers;
1296         if (src_mod == WINED3DSPSM_DZ) {
1297             /* TXP cannot handle DZ natively, so move the z coordinate to .w. reg_coord is a read-only
1298              * varying register, so we need a temp reg
1299              */
1300             shader_addline(ins->ctx->buffer, "SWZ TA, %s, x, y, z, z;\n", reg_coord);
1301             strcpy(reg_coord, "TA");
1302             projected = TRUE;
1303         } else if(src_mod == WINED3DSPSM_DW) {
1304             projected = TRUE;
1305         }
1306     } else {
1307         if (ins->flags & WINED3DSI_TEXLD_PROJECT) projected = TRUE;
1308         if (ins->flags & WINED3DSI_TEXLD_BIAS) bias = TRUE;
1309     }
1310     shader_hw_sample(ins, reg_sampler_code, reg_dest, reg_coord, projected, bias);
1311 }
1312
1313 static void pshader_hw_texcoord(const struct wined3d_shader_instruction *ins)
1314 {
1315     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1316     SHADER_BUFFER *buffer = ins->ctx->buffer;
1317     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1318             ins->ctx->reg_maps->shader_version.minor);
1319     char dst_str[50];
1320
1321     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1322     {
1323         DWORD reg = dst->reg.idx;
1324
1325         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1326         shader_addline(buffer, "MOV_SAT %s, fragment.texcoord[%u];\n", dst_str, reg);
1327     } else {
1328         char reg_src[40];
1329
1330         shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src);
1331         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1332         shader_addline(buffer, "MOV %s, %s;\n", dst_str, reg_src);
1333    }
1334 }
1335
1336 static void pshader_hw_texreg2ar(const struct wined3d_shader_instruction *ins)
1337 {
1338      SHADER_BUFFER *buffer = ins->ctx->buffer;
1339      IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1340      IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1341      DWORD flags;
1342
1343      DWORD reg1 = ins->dst[0].reg.idx;
1344      char dst_str[50];
1345      char src_str[50];
1346
1347      /* Note that texreg2ar treats Tx as a temporary register, not as a varying */
1348      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1349      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1350      /* Move .x first in case src_str is "TA" */
1351      shader_addline(buffer, "MOV TA.y, %s.x;\n", src_str);
1352      shader_addline(buffer, "MOV TA.x, %s.w;\n", src_str);
1353      flags = reg1 < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg1][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1354      shader_hw_sample(ins, reg1, dst_str, "TA", flags & WINED3DTTFF_PROJECTED, FALSE);
1355 }
1356
1357 static void pshader_hw_texreg2gb(const struct wined3d_shader_instruction *ins)
1358 {
1359      SHADER_BUFFER *buffer = ins->ctx->buffer;
1360
1361      DWORD reg1 = ins->dst[0].reg.idx;
1362      char dst_str[50];
1363      char src_str[50];
1364
1365      /* Note that texreg2gb treats Tx as a temporary register, not as a varying */
1366      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1367      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1368      shader_addline(buffer, "MOV TA.x, %s.y;\n", src_str);
1369      shader_addline(buffer, "MOV TA.y, %s.z;\n", src_str);
1370      shader_hw_sample(ins, reg1, dst_str, "TA", FALSE, FALSE);
1371 }
1372
1373 static void pshader_hw_texreg2rgb(const struct wined3d_shader_instruction *ins)
1374 {
1375     DWORD reg1 = ins->dst[0].reg.idx;
1376     char dst_str[50];
1377     char src_str[50];
1378
1379     /* Note that texreg2rg treats Tx as a temporary register, not as a varying */
1380     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1381     shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1382     shader_hw_sample(ins, reg1, dst_str, src_str, FALSE, FALSE);
1383 }
1384
1385 static void pshader_hw_texbem(const struct wined3d_shader_instruction *ins)
1386 {
1387     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1388     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1389     SHADER_BUFFER *buffer = ins->ctx->buffer;
1390     char reg_coord[40], dst_reg[50], src_reg[50];
1391     DWORD reg_dest_code;
1392
1393     /* All versions have a destination register. The Tx where the texture coordinates come
1394      * from is the varying incarnation of the texture register
1395      */
1396     reg_dest_code = dst->reg.idx;
1397     shader_arb_get_dst_param(ins, &ins->dst[0], dst_reg);
1398     shader_arb_get_src_param(ins, &ins->src[0], 0, src_reg);
1399     sprintf(reg_coord, "fragment.texcoord[%u]", reg_dest_code);
1400
1401     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1402      * The Tx in which the perturbation map is stored is the tempreg incarnation of the texture register
1403      *
1404      * GL_NV_fragment_program_option could handle this in one instruction via X2D:
1405      * X2D TA.xy, fragment.texcoord, T%u, bumpenvmat%u.xzyw
1406      *
1407      * However, the NV extensions are never enabled for <= 2.0 shaders because of the performance penalty that
1408      * comes with it, and texbem is an 1.x only instruction. No 1.x instruction forces us to enable the NV
1409      * extension.
1410      */
1411     shader_addline(buffer, "SWZ TB, bumpenvmat%d, x, z, 0, 0;\n", reg_dest_code);
1412     shader_addline(buffer, "DP3 TA.x, TB, %s;\n", src_reg);
1413     shader_addline(buffer, "SWZ TB, bumpenvmat%d, y, w, 0, 0;\n", reg_dest_code);
1414     shader_addline(buffer, "DP3 TA.y, TB, %s;\n", src_reg);
1415
1416     /* with projective textures, texbem only divides the static texture coord, not the displacement,
1417      * so we can't let the GL handle this.
1418      */
1419     if (((IWineD3DDeviceImpl*) This->baseShader.device)->stateBlock->textureState[reg_dest_code][WINED3DTSS_TEXTURETRANSFORMFLAGS]
1420             & WINED3DTTFF_PROJECTED) {
1421         shader_addline(buffer, "RCP TB.w, %s.w;\n", reg_coord);
1422         shader_addline(buffer, "MUL TB.xy, %s, TB.w;\n", reg_coord);
1423         shader_addline(buffer, "ADD TA.xy, TA, TB;\n");
1424     } else {
1425         shader_addline(buffer, "ADD TA.xy, TA, %s;\n", reg_coord);
1426     }
1427
1428     shader_hw_sample(ins, reg_dest_code, dst_reg, "TA", FALSE, FALSE);
1429
1430     if (ins->handler_idx == WINED3DSIH_TEXBEML)
1431     {
1432         /* No src swizzles are allowed, so this is ok */
1433         shader_addline(buffer, "MAD TA, %s.z, luminance%d.x, luminance%d.y;\n",
1434                        src_reg, reg_dest_code, reg_dest_code);
1435         shader_addline(buffer, "MUL %s, %s, TA;\n", dst_reg, dst_reg);
1436     }
1437 }
1438
1439 static void pshader_hw_texm3x2pad(const struct wined3d_shader_instruction *ins)
1440 {
1441     DWORD reg = ins->dst[0].reg.idx;
1442     SHADER_BUFFER *buffer = ins->ctx->buffer;
1443     char src0_name[50], dst_name[50];
1444     BOOL is_color;
1445     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1446
1447     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1448     /* The next instruction will be a texm3x2tex or texm3x2depth that writes to the uninitialized
1449      * T<reg+1> register. Use this register to store the calculated vector
1450      */
1451     tmp_reg.idx = reg + 1;
1452     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1453     shader_addline(buffer, "DP3 %s.x, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1454 }
1455
1456 static void pshader_hw_texm3x2tex(const struct wined3d_shader_instruction *ins)
1457 {
1458     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1459     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1460     DWORD flags;
1461     DWORD reg = ins->dst[0].reg.idx;
1462     SHADER_BUFFER *buffer = ins->ctx->buffer;
1463     char dst_str[50];
1464     char src0_name[50];
1465     char dst_reg[50];
1466     BOOL is_color;
1467
1468     /* We know that we're writing to the uninitialized T<reg> register, so use it for temporary storage */
1469     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1470
1471     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1472     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1473     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1474     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1475     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED, FALSE);
1476 }
1477
1478 static void pshader_hw_texm3x3pad(const struct wined3d_shader_instruction *ins)
1479 {
1480     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1481     DWORD reg = ins->dst[0].reg.idx;
1482     SHADER_BUFFER *buffer = ins->ctx->buffer;
1483     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1484     char src0_name[50], dst_name[50];
1485     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1486     BOOL is_color;
1487
1488     /* There are always 2 texm3x3pad instructions followed by one texm3x3[tex,vspec, ...] instruction, with
1489      * incrementing ins->dst[0].register_idx numbers. So the pad instruction already knows the final destination
1490      * register, and this register is uninitialized(otherwise the assembler complains that it is 'redeclared')
1491      */
1492     tmp_reg.idx = reg + 2 - current_state->current_row;
1493     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1494
1495     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1496     shader_addline(buffer, "DP3 %s%u.%c, fragment.texcoord[%u], %s;\n",
1497                    dst_name, tmp_reg.idx, 'x' + current_state->current_row, reg, src0_name);
1498     current_state->texcoord_w[current_state->current_row++] = reg;
1499 }
1500
1501 static void pshader_hw_texm3x3tex(const struct wined3d_shader_instruction *ins)
1502 {
1503     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1504     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1505     DWORD flags;
1506     DWORD reg = ins->dst[0].reg.idx;
1507     SHADER_BUFFER *buffer = ins->ctx->buffer;
1508     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1509     char dst_str[50];
1510     char src0_name[50], dst_name[50];
1511     BOOL is_color;
1512
1513     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
1514     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1515     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1516
1517     /* Sample the texture using the calculated coordinates */
1518     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1519     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1520     shader_hw_sample(ins, reg, dst_str, dst_name, flags & WINED3DTTFF_PROJECTED, FALSE);
1521     current_state->current_row = 0;
1522 }
1523
1524 static void pshader_hw_texm3x3vspec(const struct wined3d_shader_instruction *ins)
1525 {
1526     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1527     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1528     DWORD flags;
1529     DWORD reg = ins->dst[0].reg.idx;
1530     SHADER_BUFFER *buffer = ins->ctx->buffer;
1531     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1532     char dst_str[50];
1533     char src0_name[50];
1534     char dst_reg[8];
1535     BOOL is_color;
1536
1537     /* Get the dst reg without writemask strings. We know this register is uninitialized, so we can use all
1538      * components for temporary data storage
1539      */
1540     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1541     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1542     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1543
1544     /* Construct the eye-ray vector from w coordinates */
1545     shader_addline(buffer, "MOV TB.x, fragment.texcoord[%u].w;\n", current_state->texcoord_w[0]);
1546     shader_addline(buffer, "MOV TB.y, fragment.texcoord[%u].w;\n", current_state->texcoord_w[1]);
1547     shader_addline(buffer, "MOV TB.z, fragment.texcoord[%u].w;\n", reg);
1548
1549     /* Calculate reflection vector
1550      */
1551     shader_addline(buffer, "DP3 %s.w, %s, TB;\n", dst_reg, dst_reg);
1552     /* The .w is ignored when sampling, so I can use TB.w to calculate dot(N, N) */
1553     shader_addline(buffer, "DP3 TB.w, %s, %s;\n", dst_reg, dst_reg);
1554     shader_addline(buffer, "RCP TB.w, TB.w;\n");
1555     shader_addline(buffer, "MUL %s.w, %s.w, TB.w;\n", dst_reg, dst_reg);
1556     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
1557     shader_addline(buffer, "MAD %s, coefmul.x, %s, -TB;\n", dst_reg, dst_reg);
1558
1559     /* Sample the texture using the calculated coordinates */
1560     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1561     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1562     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED, FALSE);
1563     current_state->current_row = 0;
1564 }
1565
1566 static void pshader_hw_texm3x3spec(const struct wined3d_shader_instruction *ins)
1567 {
1568     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1569     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1570     DWORD flags;
1571     DWORD reg = ins->dst[0].reg.idx;
1572     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1573     SHADER_BUFFER *buffer = ins->ctx->buffer;
1574     char dst_str[50];
1575     char src0_name[50];
1576     char src1_name[50];
1577     char dst_reg[8];
1578     BOOL is_color;
1579
1580     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1581     shader_arb_get_src_param(ins, &ins->src[0], 1, src1_name);
1582     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1583     /* Note: dst_reg.xy is input here, generated by two texm3x3pad instructions */
1584     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1585
1586     /* Calculate reflection vector.
1587      *
1588      *                   dot(N, E)
1589      * dst_reg.xyz = 2 * --------- * N - E
1590      *                   dot(N, N)
1591      *
1592      * Which normalizes the normal vector
1593      */
1594     shader_addline(buffer, "DP3 %s.w, %s, %s;\n", dst_reg, dst_reg, src1_name);
1595     shader_addline(buffer, "DP3 TC.w, %s, %s;\n", dst_reg, dst_reg);
1596     shader_addline(buffer, "RCP TC.w, TC.w;\n");
1597     shader_addline(buffer, "MUL %s.w, %s.w, TC.w;\n", dst_reg, dst_reg);
1598     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
1599     shader_addline(buffer, "MAD %s, coefmul.x, %s, -%s;\n", dst_reg, dst_reg, src1_name);
1600
1601     /* Sample the texture using the calculated coordinates */
1602     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1603     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1604     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED, FALSE);
1605     current_state->current_row = 0;
1606 }
1607
1608 static void pshader_hw_texdepth(const struct wined3d_shader_instruction *ins)
1609 {
1610     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1611     SHADER_BUFFER *buffer = ins->ctx->buffer;
1612     char dst_name[50];
1613
1614     /* texdepth has an implicit destination, the fragment depth value. It's only parameter,
1615      * which is essentially an input, is the destination register because it is the first
1616      * parameter. According to the msdn, this must be register r5, but let's keep it more flexible
1617      * here(writemasks/swizzles are not valid on texdepth)
1618      */
1619     shader_arb_get_dst_param(ins, dst, dst_name);
1620
1621     /* According to the msdn, the source register(must be r5) is unusable after
1622      * the texdepth instruction, so we're free to modify it
1623      */
1624     shader_addline(buffer, "MIN %s.y, %s.y, one.y;\n", dst_name, dst_name);
1625
1626     /* How to deal with the special case dst_name.g == 0? if r != 0, then
1627      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
1628      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
1629      */
1630     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
1631     shader_addline(buffer, "MUL TA.x, %s.x, %s.y;\n", dst_name, dst_name);
1632     shader_addline(buffer, "MIN TA.x, TA.x, one.x;\n");
1633     shader_addline(buffer, "MAX result.depth, TA.x, 0.0;\n");
1634 }
1635
1636 /** Process the WINED3DSIO_TEXDP3TEX instruction in ARB:
1637  * Take a 3-component dot product of the TexCoord[dstreg] and src,
1638  * then perform a 1D texture lookup from stage dstregnum, place into dst. */
1639 static void pshader_hw_texdp3tex(const struct wined3d_shader_instruction *ins)
1640 {
1641     SHADER_BUFFER *buffer = ins->ctx->buffer;
1642     DWORD sampler_idx = ins->dst[0].reg.idx;
1643     char src0[50];
1644     char dst_str[50];
1645
1646     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
1647     shader_addline(buffer, "MOV TB, 0.0;\n");
1648     shader_addline(buffer, "DP3 TB.x, fragment.texcoord[%u], %s;\n", sampler_idx, src0);
1649
1650     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1651     shader_hw_sample(ins, sampler_idx, dst_str, "TB", FALSE /* Only one coord, can't be projected */, FALSE);
1652 }
1653
1654 /** Process the WINED3DSIO_TEXDP3 instruction in ARB:
1655  * Take a 3-component dot product of the TexCoord[dstreg] and src. */
1656 static void pshader_hw_texdp3(const struct wined3d_shader_instruction *ins)
1657 {
1658     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1659     char src0[50];
1660     char dst_str[50];
1661     SHADER_BUFFER *buffer = ins->ctx->buffer;
1662
1663     /* Handle output register */
1664     shader_arb_get_dst_param(ins, dst, dst_str);
1665     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
1666     shader_addline(buffer, "DP3 %s, fragment.texcoord[%u], %s;\n", dst_str, dst->reg.idx, src0);
1667 }
1668
1669 /** Process the WINED3DSIO_TEXM3X3 instruction in ARB
1670  * Perform the 3rd row of a 3x3 matrix multiply */
1671 static void pshader_hw_texm3x3(const struct wined3d_shader_instruction *ins)
1672 {
1673     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1674     SHADER_BUFFER *buffer = ins->ctx->buffer;
1675     char dst_str[50], dst_name[50];
1676     char src0[50];
1677     BOOL is_color;
1678
1679     shader_arb_get_dst_param(ins, dst, dst_str);
1680     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
1681     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
1682     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
1683     shader_addline(buffer, "MOV %s, %s;\n", dst_str, dst_name);
1684 }
1685
1686 /** Process the WINED3DSIO_TEXM3X2DEPTH instruction in ARB:
1687  * Last row of a 3x2 matrix multiply, use the result to calculate the depth:
1688  * Calculate tmp0.y = TexCoord[dstreg] . src.xyz;  (tmp0.x has already been calculated)
1689  * depth = (tmp0.y == 0.0) ? 1.0 : tmp0.x / tmp0.y
1690  */
1691 static void pshader_hw_texm3x2depth(const struct wined3d_shader_instruction *ins)
1692 {
1693     SHADER_BUFFER *buffer = ins->ctx->buffer;
1694     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1695     char src0[50], dst_name[50];
1696     BOOL is_color;
1697
1698     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
1699     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
1700     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
1701
1702     /* How to deal with the special case dst_name.g == 0? if r != 0, then
1703      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
1704      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
1705      */
1706     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
1707     shader_addline(buffer, "MUL %s.x, %s.x, %s.y;\n", dst_name, dst_name, dst_name);
1708     shader_addline(buffer, "MIN %s.x, %s.x, one.x;\n", dst_name, dst_name);
1709     shader_addline(buffer, "MAX result.depth, %s.x, 0.0;\n", dst_name);
1710 }
1711
1712 /** Handles transforming all WINED3DSIO_M?x? opcodes for
1713     Vertex/Pixel shaders to ARB_vertex_program codes */
1714 static void shader_hw_mnxn(const struct wined3d_shader_instruction *ins)
1715 {
1716     int i;
1717     int nComponents = 0;
1718     struct wined3d_shader_dst_param tmp_dst = {{0}};
1719     struct wined3d_shader_src_param tmp_src[2] = {{{0}}};
1720     struct wined3d_shader_instruction tmp_ins;
1721
1722     memset(&tmp_ins, 0, sizeof(tmp_ins));
1723
1724     /* Set constants for the temporary argument */
1725     tmp_ins.ctx = ins->ctx;
1726     tmp_ins.dst_count = 1;
1727     tmp_ins.dst = &tmp_dst;
1728     tmp_ins.src_count = 2;
1729     tmp_ins.src = tmp_src;
1730
1731     switch(ins->handler_idx)
1732     {
1733         case WINED3DSIH_M4x4:
1734             nComponents = 4;
1735             tmp_ins.handler_idx = WINED3DSIH_DP4;
1736             break;
1737         case WINED3DSIH_M4x3:
1738             nComponents = 3;
1739             tmp_ins.handler_idx = WINED3DSIH_DP4;
1740             break;
1741         case WINED3DSIH_M3x4:
1742             nComponents = 4;
1743             tmp_ins.handler_idx = WINED3DSIH_DP3;
1744             break;
1745         case WINED3DSIH_M3x3:
1746             nComponents = 3;
1747             tmp_ins.handler_idx = WINED3DSIH_DP3;
1748             break;
1749         case WINED3DSIH_M3x2:
1750             nComponents = 2;
1751             tmp_ins.handler_idx = WINED3DSIH_DP3;
1752             break;
1753         default:
1754             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1755             break;
1756     }
1757
1758     tmp_dst = ins->dst[0];
1759     tmp_src[0] = ins->src[0];
1760     tmp_src[1] = ins->src[1];
1761     for (i = 0; i < nComponents; i++) {
1762         tmp_dst.write_mask = WINED3DSP_WRITEMASK_0 << i;
1763         shader_hw_map2gl(&tmp_ins);
1764         ++tmp_src[1].reg.idx;
1765     }
1766 }
1767
1768 static void shader_hw_rsq_rcp(const struct wined3d_shader_instruction *ins)
1769 {
1770     SHADER_BUFFER *buffer = ins->ctx->buffer;
1771     const char *instruction;
1772
1773     char dst[50];
1774     char src[50];
1775
1776     switch(ins->handler_idx)
1777     {
1778         case WINED3DSIH_RSQ: instruction = "RSQ"; break;
1779         case WINED3DSIH_RCP: instruction = "RCP"; break;
1780         default: instruction = "";
1781             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1782             break;
1783     }
1784
1785     shader_arb_get_dst_param(ins, &ins->dst[0], dst); /* Destination */
1786     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
1787     if (ins->src[0].swizzle == WINED3DSP_NOSWIZZLE)
1788     {
1789         /* Dx sdk says .x is used if no swizzle is given, but our test shows that
1790          * .w is used
1791          */
1792         strcat(src, ".w");
1793     }
1794
1795     shader_addline(buffer, "%s%s %s, %s;\n", instruction, shader_arb_get_modifier(ins), dst, src);
1796 }
1797
1798 static void shader_hw_nrm(const struct wined3d_shader_instruction *ins)
1799 {
1800     SHADER_BUFFER *buffer = ins->ctx->buffer;
1801     char dst_name[50];
1802     char src_name[50];
1803     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1804     BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
1805
1806     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
1807     shader_arb_get_src_param(ins, &ins->src[0], 1 /* Use TB */, src_name);
1808
1809     if(pshader && priv->target_version >= NV3)
1810     {
1811         shader_addline(buffer, "NRM%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
1812     }
1813     else
1814     {
1815         shader_addline(buffer, "DP3 TA, %s, %s;\n", src_name, src_name);
1816         shader_addline(buffer, "RSQ TA, TA.x;\n");
1817         /* dst.w = src[0].w * 1 / (src.x^2 + src.y^2 + src.z^2)^(1/2) according to msdn*/
1818         shader_addline(buffer, "MUL%s %s, %s, TA;\n", shader_arb_get_modifier(ins), dst_name,
1819                     src_name);
1820     }
1821 }
1822
1823 static void shader_hw_lrp(const struct wined3d_shader_instruction *ins)
1824 {
1825     SHADER_BUFFER *buffer = ins->ctx->buffer;
1826     char dst_name[50];
1827     char src_name[3][50];
1828
1829     /* ARB_fragment_program has a convenient LRP instruction */
1830     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
1831         shader_hw_map2gl(ins);
1832         return;
1833     }
1834
1835     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
1836     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1837     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1838     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1839
1840     shader_addline(buffer, "SUB TA, %s, %s;\n", src_name[1], src_name[2]);
1841     shader_addline(buffer, "MAD%s %s, %s, TA, %s;\n", shader_arb_get_modifier(ins),
1842                    dst_name, src_name[0], src_name[2]);
1843 }
1844
/* Emits code for the d3d sincos instruction.
 *
 * Three code paths exist:
 *  - pixel shaders: a single SCS instruction on the destination parameter;
 *  - other shaders with target_version >= NV2: COS into .x and/or SIN into .y of the
 *    destination register, depending on the write mask;
 *  - everything else: a Taylor series evaluated with the helper constants the
 *    application passes in src1 and src2, using TA and the destination register as
 *    scratch space.
 */
static void shader_hw_sincos(const struct wined3d_shader_instruction *ins)
{
    /* This instruction exists in ARB, but the d3d instruction takes two extra parameters which
     * must contain fixed constants. So we need a separate function to filter those constants and
     * can't use map2gl
     */
    SHADER_BUFFER *buffer = ins->ctx->buffer;
    struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
    const struct wined3d_shader_dst_param *dst = &ins->dst[0];
    char dst_name[50];
    char src_name0[50], src_name1[50], src_name2[50];
    BOOL is_color;

    shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
    if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
        /* Pixel shader: SCS produces both results; src1 and src2 are not read at all */
        shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
        shader_addline(buffer, "SCS%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name,
                       src_name0);
    } else if(priv->target_version >= NV2) {
        /* The bare register name is needed because the component is appended explicitly below */
        shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);

        /* Sincos writemask must be .x, .y or .xy */
        if(dst->write_mask & WINED3DSP_WRITEMASK_0)
            shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
        if(dst->write_mask & WINED3DSP_WRITEMASK_1)
            shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
    } else {
        /* Approximate sine and cosine with a taylor series, as per math textbook. The application passes 8
         * helper constants(D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2.
         *
         * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
         * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
         *
         * The constants we get are:
         *
         *  +1   +1,     -1     -1     +1      +1      -1       -1
         *      ---- ,  ---- , ---- , ----- , ----- , ----- , ------
         *      1!*2    2!*4   3!*8   4!*16   5!*32   6!*64   7!*128
         *
         * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2):
         *
         * (x/2)^2 = x^2 / 4
         * (x/2)^3 = x^3 / 8
         * (x/2)^4 = x^4 / 16
         * (x/2)^5 = x^5 / 32
         * etc
         *
         * To get the final result:
         * sin(x) = 2 * sin(x/2) * cos(x/2)
         * cos(x) = cos(x/2)^2 - sin(x/2)^2
         * (from sin(x+y) and cos(x+y) rules)
         *
         * As per MSDN, dst.z is undefined after the operation, and so is
         * dst.x and dst.y if they're masked out by the writemask. Ie
         * sincos dst.y, src1, c0, c1
         * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler
         * vsa.exe also stops with an error if the dest register is the same register as the source
         * register. This means we can use dest.xyz as temporary storage. The assembler vsa.exe output also
         * indicates that sincos consumes 8 instruction slots in vs_2_0(and, strangely, in vs_3_0).
         */
        shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
        shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2);
        shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);

        /* Powers of x: the even ones land in dst.xyz, the odd ones in TA.yzw */
        shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0);  /* x ^ 2 */
        shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0);           /* x ^ 3 */
        shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0);           /* x ^ 4 */
        shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0);           /* x ^ 5 */
        shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0);           /* x ^ 6 */
        shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0);           /* x ^ 7 */

        /* sin(x/2)
         *
         * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to
         * properly merge that with MULs in the code above?
         * The swizzles .yz and xw however fit into the .yzxw swizzle added to ps_2_0. Maybe
         * we can merge the sine and cosine MAD rows to calculate them together.
         */
        shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */
        shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */
        shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */
        shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */

        /* cos(x/2). TA.y held x^3 until here; it has been consumed by the sine rows above
         * and is now reused as the cosine accumulator */
        shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */
        shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */
        shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */

        if(dst->write_mask & WINED3DSP_WRITEMASK_0) {
            /* cos x = cos(x/2)^2 - sin(x/2)^2 */
            shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n");
            shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name);
        }
        if(dst->write_mask & WINED3DSP_WRITEMASK_1) {
            /* sin x = 2 * sin(x/2) * cos(x/2); the doubling is done by adding the product to itself */
            shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name);
            shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name);
        }
    }
}
1945
1946 /* GL locking is done by the caller */
1947 static void shader_hw_sgn(const struct wined3d_shader_instruction *ins)
1948 {
1949     SHADER_BUFFER *buffer = ins->ctx->buffer;
1950     char dst_name[50];
1951     char src_name[50];
1952     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1953
1954     /* SGN is only valid in vertex shaders */
1955     if(ctx->target_version == NV2) {
1956         shader_hw_map2gl(ins);
1957         return;
1958     }
1959     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
1960     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
1961
1962     FIXME("Emulated SGN untested\n");
1963     /* If SRC > 0.0, -SRC < SRC = TRUE, otherwise false.
1964      * if SRC < 0.0,  SRC < -SRC = TRUE. If neither is true, src = 0.0
1965      */
1966     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) {
1967         shader_addline(buffer, "SLT %s, -%s, %s;\n", dst_name, src_name, src_name);
1968     } else {
1969         shader_addline(buffer, "SLT TB, -%s, %s;\n", src_name, src_name);
1970         shader_addline(buffer, "SLT TC,  %s, -%s;\n", src_name, src_name);
1971         shader_addline(buffer, "ADD %s, TB, -TC;\n", dst_name);
1972     }
1973 }
1974
1975 static GLuint create_arb_blt_vertex_program(const WineD3D_GL_Info *gl_info)
1976 {
1977     GLuint program_id = 0;
1978     const char *blt_vprogram =
1979         "!!ARBvp1.0\n"
1980         "PARAM c[1] = { { 1, 0.5 } };\n"
1981         "MOV result.position, vertex.position;\n"
1982         "MOV result.color, c[0].x;\n"
1983         "MOV result.texcoord[0], vertex.texcoord[0];\n"
1984         "END\n";
1985
1986     GL_EXTCALL(glGenProgramsARB(1, &program_id));
1987     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, program_id));
1988     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(blt_vprogram), blt_vprogram));
1989
1990     if (glGetError() == GL_INVALID_OPERATION) {
1991         GLint pos;
1992         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
1993         FIXME("Vertex program error at position %d: %s\n", pos,
1994             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
1995     }
1996
1997     return program_id;
1998 }
1999
2000 /* GL locking is done by the caller */
2001 static GLuint create_arb_blt_fragment_program(const WineD3D_GL_Info *gl_info, enum tex_types tex_type)
2002 {
2003     GLuint program_id = 0;
2004     static const char * const blt_fprograms[tex_type_count] =
2005     {
2006         /* tex_1d */
2007         NULL,
2008         /* tex_2d */
2009         "!!ARBfp1.0\n"
2010         "TEMP R0;\n"
2011         "TEX R0.x, fragment.texcoord[0], texture[0], 2D;\n"
2012         "MOV result.depth.z, R0.x;\n"
2013         "END\n",
2014         /* tex_3d */
2015         NULL,
2016         /* tex_cube */
2017         "!!ARBfp1.0\n"
2018         "TEMP R0;\n"
2019         "TEX R0.x, fragment.texcoord[0], texture[0], CUBE;\n"
2020         "MOV result.depth.z, R0.x;\n"
2021         "END\n",
2022         /* tex_rect */
2023         "!!ARBfp1.0\n"
2024         "TEMP R0;\n"
2025         "TEX R0.x, fragment.texcoord[0], texture[0], RECT;\n"
2026         "MOV result.depth.z, R0.x;\n"
2027         "END\n",
2028     };
2029
2030     if (!blt_fprograms[tex_type])
2031     {
2032         FIXME("tex_type %#x not supported\n", tex_type);
2033         tex_type = tex_2d;
2034     }
2035
2036     GL_EXTCALL(glGenProgramsARB(1, &program_id));
2037     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, program_id));
2038     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(blt_fprograms[tex_type]), blt_fprograms[tex_type]));
2039
2040     if (glGetError() == GL_INVALID_OPERATION) {
2041         GLint pos;
2042         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
2043         FIXME("Fragment program error at position %d: %s\n", pos,
2044             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2045     }
2046
2047     return program_id;
2048 }
2049
2050 static void arbfp_add_sRGB_correction(SHADER_BUFFER *buffer, const char *fragcolor, const char *tmp1,
2051                                       const char *tmp2, const char *tmp3, const char *tmp4, BOOL condcode) {
2052     /* Perform sRGB write correction. See GLX_EXT_framebuffer_sRGB */
2053
2054     if(condcode)
2055     {
2056         /* Sigh. MOVC CC doesn't work, so use one of the temps as dummy dest */
2057         shader_addline(buffer, "SUBC %s, %s.x, srgb_consts1.y;\n", tmp1, fragcolor);
2058         /* Calculate the > 0.0031308 case */
2059         shader_addline(buffer, "POW %s.x (GE), %s.x, srgb_consts1.z;\n", fragcolor, fragcolor);
2060         shader_addline(buffer, "POW %s.y (GE), %s.y, srgb_consts1.z;\n", fragcolor, fragcolor);
2061         shader_addline(buffer, "POW %s.z (GE), %s.z, srgb_consts1.z;\n", fragcolor, fragcolor);
2062         shader_addline(buffer, "MUL %s.xyz (GE), %s, srgb_consts1.w;\n", fragcolor, fragcolor);
2063         shader_addline(buffer, "SUB %s.xyz (GE), %s, srgb_consts2.x;\n", fragcolor, fragcolor);
2064         /* Calculate the < case */
2065         shader_addline(buffer, "MUL %s.xyz (LT), srgb_consts1.x, %s;\n", fragcolor, fragcolor);
2066     }
2067     else
2068     {
2069         /* Calculate the > 0.0031308 case */
2070         shader_addline(buffer, "POW %s.x, %s.x, srgb_consts1.z;\n", tmp1, fragcolor);
2071         shader_addline(buffer, "POW %s.y, %s.y, srgb_consts1.z;\n", tmp1, fragcolor);
2072         shader_addline(buffer, "POW %s.z, %s.z, srgb_consts1.z;\n", tmp1, fragcolor);
2073         shader_addline(buffer, "MUL %s, %s, srgb_consts1.w;\n", tmp1, tmp1);
2074         shader_addline(buffer, "SUB %s, %s, srgb_consts2.x;\n", tmp1, tmp1);
2075         /* Calculate the < case */
2076         shader_addline(buffer, "MUL %s, srgb_consts1.x, %s;\n", tmp2, fragcolor);
2077         /* Get 1.0 / 0.0 masks for > 0.0031308 and < 0.0031308 */
2078         shader_addline(buffer, "SLT %s, srgb_consts1.y, %s;\n", tmp3, fragcolor);
2079         shader_addline(buffer, "SGE %s, srgb_consts1.y, %s;\n", tmp4, fragcolor);
2080         /* Store the components > 0.0031308 in the destination */
2081         shader_addline(buffer, "MUL %s.xyz, %s, %s;\n", fragcolor, tmp1, tmp3);
2082         /* Add the components that are < 0.0031308 */
2083         shader_addline(buffer, "MAD %s.xyz, %s, %s, %s;\n", fragcolor, tmp2, tmp4, fragcolor);
2084         /* Move everything into result.color at once. Nvidia hardware cannot handle partial
2085         * result.color writes(.rgb first, then .a), or handle overwriting already written
2086         * components. The assembler uses a temporary register in this case, which is usually
2087         * not allocated from one of our registers that were used earlier.
2088         */
2089     }
2090     shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
2091     /* [0.0;1.0] clamping. Not needed, this is done implicitly */
2092 }
2093
2094 /* GL locking is done by the caller */
2095 static GLuint shader_arb_generate_pshader(IWineD3DPixelShaderImpl *This,
2096         SHADER_BUFFER *buffer, const struct arb_ps_compile_args *args)
2097 {
2098     const shader_reg_maps* reg_maps = &This->baseShader.reg_maps;
2099     CONST DWORD *function = This->baseShader.function;
2100     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)This->baseShader.device)->adapter->gl_info;
2101     const local_constant *lconst;
2102     GLuint retval;
2103     char fragcolor[16];
2104     DWORD *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This);
2105     struct shader_arb_ctx_priv priv_ctx;
2106     BOOL dcl_tmp = args->super.srgb_correction, dcl_td = FALSE;
2107     BOOL want_nv_prog = FALSE;
2108
2109     char srgbtmp[4][4];
2110     unsigned int i, found = 0;
2111
2112     for(i = 0; i < This->baseShader.limits.temporary; i++) {
2113
2114         /* Don't overwrite the color source */
2115         if(This->color0_mov && i == This->color0_reg) continue;
2116         else if(reg_maps->shader_version.major < 2 && i == 0) continue;
2117
2118         if(reg_maps->temporary[i]) {
2119             sprintf(srgbtmp[found], "R%u", i);
2120             found++;
2121             if(found == 4) break;
2122         }
2123     }
2124
2125     switch(found) {
2126         case 4: dcl_tmp = FALSE; break;
2127         case 0:
2128             sprintf(srgbtmp[0], "TA");
2129             sprintf(srgbtmp[1], "TB");
2130             sprintf(srgbtmp[2], "TC");
2131             sprintf(srgbtmp[3], "TD");
2132             dcl_td = TRUE;
2133             break;
2134         case 1:
2135             sprintf(srgbtmp[1], "TA");
2136             sprintf(srgbtmp[2], "TB");
2137             sprintf(srgbtmp[3], "TC");
2138             break;
2139         case 2:
2140             sprintf(srgbtmp[2], "TA");
2141             sprintf(srgbtmp[3], "TB");
2142             break;
2143         case 3:
2144             sprintf(srgbtmp[3], "TA");
2145             break;
2146     }
2147
2148     /*  Create the hw ARB shader */
2149     memset(&priv_ctx, 0, sizeof(priv_ctx));
2150     priv_ctx.cur_ps_args = args;
2151     list_init(&priv_ctx.if_frames);
2152
2153     /* Avoid enabling NV_fragment_program* if we do not need it.
2154      *
2155      * Enabling GL_NV_fragment_program_option causes the driver to occupy a temporary register,
2156      * and it slows down the shader execution noticeably(about 5%). Usually our instruction emulation
2157      * is faster than what we gain from using higher native instructions. There are some things though
2158      * that cannot be emulated. In that case enable the extensions.
2159      * If the extension is enabled, instruction handlers that support both ways will use it.
2160      *
2161      * Testing shows no performance difference between OPTION NV_fragment_program2 and NV_fragment_program.
2162      * So enable the best we can get.
2163      */
2164     if(reg_maps->usesdsx || reg_maps->usesdsy || reg_maps->loop_depth > 0)
2165     {
2166         want_nv_prog = TRUE;
2167     }
2168
2169     shader_addline(buffer, "!!ARBfp1.0\n");
2170     if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM2)) {
2171         shader_addline(buffer, "OPTION NV_fragment_program2;\n");
2172         priv_ctx.target_version = NV3;
2173     } else if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION)) {
2174         shader_addline(buffer, "OPTION NV_fragment_program;\n");
2175         priv_ctx.target_version = NV2;
2176     } else {
2177         if(want_nv_prog)
2178         {
2179             /* This is an error - either we're advertising the wrong shader version, or aren't enforcing some
2180              * limits properly
2181              */
2182             ERR("The shader requires instructions that are not available in plain GL_ARB_fragment_program\n");
2183             ERR("Try GLSL\n");
2184         }
2185         priv_ctx.target_version = ARB;
2186     }
2187
2188     if (reg_maps->shader_version.major < 3)
2189     {
2190         switch(args->super.fog) {
2191             case FOG_OFF:
2192                 break;
2193             case FOG_LINEAR:
2194                 shader_addline(buffer, "OPTION ARB_fog_linear;\n");
2195                 break;
2196             case FOG_EXP:
2197                 shader_addline(buffer, "OPTION ARB_fog_exp;\n");
2198                 break;
2199             case FOG_EXP2:
2200                 shader_addline(buffer, "OPTION ARB_fog_exp2;\n");
2201                 break;
2202         }
2203     }
2204
2205     /* For now always declare the temps. At least the Nvidia assembler optimizes completely
2206      * unused temps away(but occupies them for the whole shader if they're used once). Always
2207      * declaring them avoids tricky bookkeeping work
2208      */
2209     shader_addline(buffer, "TEMP TA;\n");      /* Used for modifiers */
2210     shader_addline(buffer, "TEMP TB;\n");      /* Used for modifiers */
2211     shader_addline(buffer, "TEMP TC;\n");      /* Used for modifiers */
2212     if(dcl_td) shader_addline(buffer, "TEMP TD;\n"); /* Used for sRGB writing */
2213     shader_addline(buffer, "PARAM coefdiv = { 0.5, 0.25, 0.125, 0.0625 };\n");
2214     shader_addline(buffer, "PARAM coefmul = { 2, 4, 8, 16 };\n");
2215     shader_addline(buffer, "PARAM one = { 1.0, 1.0, 1.0, 1.0 };\n");
2216
2217     if (reg_maps->shader_version.major < 2)
2218     {
2219         strcpy(fragcolor, "R0");
2220     } else {
2221         if(args->super.srgb_correction) {
2222             if(This->color0_mov) {
2223                 sprintf(fragcolor, "R%u", This->color0_reg);
2224             } else {
2225                 shader_addline(buffer, "TEMP TMP_COLOR;\n");
2226                 strcpy(fragcolor, "TMP_COLOR");
2227             }
2228         } else {
2229             strcpy(fragcolor, "result.color");
2230         }
2231     }
2232
2233     if(args->super.srgb_correction) {
2234         shader_addline(buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
2235                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
2236         shader_addline(buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
2237                        srgb_sub_high, 0.0, 0.0, 0.0);
2238     }
2239
2240     /* Base Declarations */
2241     shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION, lconst_map);
2242
2243     /* Base Shader Body */
2244     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
2245
2246     if(args->super.srgb_correction) {
2247         arbfp_add_sRGB_correction(buffer, fragcolor, srgbtmp[0], srgbtmp[1], srgbtmp[2], srgbtmp[3],
2248                                   priv_ctx.target_version >= NV2);
2249     } else if(reg_maps->shader_version.major < 2) {
2250         shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
2251     }
2252     shader_addline(buffer, "END\n");
2253
2254     /* TODO: change to resource.glObjectHandle or something like that */
2255     GL_EXTCALL(glGenProgramsARB(1, &retval));
2256
2257     TRACE("Creating a hw pixel shader, prg=%d\n", retval);
2258     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, retval));
2259
2260     TRACE("Created hw pixel shader, prg=%d\n", retval);
2261     /* Create the program and check for errors */
2262     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
2263                buffer->bsize, buffer->buffer));
2264
2265     if (glGetError() == GL_INVALID_OPERATION) {
2266         GLint errPos;
2267         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
2268         FIXME("HW PixelShader Error at position %d: %s\n",
2269               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2270         retval = 0;
2271     }
2272
2273     /* Load immediate constants */
2274     if(lconst_map) {
2275         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
2276             const float *value = (const float *)lconst->value;
2277             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, lconst_map[lconst->idx], value));
2278             checkGLcall("glProgramLocalParameter4fvARB");
2279         }
2280         HeapFree(GetProcessHeap(), 0, lconst_map);
2281     }
2282
2283     return retval;
2284 }
2285
2286 /* GL locking is done by the caller */
2287 static GLuint shader_arb_generate_vshader(IWineD3DVertexShaderImpl *This,
2288         SHADER_BUFFER *buffer, const struct arb_vs_compile_args *args)
2289 {
2290     const shader_reg_maps *reg_maps = &This->baseShader.reg_maps;
2291     CONST DWORD *function = This->baseShader.function;
2292     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)This->baseShader.device;
2293     const WineD3D_GL_Info *gl_info = &device->adapter->gl_info;
2294     const local_constant *lconst;
2295     GLuint ret;
2296     DWORD *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This);
2297     struct shader_arb_ctx_priv priv_ctx;
2298     unsigned int i;
2299
2300     memset(&priv_ctx, 0, sizeof(priv_ctx));
2301     priv_ctx.cur_vs_args = args;
2302     list_init(&priv_ctx.if_frames);
2303
2304     /*  Create the hw ARB shader */
2305     shader_addline(buffer, "!!ARBvp1.0\n");
2306
2307     /* Always enable the NV extension if available. Unlike fragment shaders, there is no
2308      * mesurable performance penalty, and we can always make use of it for clipplanes.
2309      */
2310     if(GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) {
2311         shader_addline(buffer, "OPTION NV_vertex_program2;\n");
2312         priv_ctx.target_version = NV2;
2313     } else {
2314         priv_ctx.target_version = ARB;
2315     }
2316
2317     shader_addline(buffer, "TEMP TMP_OUT;\n");
2318     if(need_helper_const(gl_info)) {
2319         shader_addline(buffer, "PARAM helper_const = { 2.0, -1.0, %d.0, 0.0 };\n", This->rel_offset);
2320     }
2321     if(need_mova_const((IWineD3DBaseShader *) This, gl_info)) {
2322         shader_addline(buffer, "PARAM mova_const = { 0.5, 0.0, 2.0, 1.0 };\n");
2323         shader_addline(buffer, "TEMP A0_SHADOW;\n");
2324     }
2325
2326     shader_addline(buffer, "TEMP TA;\n");
2327
2328     /* Base Declarations */
2329     shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION, lconst_map);
2330
2331     /* We need a constant to fixup the final position */
2332     shader_addline(buffer, "PARAM posFixup = program.env[%d];\n", ARB_SHADER_PRIVCONST_POS);
2333
2334     /* Initialize output parameters. GL_ARB_vertex_program does not require special initialization values
2335      * for output parameters. D3D in theory does not do that either, but some applications depend on a
2336      * proper initialization of the secondary color, and programs using the fixed function pipeline without
2337      * a replacement shader depend on the texcoord.w being set properly.
2338      *
2339      * GL_NV_vertex_program defines that all output values are initialized to {0.0, 0.0, 0.0, 1.0}. This
2340      * assertion is in effect even when using GL_ARB_vertex_program without any NV specific additions. So
2341      * skip this if NV_vertex_program is supported. Otherwise, initialize the secondary color. For the tex-
2342      * coords, we have a flag in the opengl caps. Many cards do not require the texcoord being set, and
2343      * this can eat a number of instructions, so skip it unless this cap is set as well
2344      */
2345     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)) {
2346         shader_addline(buffer, "MOV result.color.secondary, -helper_const.wwwy;\n");
2347
2348         if((GLINFO_LOCATION).set_texcoord_w && !device->frag_pipe->ffp_proj_control) {
2349             int i;
2350             for(i = 0; i < min(8, MAX_REG_TEXCRD); i++) {
2351                 if(This->baseShader.reg_maps.texcoord_mask[i] != 0 &&
2352                 This->baseShader.reg_maps.texcoord_mask[i] != WINED3DSP_WRITEMASK_ALL) {
2353                     shader_addline(buffer, "MOV result.texcoord[%u].w, -helper_const.y;\n", i);
2354                 }
2355             }
2356         }
2357     }
2358
2359     /* Base Shader Body */
2360     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
2361
2362     /* The D3DRS_FOGTABLEMODE render state defines if the shader-generated fog coord is used
2363      * or if the fragment depth is used. If the fragment depth is used(FOGTABLEMODE != NONE),
2364      * the fog frag coord is thrown away. If the fog frag coord is used, but not written by
2365      * the shader, it is set to 0.0(fully fogged, since start = 1.0, end = 0.0)
2366      */
2367     if(args->super.fog_src == VS_FOG_Z) {
2368         shader_addline(buffer, "MOV result.fogcoord, TMP_OUT.z;\n");
2369     } else if (!reg_maps->fog) {
2370         /* posFixup.x is always 1.0, so we can savely use it */
2371         shader_addline(buffer, "ADD result.fogcoord, posFixup.x, -posFixup.x;\n");
2372     }
2373
2374     /* Write the final position.
2375      *
2376      * OpenGL coordinates specify the center of the pixel while d3d coords specify
2377      * the corner. The offsets are stored in z and w in posFixup. posFixup.y contains
2378      * 1.0 or -1.0 to turn the rendering upside down for offscreen rendering. PosFixup.x
2379      * contains 1.0 to allow a mad, but arb vs swizzles are too restricted for that.
2380      */
2381     shader_addline(buffer, "MUL TA, posFixup, TMP_OUT.w;\n");
2382     shader_addline(buffer, "ADD TMP_OUT.x, TMP_OUT.x, TA.z;\n");
2383     shader_addline(buffer, "MAD TMP_OUT.y, TMP_OUT.y, posFixup.y, TA.w;\n");
2384
2385     if(priv_ctx.target_version >= NV2)
2386     {
2387         for(i = 0; i < GL_LIMITS(clipplanes); i++)
2388         {
2389             shader_addline(buffer, "DP4 result.clip[%u].x, TMP_OUT, state.clip[%u].plane;\n", i, i);
2390         }
2391     }
2392
2393     /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection in state.c
2394      * and the glsl equivalent
2395      */
2396     if(need_helper_const(gl_info)) {
2397         shader_addline(buffer, "MAD TMP_OUT.z, TMP_OUT.z, helper_const.x, -TMP_OUT.w;\n");
2398     } else {
2399         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, TMP_OUT.z;\n");
2400         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, -TMP_OUT.w;\n");
2401     }
2402
2403     shader_addline(buffer, "MOV result.position, TMP_OUT;\n");
2404
2405     shader_addline(buffer, "END\n");
2406
2407     /* TODO: change to resource.glObjectHandle or something like that */
2408     GL_EXTCALL(glGenProgramsARB(1, &ret));
2409
2410     TRACE("Creating a hw vertex shader, prg=%d\n", ret);
2411     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, ret));
2412
2413     TRACE("Created hw vertex shader, prg=%d\n", ret);
2414     /* Create the program and check for errors */
2415     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
2416                buffer->bsize, buffer->buffer));
2417
2418     if (glGetError() == GL_INVALID_OPERATION) {
2419         GLint errPos;
2420         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
2421         FIXME("HW VertexShader Error at position %d: %s\n",
2422               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2423         ret = -1;
2424     } else {
2425         /* Load immediate constants */
2426         if(lconst_map) {
2427             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
2428                 const float *value = (const float *)lconst->value;
2429                 GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, lconst_map[lconst->idx], value));
2430             }
2431         }
2432     }
2433     HeapFree(GetProcessHeap(), 0, lconst_map);
2434
2435     return ret;
2436 }
2437
2438 /* GL locking is done by the caller */
2439 static GLuint find_arb_pshader(IWineD3DPixelShaderImpl *shader, const struct arb_ps_compile_args *args)
2440 {
2441     UINT i;
2442     DWORD new_size;
2443     struct arb_ps_compiled_shader *new_array;
2444     SHADER_BUFFER buffer;
2445     struct arb_pshader_private *shader_data;
2446     GLuint ret;
2447
2448     if(!shader->backend_priv) {
2449         shader->backend_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
2450     }
2451     shader_data = shader->backend_priv;
2452
2453     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
2454      * so a linear search is more performant than a hashmap or a binary search
2455      * (cache coherency etc)
2456      */
2457     for(i = 0; i < shader_data->num_gl_shaders; i++) {
2458         if(memcmp(&shader_data->gl_shaders[i].args, args, sizeof(*args)) == 0) {
2459             return shader_data->gl_shaders[i].prgId;
2460         }
2461     }
2462
2463     TRACE("No matching GL shader found, compiling a new shader\n");
2464     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
2465         if (shader_data->num_gl_shaders)
2466         {
2467             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
2468             new_array = HeapReAlloc(GetProcessHeap(), 0, shader_data->gl_shaders,
2469                                     new_size * sizeof(*shader_data->gl_shaders));
2470         } else {
2471             new_array = HeapAlloc(GetProcessHeap(), 0, sizeof(*shader_data->gl_shaders));
2472             new_size = 1;
2473         }
2474
2475         if(!new_array) {
2476             ERR("Out of memory\n");
2477             return 0;
2478         }
2479         shader_data->gl_shaders = new_array;
2480         shader_data->shader_array_size = new_size;
2481     }
2482
2483     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
2484
2485     pixelshader_update_samplers(&shader->baseShader.reg_maps,
2486             ((IWineD3DDeviceImpl *)shader->baseShader.device)->stateBlock->textures);
2487
2488     shader_buffer_init(&buffer);
2489     ret = shader_arb_generate_pshader(shader, &buffer, args);
2490     shader_buffer_free(&buffer);
2491     shader_data->gl_shaders[shader_data->num_gl_shaders++].prgId = ret;
2492
2493     return ret;
2494 }
2495
2496 static inline BOOL vs_args_equal(const struct arb_vs_compile_args *stored, const struct arb_vs_compile_args *new,
2497                                  const DWORD use_map) {
2498     if((stored->super.swizzle_map & use_map) != new->super.swizzle_map) return FALSE;
2499     if(stored->super.fog_src != new->super.fog_src) return FALSE;
2500     return stored->bools == new->bools;
2501 }
2502
2503 static GLuint find_arb_vshader(IWineD3DVertexShaderImpl *shader, const struct arb_vs_compile_args *args)
2504 {
2505     UINT i;
2506     DWORD new_size;
2507     struct arb_vs_compiled_shader *new_array;
2508     DWORD use_map = ((IWineD3DDeviceImpl *)shader->baseShader.device)->strided_streams.use_map;
2509     SHADER_BUFFER buffer;
2510     struct arb_vshader_private *shader_data;
2511     GLuint ret;
2512
2513     if(!shader->backend_priv) {
2514         shader->backend_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
2515     }
2516     shader_data = shader->backend_priv;
2517
2518     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
2519      * so a linear search is more performant than a hashmap or a binary search
2520      * (cache coherency etc)
2521      */
2522     for(i = 0; i < shader_data->num_gl_shaders; i++) {
2523         if(vs_args_equal(&shader_data->gl_shaders[i].args, args, use_map)) {
2524             return shader_data->gl_shaders[i].prgId;
2525         }
2526     }
2527
2528     TRACE("No matching GL shader found, compiling a new shader\n");
2529
2530     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
2531         if (shader_data->num_gl_shaders)
2532         {
2533             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
2534             new_array = HeapReAlloc(GetProcessHeap(), 0, shader_data->gl_shaders,
2535                                     new_size * sizeof(*shader_data->gl_shaders));
2536         } else {
2537             new_array = HeapAlloc(GetProcessHeap(), 0, sizeof(*shader_data->gl_shaders));
2538             new_size = 1;
2539         }
2540
2541         if(!new_array) {
2542             ERR("Out of memory\n");
2543             return 0;
2544         }
2545         shader_data->gl_shaders = new_array;
2546         shader_data->shader_array_size = new_size;
2547     }
2548
2549     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
2550
2551     shader_buffer_init(&buffer);
2552     ret = shader_arb_generate_vshader(shader, &buffer, args);
2553     shader_buffer_free(&buffer);
2554     shader_data->gl_shaders[shader_data->num_gl_shaders++].prgId = ret;
2555
2556     return ret;
2557 }
2558
2559 static inline void find_arb_ps_compile_args(IWineD3DPixelShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
2560         struct arb_ps_compile_args *args)
2561 {
2562     int i;
2563     find_ps_compile_args(shader, stateblock, &args->super);
2564
2565     /* This forces all local boolean constants to 1 to make them stateblock independent */
2566     args->bools = shader->baseShader.reg_maps.local_bool_consts;
2567
2568     for(i = 0; i < MAX_CONST_B; i++)
2569     {
2570         if(stateblock->pixelShaderConstantB[i]) args->bools |= ( 1 << i);
2571     }
2572
2573 }
2574
2575 static inline void find_arb_vs_compile_args(IWineD3DVertexShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
2576         struct arb_vs_compile_args *args)
2577 {
2578     int i;
2579     find_vs_compile_args(shader, stateblock, &args->super);
2580
2581     /* This forces all local boolean constants to 1 to make them stateblock independent */
2582     args->bools = shader->baseShader.reg_maps.local_bool_consts;
2583
2584     /* TODO: Figure out if it would be better to store bool constants as bitmasks in the stateblock */
2585     for(i = 0; i < MAX_CONST_B; i++)
2586     {
2587         if(stateblock->vertexShaderConstantB[i]) args->bools |= ( 1 << i);
2588     }
2589
2590 }
2591
2592 /* GL locking is done by the caller */
2593 static void shader_arb_select(IWineD3DDevice *iface, BOOL usePS, BOOL useVS) {
2594     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
2595     struct shader_arb_priv *priv = This->shader_priv;
2596     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
2597
2598     if (useVS) {
2599         struct arb_vs_compile_args compile_args;
2600
2601         TRACE("Using vertex shader\n");
2602         find_arb_vs_compile_args((IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader, This->stateBlock, &compile_args);
2603         priv->current_vprogram_id = find_arb_vshader((IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader, &compile_args);
2604
2605         /* Bind the vertex program */
2606         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
2607         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id);");
2608
2609         /* Enable OpenGL vertex programs */
2610         glEnable(GL_VERTEX_PROGRAM_ARB);
2611         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
2612         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
2613     } else if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
2614         priv->current_vprogram_id = 0;
2615         glDisable(GL_VERTEX_PROGRAM_ARB);
2616         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
2617     }
2618
2619     if (usePS) {
2620         struct arb_ps_compile_args compile_args;
2621         TRACE("Using pixel shader\n");
2622         find_arb_ps_compile_args((IWineD3DPixelShaderImpl *) This->stateBlock->pixelShader, This->stateBlock, &compile_args);
2623         priv->current_fprogram_id = find_arb_pshader((IWineD3DPixelShaderImpl *) This->stateBlock->pixelShader,
2624                                                      &compile_args);
2625
2626         /* Bind the fragment program */
2627         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
2628         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id);");
2629
2630         if(!priv->use_arbfp_fixed_func) {
2631             /* Enable OpenGL fragment programs */
2632             glEnable(GL_FRAGMENT_PROGRAM_ARB);
2633             checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
2634         }
2635         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
2636
2637         shader_arb_ps_local_constants(This);
2638     } else if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM) && !priv->use_arbfp_fixed_func) {
2639         /* Disable only if we're not using arbfp fixed function fragment processing. If this is used,
2640          * keep GL_FRAGMENT_PROGRAM_ARB enabled, and the fixed function pipeline will bind the fixed function
2641          * replacement shader
2642          */
2643         glDisable(GL_FRAGMENT_PROGRAM_ARB);
2644         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
2645         priv->current_fprogram_id = 0;
2646     }
2647 }
2648
2649 /* GL locking is done by the caller */
2650 static void shader_arb_select_depth_blt(IWineD3DDevice *iface, enum tex_types tex_type) {
2651     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
2652     struct shader_arb_priv *priv = This->shader_priv;
2653     GLuint *blt_fprogram = &priv->depth_blt_fprogram_id[tex_type];
2654     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
2655
2656     if (!priv->depth_blt_vprogram_id) priv->depth_blt_vprogram_id = create_arb_blt_vertex_program(gl_info);
2657     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->depth_blt_vprogram_id));
2658     glEnable(GL_VERTEX_PROGRAM_ARB);
2659
2660     if (!*blt_fprogram) *blt_fprogram = create_arb_blt_fragment_program(gl_info, tex_type);
2661     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, *blt_fprogram));
2662     glEnable(GL_FRAGMENT_PROGRAM_ARB);
2663 }
2664
2665 /* GL locking is done by the caller */
2666 static void shader_arb_deselect_depth_blt(IWineD3DDevice *iface) {
2667     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
2668     struct shader_arb_priv *priv = This->shader_priv;
2669     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
2670
2671     if (priv->current_vprogram_id) {
2672         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
2673         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexShader->prgId);");
2674
2675         glEnable(GL_VERTEX_PROGRAM_ARB);
2676         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
2677
2678         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
2679     } else {
2680         glDisable(GL_VERTEX_PROGRAM_ARB);
2681         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
2682     }
2683
2684     if (priv->current_fprogram_id) {
2685         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
2686         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, pixelShader->prgId);");
2687
2688         glEnable(GL_FRAGMENT_PROGRAM_ARB);
2689         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
2690
2691         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
2692     } else {
2693         glDisable(GL_FRAGMENT_PROGRAM_ARB);
2694         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
2695     }
2696 }
2697
2698 static void shader_arb_destroy(IWineD3DBaseShader *iface) {
2699     IWineD3DBaseShaderImpl *baseShader = (IWineD3DBaseShaderImpl *) iface;
2700     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)baseShader->baseShader.device;
2701     const WineD3D_GL_Info *gl_info = &device->adapter->gl_info;
2702
2703     ActivateContext(device, device->lastActiveRenderTarget, CTXUSAGE_RESOURCELOAD);
2704
2705     if (shader_is_pshader_version(baseShader->baseShader.reg_maps.shader_version.type))
2706     {
2707         IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *) iface;
2708         struct arb_pshader_private *shader_data = This->backend_priv;
2709         UINT i;
2710
2711         if(!shader_data) return; /* This can happen if a shader was never compiled */
2712         ENTER_GL();
2713         for(i = 0; i < shader_data->num_gl_shaders; i++) {
2714             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
2715             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
2716         }
2717         LEAVE_GL();
2718         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
2719         HeapFree(GetProcessHeap(), 0, shader_data);
2720         This->backend_priv = NULL;
2721     } else {
2722         IWineD3DVertexShaderImpl *This = (IWineD3DVertexShaderImpl *) iface;
2723         struct arb_vshader_private *shader_data = This->backend_priv;
2724         UINT i;
2725
2726         if(!shader_data) return; /* This can happen if a shader was never compiled */
2727         ENTER_GL();
2728         for(i = 0; i < shader_data->num_gl_shaders; i++) {
2729             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
2730             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
2731         }
2732         LEAVE_GL();
2733         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
2734         HeapFree(GetProcessHeap(), 0, shader_data);
2735         This->backend_priv = NULL;
2736     }
2737 }
2738
2739 static HRESULT shader_arb_alloc(IWineD3DDevice *iface) {
2740     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
2741     This->shader_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct shader_arb_priv));
2742     return WINED3D_OK;
2743 }
2744
2745 static void shader_arb_free(IWineD3DDevice *iface) {
2746     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
2747     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
2748     struct shader_arb_priv *priv = This->shader_priv;
2749     int i;
2750
2751     ENTER_GL();
2752     if(priv->depth_blt_vprogram_id) {
2753         GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_vprogram_id));
2754     }
2755     for (i = 0; i < tex_type_count; ++i) {
2756         if (priv->depth_blt_fprogram_id[i]) {
2757             GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_fprogram_id[i]));
2758         }
2759     }
2760     LEAVE_GL();
2761
2762     HeapFree(GetProcessHeap(), 0, This->shader_priv);
2763 }
2764
2765 static BOOL shader_arb_dirty_const(IWineD3DDevice *iface) {
2766     return TRUE;
2767 }
2768
2769 static void shader_arb_get_caps(WINED3DDEVTYPE devtype, const WineD3D_GL_Info *gl_info, struct shader_caps *pCaps)
2770 {
2771     /* We don't have an ARB fixed function pipeline yet, so let the none backend set its caps,
2772      * then overwrite the shader specific ones
2773      */
2774     none_shader_backend.shader_get_caps(devtype, gl_info, pCaps);
2775
2776     if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
2777         pCaps->VertexShaderVersion = WINED3DVS_VERSION(1,1);
2778         TRACE_(d3d_caps)("Hardware vertex shader version 1.1 enabled (ARB_PROGRAM)\n");
2779         pCaps->MaxVertexShaderConst = GL_LIMITS(vshader_constantsF) - 1;
2780     }
2781
2782     if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM)) {
2783         pCaps->PixelShaderVersion    = WINED3DPS_VERSION(1,4);
2784         pCaps->PixelShader1xMaxValue = 8.0;
2785         TRACE_(d3d_caps)("Hardware pixel shader version 1.4 enabled (ARB_PROGRAM)\n");
2786         pCaps->MaxPixelShaderConst = GL_LIMITS(pshader_constantsF);
2787     }
2788
2789     pCaps->VSClipping = GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
2790 }
2791
2792 static BOOL shader_arb_color_fixup_supported(struct color_fixup_desc fixup)
2793 {
2794     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
2795     {
2796         TRACE("Checking support for color_fixup:\n");
2797         dump_color_fixup_desc(fixup);
2798     }
2799
2800     /* We support everything except YUV conversions. */
2801     if (!is_yuv_fixup(fixup))
2802     {
2803         TRACE("[OK]\n");
2804         return TRUE;
2805     }
2806
2807     TRACE("[FAILED]\n");
2808     return FALSE;
2809 }
2810
2811 static void shader_arb_add_instruction_modifiers(const struct wined3d_shader_instruction *ins) {
2812     DWORD shift;
2813     char write_mask[20], regstr[50];
2814     SHADER_BUFFER *buffer = ins->ctx->buffer;
2815     BOOL is_color = FALSE;
2816     const struct wined3d_shader_dst_param *dst;
2817
2818     if (!ins->dst_count) return;
2819
2820     dst = &ins->dst[0];
2821     shift = dst->shift;
2822     if(shift == 0) return; /* Saturate alone is handled by the instructions */
2823
2824     shader_arb_get_write_mask(ins, dst, write_mask);
2825     shader_arb_get_register_name(ins, &dst->reg, regstr, &is_color);
2826
2827     /* Generate a line that does the output modifier computation
2828      * FIXME: _SAT vs shift? _SAT alone is already handled in the instructions, if this
2829      * maps problems in e.g. _d4_sat modify shader_arb_get_modifier
2830      */
2831     shader_addline(buffer, "MUL%s %s%s, %s, %s;\n", shader_arb_get_modifier(ins),
2832                    regstr, write_mask, regstr, shift_tab[shift]);
2833 }
2834
/* Opcode dispatch table of the ARB program backend.
 *
 * shader_arb_handle_instruction() indexes this table with ins->handler_idx,
 * so the entries are positional; the comment in front of each entry names
 * the WINED3DSIH_* value the slot is meant for. A NULL slot means there is no
 * generic handler for that opcode: the dispatcher prints a FIXME and emits
 * nothing. Boolean IF, and the ELSE / ENDIF belonging to it, are consumed by
 * the dispatcher before the table is consulted. */
static const SHADER_HANDLER shader_arb_instruction_handler_table[WINED3DSIH_TABLE_SIZE] =
{
    /* WINED3DSIH_ABS           */ shader_hw_map2gl,
    /* WINED3DSIH_ADD           */ shader_hw_map2gl,
    /* WINED3DSIH_BEM           */ pshader_hw_bem,
    /* WINED3DSIH_BREAK         */ NULL,
    /* WINED3DSIH_BREAKC        */ NULL,
    /* WINED3DSIH_BREAKP        */ NULL,
    /* WINED3DSIH_CALL          */ NULL,
    /* WINED3DSIH_CALLNZ        */ NULL,
    /* WINED3DSIH_CMP           */ pshader_hw_cmp,
    /* WINED3DSIH_CND           */ pshader_hw_cnd,
    /* WINED3DSIH_CRS           */ shader_hw_map2gl,
    /* WINED3DSIH_DCL           */ NULL,
    /* WINED3DSIH_DEF           */ NULL,
    /* WINED3DSIH_DEFB          */ NULL,
    /* WINED3DSIH_DEFI          */ NULL,
    /* WINED3DSIH_DP2ADD        */ pshader_hw_dp2add,
    /* WINED3DSIH_DP3           */ shader_hw_map2gl,
    /* WINED3DSIH_DP4           */ shader_hw_map2gl,
    /* WINED3DSIH_DST           */ shader_hw_map2gl,
    /* WINED3DSIH_DSX           */ shader_hw_map2gl,
    /* WINED3DSIH_DSY           */ NULL,
    /* WINED3DSIH_ELSE          */ NULL,
    /* WINED3DSIH_ENDIF         */ NULL,
    /* WINED3DSIH_ENDLOOP       */ NULL,
    /* WINED3DSIH_ENDREP        */ NULL,
    /* WINED3DSIH_EXP           */ shader_hw_map2gl,
    /* WINED3DSIH_EXPP          */ shader_hw_map2gl,
    /* WINED3DSIH_FRC           */ shader_hw_map2gl,
    /* WINED3DSIH_IF            */ NULL,
    /* WINED3DSIH_IFC           */ NULL,
    /* WINED3DSIH_LABEL         */ NULL,
    /* WINED3DSIH_LIT           */ shader_hw_map2gl,
    /* WINED3DSIH_LOG           */ shader_hw_map2gl,
    /* WINED3DSIH_LOGP          */ shader_hw_map2gl,
    /* WINED3DSIH_LOOP          */ NULL,
    /* WINED3DSIH_LRP           */ shader_hw_lrp,
    /* WINED3DSIH_M3x2          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x4          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x4          */ shader_hw_mnxn,
    /* WINED3DSIH_MAD           */ shader_hw_map2gl,
    /* WINED3DSIH_MAX           */ shader_hw_map2gl,
    /* WINED3DSIH_MIN           */ shader_hw_map2gl,
    /* WINED3DSIH_MOV           */ shader_hw_mov,
    /* WINED3DSIH_MOVA          */ shader_hw_mov,
    /* WINED3DSIH_MUL           */ shader_hw_map2gl,
    /* WINED3DSIH_NOP           */ shader_hw_nop,
    /* WINED3DSIH_NRM           */ shader_hw_nrm,
    /* WINED3DSIH_PHASE         */ NULL,
    /* WINED3DSIH_POW           */ shader_hw_map2gl,
    /* WINED3DSIH_RCP           */ shader_hw_rsq_rcp,
    /* WINED3DSIH_REP           */ NULL,
    /* WINED3DSIH_RET           */ NULL,
    /* WINED3DSIH_RSQ           */ shader_hw_rsq_rcp,
    /* WINED3DSIH_SETP          */ NULL,
    /* WINED3DSIH_SGE           */ shader_hw_map2gl,
    /* WINED3DSIH_SGN           */ shader_hw_sgn,
    /* WINED3DSIH_SINCOS        */ shader_hw_sincos,
    /* WINED3DSIH_SLT           */ shader_hw_map2gl,
    /* WINED3DSIH_SUB           */ shader_hw_map2gl,
    /* WINED3DSIH_TEX           */ pshader_hw_tex,
    /* WINED3DSIH_TEXBEM        */ pshader_hw_texbem,
    /* WINED3DSIH_TEXBEML       */ pshader_hw_texbem,
    /* WINED3DSIH_TEXCOORD      */ pshader_hw_texcoord,
    /* WINED3DSIH_TEXDEPTH      */ pshader_hw_texdepth,
    /* WINED3DSIH_TEXDP3        */ pshader_hw_texdp3,
    /* WINED3DSIH_TEXDP3TEX     */ pshader_hw_texdp3tex,
    /* WINED3DSIH_TEXKILL       */ pshader_hw_texkill,
    /* WINED3DSIH_TEXLDD        */ NULL,
    /* WINED3DSIH_TEXLDL        */ NULL,
    /* WINED3DSIH_TEXM3x2DEPTH  */ pshader_hw_texm3x2depth,
    /* WINED3DSIH_TEXM3x2PAD    */ pshader_hw_texm3x2pad,
    /* WINED3DSIH_TEXM3x2TEX    */ pshader_hw_texm3x2tex,
    /* WINED3DSIH_TEXM3x3       */ pshader_hw_texm3x3,
    /* WINED3DSIH_TEXM3x3DIFF   */ NULL,
    /* WINED3DSIH_TEXM3x3PAD    */ pshader_hw_texm3x3pad,
    /* WINED3DSIH_TEXM3x3SPEC   */ pshader_hw_texm3x3spec,
    /* WINED3DSIH_TEXM3x3TEX    */ pshader_hw_texm3x3tex,
    /* WINED3DSIH_TEXM3x3VSPEC  */ pshader_hw_texm3x3vspec,
    /* WINED3DSIH_TEXREG2AR     */ pshader_hw_texreg2ar,
    /* WINED3DSIH_TEXREG2GB     */ pshader_hw_texreg2gb,
    /* WINED3DSIH_TEXREG2RGB    */ pshader_hw_texreg2rgb,
};
2921
2922 static inline BOOL get_bool_const(const struct wined3d_shader_instruction *ins, IWineD3DBaseShaderImpl *This, DWORD idx)
2923 {
2924     BOOL vshader = shader_is_vshader_version(This->baseShader.reg_maps.shader_version.type);
2925     WORD bools = 0;
2926     WORD flag = (1 << idx);
2927     const local_constant *constant;
2928     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2929
2930     if(This->baseShader.reg_maps.local_bool_consts & flag)
2931     {
2932         /* What good is a if(bool) with a hardcoded local constant? I don't know, but handle it */
2933         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsB, local_constant, entry)
2934         {
2935             if (constant->idx == idx)
2936             {
2937                 return constant->value[0];
2938             }
2939         }
2940         ERR("Local constant not found\n");
2941         return FALSE;
2942     }
2943     else
2944     {
2945         if(vshader) bools = priv->cur_vs_args->bools;
2946         else bools = priv->cur_ps_args->bools;
2947         return bools & flag;
2948     }
2949 }
2950
2951 static void shader_arb_handle_instruction(const struct wined3d_shader_instruction *ins) {
2952     SHADER_HANDLER hw_fct;
2953     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2954     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
2955     struct if_frame *if_frame;
2956     SHADER_BUFFER *buffer = ins->ctx->buffer;
2957
2958     /* boolean if */
2959     if(ins->handler_idx == WINED3DSIH_IF)
2960     {
2961         if_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*if_frame));
2962         list_add_head(&priv->if_frames, &if_frame->entry);
2963
2964         if(!priv->muted && get_bool_const(ins, This, ins->src[0].reg.idx) == FALSE)
2965         {
2966             shader_addline(buffer, "#if(FALSE){\n");
2967             priv->muted = TRUE;
2968             if_frame->muting = TRUE;
2969         }
2970         else shader_addline(buffer, "#if(TRUE) {\n");
2971
2972         return; /* Instruction is handled */
2973     }
2974     else if(ins->handler_idx == WINED3DSIH_IFC)
2975     {
2976         /* IF(bool) and if_cond(a, b) use the same ELSE and ENDIF tokens */
2977         if_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*if_frame));
2978         if_frame->ifc = TRUE;
2979         list_add_head(&priv->if_frames, &if_frame->entry);
2980     }
2981     else if(ins->handler_idx == WINED3DSIH_ELSE)
2982     {
2983         struct list *e = list_head(&priv->if_frames);
2984         if_frame = LIST_ENTRY(e, struct if_frame, entry);
2985
2986         if(if_frame->ifc == FALSE)
2987         {
2988             shader_addline(buffer, "#} else {\n");
2989             if(!priv->muted && !if_frame->muting)
2990             {
2991                 priv->muted = TRUE;
2992                 if_frame->muting = TRUE;
2993             }
2994             else if(if_frame->muting) priv->muted = FALSE;
2995             return; /* Instruction is handled. */
2996         }
2997         /* In case of an ifc, generate a HW shader instruction */
2998     }
2999     else if(ins->handler_idx == WINED3DSIH_ENDIF)
3000     {
3001         struct list *e = list_head(&priv->if_frames);
3002         if_frame = LIST_ENTRY(e, struct if_frame, entry);
3003
3004         if(!if_frame->ifc)
3005         {
3006             shader_addline(buffer, "#} endif\n");
3007             if(if_frame->muting) priv->muted = FALSE;
3008             list_remove(&if_frame->entry);
3009             HeapFree(GetProcessHeap(), 0, if_frame);
3010             return; /* Instruction is handled */
3011         }
3012         else
3013         {
3014             list_remove(&if_frame->entry);
3015             HeapFree(GetProcessHeap(), 0, if_frame);
3016             /* ifc - generate a hw endif */
3017         }
3018     }
3019
3020     if(priv->muted) return;
3021
3022     /* Select handler */
3023     hw_fct = shader_arb_instruction_handler_table[ins->handler_idx];
3024
3025     /* Unhandled opcode */
3026     if (!hw_fct)
3027     {
3028         FIXME("Backend can't handle opcode %#x\n", ins->handler_idx);
3029         return;
3030     }
3031     hw_fct(ins);
3032
3033     shader_arb_add_instruction_modifiers(ins);
3034 }
3035
/* Function table through which the rest of wined3d drives the ARB program
 * shader backend. The initializer is positional.
 * NOTE(review): the entry order has to match the member order of
 * shader_backend_t, which is declared outside this file - confirm against
 * the header when adding or moving entries. */
const shader_backend_t arb_program_shader_backend = {
    shader_arb_handle_instruction,
    shader_arb_select,
    shader_arb_select_depth_blt,
    shader_arb_deselect_depth_blt,
    shader_arb_update_float_vertex_constants,
    shader_arb_update_float_pixel_constants,
    shader_arb_load_constants,
    shader_arb_load_np2fixup_constants,
    shader_arb_destroy,
    shader_arb_alloc,
    shader_arb_free,
    shader_arb_dirty_const,
    shader_arb_get_caps,
    shader_arb_color_fixup_supported,
};
3052
/* ARB_fragment_program fixed function pipeline replacement definitions.
 *
 * These are the fragment program environment parameter indices that the
 * state handlers below pass to glProgramEnvParameter4fvARB(). The layout is:
 * texture factor, specular enable, 8 per-stage constants, 8 bump matrices,
 * 8 luminance parameter sets.
 *
 * The macro argument is parenthesized so that expression arguments (shifts,
 * ternaries, ...) are evaluated as a whole before being added to the base. */
#define ARB_FFP_CONST_TFACTOR           0
#define ARB_FFP_CONST_SPECULAR_ENABLE   ((ARB_FFP_CONST_TFACTOR) + 1)
#define ARB_FFP_CONST_CONSTANT(i)       ((ARB_FFP_CONST_SPECULAR_ENABLE) + 1 + (i))
#define ARB_FFP_CONST_BUMPMAT(i)        ((ARB_FFP_CONST_CONSTANT(7)) + 1 + (i))
#define ARB_FFP_CONST_LUMINANCE(i)      ((ARB_FFP_CONST_BUMPMAT(7)) + 1 + (i))
3059
/* One cached fixed function replacement fragment program. Instances live in
 * the priv->fragment_shaders tree and are released by arbfp_free_ffpshader(). */
struct arbfp_ffp_desc
{
    /* Generic description; its "entry" member links this into the tree. */
    struct ffp_frag_desc parent;
    /* GL program object, deleted with glDeleteProgramsARB(). */
    GLuint shader;
    /* NOTE(review): presumably the number of texture stages the program
     * samples - not used in the visible part of the file, confirm at the
     * point where it is written. */
    unsigned int num_textures_used;
};
3066
3067 static void arbfp_enable(IWineD3DDevice *iface, BOOL enable) {
3068     ENTER_GL();
3069     if(enable) {
3070         glEnable(GL_FRAGMENT_PROGRAM_ARB);
3071         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
3072     } else {
3073         glDisable(GL_FRAGMENT_PROGRAM_ARB);
3074         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
3075     }
3076     LEAVE_GL();
3077 }
3078
3079 static HRESULT arbfp_alloc(IWineD3DDevice *iface) {
3080     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
3081     struct shader_arb_priv *priv;
3082     /* Share private data between the shader backend and the pipeline replacement, if both
3083      * are the arb implementation. This is needed to figure out whether ARBfp should be disabled
3084      * if no pixel shader is bound or not
3085      */
3086     if(This->shader_backend == &arb_program_shader_backend) {
3087         This->fragment_priv = This->shader_priv;
3088     } else {
3089         This->fragment_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct shader_arb_priv));
3090         if(!This->fragment_priv) return E_OUTOFMEMORY;
3091     }
3092     priv = This->fragment_priv;
3093     if (wine_rb_init(&priv->fragment_shaders, &wined3d_ffp_frag_program_rb_functions) == -1)
3094     {
3095         ERR("Failed to initialize rbtree.\n");
3096         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
3097         return E_OUTOFMEMORY;
3098     }
3099     priv->use_arbfp_fixed_func = TRUE;
3100     return WINED3D_OK;
3101 }
3102
3103 static void arbfp_free_ffpshader(struct wine_rb_entry *entry, void *context)
3104 {
3105     const WineD3D_GL_Info *gl_info = context;
3106     struct arbfp_ffp_desc *entry_arb = WINE_RB_ENTRY_VALUE(entry, struct arbfp_ffp_desc, parent.entry);
3107
3108     ENTER_GL();
3109     GL_EXTCALL(glDeleteProgramsARB(1, &entry_arb->shader));
3110     checkGLcall("glDeleteProgramsARB(1, &entry_arb->shader)");
3111     HeapFree(GetProcessHeap(), 0, entry_arb);
3112     LEAVE_GL();
3113 }
3114
3115 static void arbfp_free(IWineD3DDevice *iface) {
3116     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
3117     struct shader_arb_priv *priv = This->fragment_priv;
3118
3119     wine_rb_destroy(&priv->fragment_shaders, arbfp_free_ffpshader, &This->adapter->gl_info);
3120     priv->use_arbfp_fixed_func = FALSE;
3121
3122     if(This->shader_backend != &arb_program_shader_backend) {
3123         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
3124     }
3125 }
3126
3127 static void arbfp_get_caps(WINED3DDEVTYPE devtype, const WineD3D_GL_Info *gl_info, struct fragment_caps *caps)
3128 {
3129     caps->TextureOpCaps =  WINED3DTEXOPCAPS_DISABLE                     |
3130                            WINED3DTEXOPCAPS_SELECTARG1                  |
3131                            WINED3DTEXOPCAPS_SELECTARG2                  |
3132                            WINED3DTEXOPCAPS_MODULATE4X                  |
3133                            WINED3DTEXOPCAPS_MODULATE2X                  |
3134                            WINED3DTEXOPCAPS_MODULATE                    |
3135                            WINED3DTEXOPCAPS_ADDSIGNED2X                 |
3136                            WINED3DTEXOPCAPS_ADDSIGNED                   |
3137                            WINED3DTEXOPCAPS_ADD                         |
3138                            WINED3DTEXOPCAPS_SUBTRACT                    |
3139                            WINED3DTEXOPCAPS_ADDSMOOTH                   |
3140                            WINED3DTEXOPCAPS_BLENDCURRENTALPHA           |
3141                            WINED3DTEXOPCAPS_BLENDFACTORALPHA            |
3142                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHA           |
3143                            WINED3DTEXOPCAPS_BLENDDIFFUSEALPHA           |
3144                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHAPM         |
3145                            WINED3DTEXOPCAPS_MODULATEALPHA_ADDCOLOR      |
3146                            WINED3DTEXOPCAPS_MODULATECOLOR_ADDALPHA      |
3147                            WINED3DTEXOPCAPS_MODULATEINVCOLOR_ADDALPHA   |
3148                            WINED3DTEXOPCAPS_MODULATEINVALPHA_ADDCOLOR   |
3149                            WINED3DTEXOPCAPS_DOTPRODUCT3                 |
3150                            WINED3DTEXOPCAPS_MULTIPLYADD                 |
3151                            WINED3DTEXOPCAPS_LERP                        |
3152                            WINED3DTEXOPCAPS_BUMPENVMAP                  |
3153                            WINED3DTEXOPCAPS_BUMPENVMAPLUMINANCE;
3154
3155     /* TODO: Implement WINED3DTEXOPCAPS_PREMODULATE */
3156
3157     caps->MaxTextureBlendStages   = 8;
3158     caps->MaxSimultaneousTextures = min(GL_LIMITS(fragment_samplers), 8);
3159
3160     caps->PrimitiveMiscCaps |= WINED3DPMISCCAPS_TSSARGTEMP;
3161 }
3162 #undef GLINFO_LOCATION
3163
3164 #define GLINFO_LOCATION stateblock->wineD3DDevice->adapter->gl_info
3165 static void state_texfactor_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3166     float col[4];
3167     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
3168
3169     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
3170      * application provided constants
3171      */
3172     if(device->shader_backend == &arb_program_shader_backend) {
3173         if (use_ps(stateblock)) return;
3174
3175         device = stateblock->wineD3DDevice;
3176         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_TFACTOR] = 1;
3177         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_TFACTOR + 1);
3178     }
3179
3180     D3DCOLORTOGLFLOAT4(stateblock->renderState[WINED3DRS_TEXTUREFACTOR], col);
3181     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col));
3182     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col)");
3183
3184 }
3185
3186 static void state_arb_specularenable(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3187     float col[4];
3188     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
3189
3190     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
3191      * application provided constants
3192      */
3193     if(device->shader_backend == &arb_program_shader_backend) {
3194         if (use_ps(stateblock)) return;
3195
3196         device = stateblock->wineD3DDevice;
3197         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_SPECULAR_ENABLE] = 1;
3198         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_SPECULAR_ENABLE + 1);
3199     }
3200
3201     if(stateblock->renderState[WINED3DRS_SPECULARENABLE]) {
3202         /* The specular color has no alpha */
3203         col[0] = 1.0; col[1] = 1.0;
3204         col[2] = 1.0; col[3] = 0.0;
3205     } else {
3206         col[0] = 0.0; col[1] = 0.0;
3207         col[2] = 0.0; col[3] = 0.0;
3208     }
3209     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col));
3210     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col)");
3211 }
3212
3213 static void set_bumpmat_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3214     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
3215     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
3216     float mat[2][2];
3217
3218     if (use_ps(stateblock))
3219     {
3220         if(stage != 0 &&
3221            ((IWineD3DPixelShaderImpl *) stateblock->pixelShader)->baseShader.reg_maps.bumpmat[stage]) {
3222             /* The pixel shader has to know the bump env matrix. Do a constants update if it isn't scheduled
3223              * anyway
3224              */
3225             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
3226                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
3227             }
3228         }
3229
3230         if(device->shader_backend == &arb_program_shader_backend) {
3231             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
3232             return;
3233         }
3234     } else if(device->shader_backend == &arb_program_shader_backend) {
3235         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_BUMPMAT(stage)] = 1;
3236         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_BUMPMAT(stage) + 1);
3237     }
3238
3239     mat[0][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT00]);
3240     mat[0][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT01]);
3241     mat[1][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT10]);
3242     mat[1][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT11]);
3243
3244     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0]));
3245     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0])");
3246 }
3247
3248 static void tex_bumpenvlum_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3249     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
3250     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
3251     float param[4];
3252
3253     if (use_ps(stateblock))
3254     {
3255         if(stage != 0 &&
3256            ((IWineD3DPixelShaderImpl *) stateblock->pixelShader)->baseShader.reg_maps.luminanceparams[stage]) {
3257             /* The pixel shader has to know the luminance offset. Do a constants update if it
3258              * isn't scheduled anyway
3259              */
3260             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
3261                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
3262             }
3263         }
3264
3265         if(device->shader_backend == &arb_program_shader_backend) {
3266             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
3267             return;
3268         }
3269     } else if(device->shader_backend == &arb_program_shader_backend) {
3270         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_LUMINANCE(stage)] = 1;
3271         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_LUMINANCE(stage) + 1);
3272     }
3273
3274     param[0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLSCALE]);
3275     param[1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLOFFSET]);
3276     param[2] = 0.0;
3277     param[3] = 0.0;
3278
3279     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param));
3280     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param)");
3281 }
3282
3283 static const char *get_argreg(SHADER_BUFFER *buffer, DWORD argnum, unsigned int stage, DWORD arg) {
3284     const char *ret;
3285
3286     if(arg == ARG_UNUSED) return "unused"; /* This is the marker for unused registers */
3287
3288     switch(arg & WINED3DTA_SELECTMASK) {
3289         case WINED3DTA_DIFFUSE:
3290             ret = "fragment.color.primary"; break;
3291
3292         case WINED3DTA_CURRENT:
3293             if(stage == 0) ret = "fragment.color.primary";
3294             else ret = "ret";
3295             break;
3296
3297         case WINED3DTA_TEXTURE:
3298             switch(stage) {
3299                 case 0: ret = "tex0"; break;
3300                 case 1: ret = "tex1"; break;
3301                 case 2: ret = "tex2"; break;
3302                 case 3: ret = "tex3"; break;
3303                 case 4: ret = "tex4"; break;
3304                 case 5: ret = "tex5"; break;
3305                 case 6: ret = "tex6"; break;
3306                 case 7: ret = "tex7"; break;
3307                 default: ret = "unknown texture";
3308             }
3309             break;
3310
3311         case WINED3DTA_TFACTOR:
3312             ret = "tfactor"; break;
3313
3314         case WINED3DTA_SPECULAR:
3315             ret = "fragment.color.secondary"; break;
3316
3317         case WINED3DTA_TEMP:
3318             ret = "tempreg"; break;
3319
3320         case WINED3DTA_CONSTANT:
3321             FIXME("Implement perstage constants\n");
3322             switch(stage) {
3323                 case 0: ret = "const0"; break;
3324                 case 1: ret = "const1"; break;
3325                 case 2: ret = "const2"; break;
3326                 case 3: ret = "const3"; break;
3327                 case 4: ret = "const4"; break;
3328                 case 5: ret = "const5"; break;
3329                 case 6: ret = "const6"; break;
3330                 case 7: ret = "const7"; break;
3331                 default: ret = "unknown constant";
3332             }
3333             break;
3334
3335         default:
3336             return "unknown";
3337     }
3338
3339     if(arg & WINED3DTA_COMPLEMENT) {
3340         shader_addline(buffer, "SUB arg%u, const.x, %s;\n", argnum, ret);
3341         if(argnum == 0) ret = "arg0";
3342         if(argnum == 1) ret = "arg1";
3343         if(argnum == 2) ret = "arg2";
3344     }
3345     if(arg & WINED3DTA_ALPHAREPLICATE) {
3346         shader_addline(buffer, "MOV arg%u, %s.w;\n", argnum, ret);
3347         if(argnum == 0) ret = "arg0";
3348         if(argnum == 1) ret = "arg1";
3349         if(argnum == 2) ret = "arg2";
3350     }
3351     return ret;
3352 }
3353
3354 static void gen_ffp_instr(SHADER_BUFFER *buffer, unsigned int stage, BOOL color, BOOL alpha,
3355                           DWORD dst, DWORD op, DWORD dw_arg0, DWORD dw_arg1, DWORD dw_arg2) {
3356     const char *dstmask, *dstreg, *arg0, *arg1, *arg2;
3357     unsigned int mul = 1;
3358     BOOL mul_final_dest = FALSE;
3359
3360     if(color && alpha) dstmask = "";
3361     else if(color) dstmask = ".xyz";
3362     else dstmask = ".w";
3363
3364     if(dst == tempreg) dstreg = "tempreg";
3365     else dstreg = "ret";
3366
3367     arg0 = get_argreg(buffer, 0, stage, dw_arg0);
3368     arg1 = get_argreg(buffer, 1, stage, dw_arg1);
3369     arg2 = get_argreg(buffer, 2, stage, dw_arg2);
3370
3371     switch(op) {
3372         case WINED3DTOP_DISABLE:
3373             if(stage == 0) shader_addline(buffer, "MOV %s%s, fragment.color.primary;\n", dstreg, dstmask);
3374             break;
3375
3376         case WINED3DTOP_SELECTARG2:
3377             arg1 = arg2;
3378         case WINED3DTOP_SELECTARG1:
3379             shader_addline(buffer, "MOV %s%s, %s;\n", dstreg, dstmask, arg1);
3380             break;
3381
3382         case WINED3DTOP_MODULATE4X:
3383             mul = 2;
3384         case WINED3DTOP_MODULATE2X:
3385             mul *= 2;
3386             if(strcmp(dstreg, "result.color") == 0) {
3387                 dstreg = "ret";
3388                 mul_final_dest = TRUE;
3389             }
3390         case WINED3DTOP_MODULATE:
3391             shader_addline(buffer, "MUL %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
3392             break;
3393
3394         case WINED3DTOP_ADDSIGNED2X:
3395             mul = 2;
3396             if(strcmp(dstreg, "result.color") == 0) {
3397                 dstreg = "ret";
3398                 mul_final_dest = TRUE;
3399             }
3400         case WINED3DTOP_ADDSIGNED:
3401             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
3402             arg2 = "arg2";
3403         case WINED3DTOP_ADD:
3404             shader_addline(buffer, "ADD_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
3405             break;
3406
3407         case WINED3DTOP_SUBTRACT:
3408             shader_addline(buffer, "SUB_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
3409             break;
3410
3411         case WINED3DTOP_ADDSMOOTH:
3412             shader_addline(buffer, "SUB arg1, const.x, %s;\n", arg1);
3413             shader_addline(buffer, "MAD_SAT %s%s, arg1, %s, %s;\n", dstreg, dstmask, arg2, arg1);
3414             break;
3415
3416         case WINED3DTOP_BLENDCURRENTALPHA:
3417             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_CURRENT);
3418             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
3419             break;
3420         case WINED3DTOP_BLENDFACTORALPHA:
3421             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TFACTOR);
3422             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
3423             break;
3424         case WINED3DTOP_BLENDTEXTUREALPHA:
3425             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
3426             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
3427             break;
3428         case WINED3DTOP_BLENDDIFFUSEALPHA:
3429             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_DIFFUSE);
3430             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
3431             break;
3432
3433         case WINED3DTOP_BLENDTEXTUREALPHAPM:
3434             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
3435             shader_addline(buffer, "SUB arg0.w, const.x, %s.w;\n", arg0);
3436             shader_addline(buffer, "MAD_SAT %s%s, %s, arg0.w, %s;\n", dstreg, dstmask, arg2, arg1);
3437             break;
3438
3439         /* D3DTOP_PREMODULATE ???? */
3440
3441         case WINED3DTOP_MODULATEINVALPHA_ADDCOLOR:
3442             shader_addline(buffer, "SUB arg0.w, const.x, %s;\n", arg1);
3443             shader_addline(buffer, "MAD_SAT %s%s, arg0.w, %s, %s;\n", dstreg, dstmask, arg2, arg1);
3444             break;
3445         case WINED3DTOP_MODULATEALPHA_ADDCOLOR:
3446             shader_addline(buffer, "MAD_SAT %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg1);
3447             break;
3448         case WINED3DTOP_MODULATEINVCOLOR_ADDALPHA:
3449             shader_addline(buffer, "SUB arg0, const.x, %s;\n", arg1);
3450             shader_addline(buffer, "MAD_SAT %s%s, arg0, %s, %s.w;\n", dstreg, dstmask, arg2, arg1);
3451             break;
3452         case WINED3DTOP_MODULATECOLOR_ADDALPHA:
3453             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s.w;\n", dstreg, dstmask, arg1, arg2, arg1);
3454             break;
3455
3456         case WINED3DTOP_DOTPRODUCT3:
3457             mul = 4;
3458             if(strcmp(dstreg, "result.color") == 0) {
3459                 dstreg = "ret";
3460                 mul_final_dest = TRUE;
3461             }
3462             shader_addline(buffer, "SUB arg1, %s, const.w;\n", arg1);
3463             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
3464             shader_addline(buffer, "DP3_SAT %s%s, arg1, arg2;\n", dstreg, dstmask);
3465             break;
3466
3467         case WINED3DTOP_MULTIPLYADD:
3468             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg0);
3469             break;
3470
3471         case WINED3DTOP_LERP:
3472             /* The msdn is not quite right here */
3473             shader_addline(buffer, "LRP %s%s, %s, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
3474             break;
3475
3476         case WINED3DTOP_BUMPENVMAP:
3477         case WINED3DTOP_BUMPENVMAPLUMINANCE:
3478             /* Those are handled in the first pass of the shader(generation pass 1 and 2) already */
3479             break;
3480
3481         default:
3482             FIXME("Unhandled texture op %08x\n", op);
3483     }
3484
3485     if(mul == 2) {
3486         shader_addline(buffer, "MUL_SAT %s%s, %s, const.y;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
3487     } else if(mul == 4) {
3488         shader_addline(buffer, "MUL_SAT %s%s, %s, const.z;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
3489     }
3490 }
3491
3492 /* The stateblock is passed for GLINFO_LOCATION */
3493 static GLuint gen_arbfp_ffp_shader(const struct ffp_frag_settings *settings, IWineD3DStateBlockImpl *stateblock)
3494 {
3495     unsigned int stage;
3496     SHADER_BUFFER buffer;
3497     BOOL tex_read[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
3498     BOOL bump_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
3499     BOOL luminance_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
3500     const char *textype;
3501     const char *instr, *sat;
3502     char colorcor_dst[8];
3503     GLuint ret;
3504     DWORD arg0, arg1, arg2;
3505     BOOL tempreg_used = FALSE, tfactor_used = FALSE;
3506     BOOL op_equal;
3507     const char *final_combiner_src = "ret";
3508
3509     /* Find out which textures are read */
3510     for(stage = 0; stage < MAX_TEXTURES; stage++) {
3511         if(settings->op[stage].cop == WINED3DTOP_DISABLE) break;
3512         arg0 = settings->op[stage].carg0 & WINED3DTA_SELECTMASK;
3513         arg1 = settings->op[stage].carg1 & WINED3DTA_SELECTMASK;
3514         arg2 = settings->op[stage].carg2 & WINED3DTA_SELECTMASK;
3515         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
3516         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
3517         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
3518
3519         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHA) tex_read[stage] = TRUE;
3520         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHAPM) tex_read[stage] = TRUE;
3521         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP) {
3522             bump_used[stage] = TRUE;
3523             tex_read[stage] = TRUE;
3524         }
3525         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
3526             bump_used[stage] = TRUE;
3527             tex_read[stage] = TRUE;
3528             luminance_used[stage] = TRUE;
3529         } else if(settings->op[stage].cop == WINED3DTOP_BLENDFACTORALPHA) {
3530             tfactor_used = TRUE;
3531         }
3532
3533         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
3534             tfactor_used = TRUE;
3535         }
3536
3537         if(settings->op[stage].dst == tempreg) tempreg_used = TRUE;
3538         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
3539             tempreg_used = TRUE;
3540         }
3541
3542         if(settings->op[stage].aop == WINED3DTOP_DISABLE) continue;
3543         arg0 = settings->op[stage].aarg0 & WINED3DTA_SELECTMASK;
3544         arg1 = settings->op[stage].aarg1 & WINED3DTA_SELECTMASK;
3545         arg2 = settings->op[stage].aarg2 & WINED3DTA_SELECTMASK;
3546         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
3547         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
3548         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
3549
3550         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
3551             tempreg_used = TRUE;
3552         }
3553         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
3554             tfactor_used = TRUE;
3555         }
3556     }
3557
3558     /* Shader header */
3559     shader_buffer_init(&buffer);
3560
3561     shader_addline(&buffer, "!!ARBfp1.0\n");
3562
3563     switch(settings->fog) {
3564         case FOG_OFF:                                                         break;
3565         case FOG_LINEAR: shader_addline(&buffer, "OPTION ARB_fog_linear;\n"); break;
3566         case FOG_EXP:    shader_addline(&buffer, "OPTION ARB_fog_exp;\n");    break;
3567         case FOG_EXP2:   shader_addline(&buffer, "OPTION ARB_fog_exp2;\n");   break;
3568         default: FIXME("Unexpected fog setting %d\n", settings->fog);
3569     }
3570
3571     shader_addline(&buffer, "PARAM const = {1, 2, 4, 0.5};\n");
3572     shader_addline(&buffer, "TEMP TMP;\n");
3573     shader_addline(&buffer, "TEMP ret;\n");
3574     if(tempreg_used || settings->sRGB_write) shader_addline(&buffer, "TEMP tempreg;\n");
3575     shader_addline(&buffer, "TEMP arg0;\n");
3576     shader_addline(&buffer, "TEMP arg1;\n");
3577     shader_addline(&buffer, "TEMP arg2;\n");
3578     for(stage = 0; stage < MAX_TEXTURES; stage++) {
3579         if(!tex_read[stage]) continue;
3580         shader_addline(&buffer, "TEMP tex%u;\n", stage);
3581         if(!bump_used[stage]) continue;
3582         shader_addline(&buffer, "PARAM bumpmat%u = program.env[%u];\n", stage, ARB_FFP_CONST_BUMPMAT(stage));
3583         if(!luminance_used[stage]) continue;
3584         shader_addline(&buffer, "PARAM luminance%u = program.env[%u];\n", stage, ARB_FFP_CONST_LUMINANCE(stage));
3585     }
3586     if(tfactor_used) {
3587         shader_addline(&buffer, "PARAM tfactor = program.env[%u];\n", ARB_FFP_CONST_TFACTOR);
3588     }
3589         shader_addline(&buffer, "PARAM specular_enable = program.env[%u];\n", ARB_FFP_CONST_SPECULAR_ENABLE);
3590
3591     if(settings->sRGB_write) {
3592         shader_addline(&buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
3593                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
3594         shader_addline(&buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
3595                        srgb_sub_high, 0.0, 0.0, 0.0);
3596     }
3597
3598     /* Generate texture sampling instructions) */
3599     for(stage = 0; stage < MAX_TEXTURES && settings->op[stage].cop != WINED3DTOP_DISABLE; stage++) {
3600         if(!tex_read[stage]) continue;
3601
3602         switch(settings->op[stage].tex_type) {
3603             case tex_1d:                    textype = "1D";     break;
3604             case tex_2d:                    textype = "2D";     break;
3605             case tex_3d:                    textype = "3D";     break;
3606             case tex_cube:                  textype = "CUBE";   break;
3607             case tex_rect:                  textype = "RECT";   break;
3608             default: textype = "unexpected_textype";   break;
3609         }
3610
3611         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP ||
3612            settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
3613             sat = "";
3614         } else {
3615             sat = "_SAT";
3616         }
3617
3618         if(settings->op[stage].projected == proj_none) {
3619             instr = "TEX";
3620         } else if(settings->op[stage].projected == proj_count4 ||
3621                   settings->op[stage].projected == proj_count3) {
3622             instr = "TXP";
3623         } else {
3624             FIXME("Unexpected projection mode %d\n", settings->op[stage].projected);
3625             instr = "TXP";
3626         }
3627
3628         if(stage > 0 &&
3629            (settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAP ||
3630             settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE)) {
3631             shader_addline(&buffer, "SWZ arg1, bumpmat%u, x, z, 0, 0;\n", stage - 1);
3632             shader_addline(&buffer, "DP3 ret.x, arg1, tex%u;\n", stage - 1);
3633             shader_addline(&buffer, "SWZ arg1, bumpmat%u, y, w, 0, 0;\n", stage - 1);
3634             shader_addline(&buffer, "DP3 ret.y, arg1, tex%u;\n", stage - 1);
3635
3636             /* with projective textures, texbem only divides the static texture coord, not the displacement,
3637              * so multiply the displacement with the dividing parameter before passing it to TXP
3638              */
3639             if (settings->op[stage].projected != proj_none) {
3640                 if(settings->op[stage].projected == proj_count4) {
3641                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].w;\n", stage);
3642                     shader_addline(&buffer, "MUL ret.xyz, ret, fragment.texcoord[%u].w, fragment.texcoord[%u];\n", stage, stage);
3643                 } else {
3644                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].z;\n", stage);
3645                     shader_addline(&buffer, "MAD ret.xyz, ret, fragment.texcoord[%u].z, fragment.texcoord[%u];\n", stage, stage);
3646                 }
3647             } else {
3648                 shader_addline(&buffer, "ADD ret, ret, fragment.texcoord[%u];\n", stage);
3649             }
3650
3651             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
3652                            instr, sat, stage, stage, textype);
3653             if(settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
3654                 shader_addline(&buffer, "MAD_SAT ret.x, tex%u.z, luminance%u.x, luminance%u.y;\n",
3655                                stage - 1, stage - 1, stage - 1);
3656                 shader_addline(&buffer, "MUL tex%u, tex%u, ret.x;\n", stage, stage);
3657             }
3658         } else if(settings->op[stage].projected == proj_count3) {
3659             shader_addline(&buffer, "MOV ret, fragment.texcoord[%u];\n", stage);
3660             shader_addline(&buffer, "MOV ret.w, ret.z;\n");
3661             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
3662                             instr, sat, stage, stage, textype);
3663         } else {
3664             shader_addline(&buffer, "%s%s tex%u, fragment.texcoord[%u], texture[%u], %s;\n",
3665                             instr, sat, stage, stage, stage, textype);
3666         }
3667
3668         sprintf(colorcor_dst, "tex%u", stage);
3669         gen_color_correction(&buffer, colorcor_dst, WINED3DSP_WRITEMASK_ALL, "const.x", "const.y",
3670                 settings->op[stage].color_fixup);
3671     }
3672
3673     /* Generate the main shader */
3674     for(stage = 0; stage < MAX_TEXTURES; stage++) {
3675         if(settings->op[stage].cop == WINED3DTOP_DISABLE) {
3676             if(stage == 0) {
3677                 final_combiner_src = "fragment.color.primary";
3678             }
3679             break;
3680         }
3681
3682         if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
3683            settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
3684             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg1;
3685         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
3686                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
3687             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg2;
3688         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
3689                   settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
3690             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg1;
3691         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
3692                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
3693             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg2;
3694         } else {
3695             op_equal = settings->op[stage].aop   == settings->op[stage].cop &&
3696                        settings->op[stage].carg0 == settings->op[stage].aarg0 &&
3697                        settings->op[stage].carg1 == settings->op[stage].aarg1 &&
3698                        settings->op[stage].carg2 == settings->op[stage].aarg2;
3699         }
3700
3701         if(settings->op[stage].aop == WINED3DTOP_DISABLE) {
3702             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
3703                           settings->op[stage].cop, settings->op[stage].carg0,
3704                           settings->op[stage].carg1, settings->op[stage].carg2);
3705             if(stage == 0) {
3706                 shader_addline(&buffer, "MOV ret.w, fragment.color.primary.w;\n");
3707             }
3708         } else if(op_equal) {
3709             gen_ffp_instr(&buffer, stage, TRUE, TRUE, settings->op[stage].dst,
3710                           settings->op[stage].cop, settings->op[stage].carg0,
3711                           settings->op[stage].carg1, settings->op[stage].carg2);
3712         } else {
3713             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
3714                           settings->op[stage].cop, settings->op[stage].carg0,
3715                           settings->op[stage].carg1, settings->op[stage].carg2);
3716             gen_ffp_instr(&buffer, stage, FALSE, TRUE, settings->op[stage].dst,
3717                           settings->op[stage].aop, settings->op[stage].aarg0,
3718                           settings->op[stage].aarg1, settings->op[stage].aarg2);
3719         }
3720     }
3721
3722     if(settings->sRGB_write) {
3723         shader_addline(&buffer, "MAD ret, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
3724         arbfp_add_sRGB_correction(&buffer, "ret", "arg0", "arg1", "arg2", "tempreg", FALSE);
3725     } else {
3726         shader_addline(&buffer, "MAD result.color, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
3727     }
3728
3729     /* Footer */
3730     shader_addline(&buffer, "END\n");
3731
3732     /* Generate the shader */
3733     GL_EXTCALL(glGenProgramsARB(1, &ret));
3734     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, ret));
3735     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(buffer.buffer), buffer.buffer));
3736
3737     if (glGetError() == GL_INVALID_OPERATION) {
3738         GLint pos;
3739         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
3740         FIXME("Fragment program error at position %d: %s\n", pos,
3741               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3742     }
3743     shader_buffer_free(&buffer);
3744     return ret;
3745 }
3746
3747 static void fragment_prog_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3748     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
3749     struct shader_arb_priv *priv = device->fragment_priv;
3750     BOOL use_pshader = use_ps(stateblock);
3751     BOOL use_vshader = use_vs(stateblock);
3752     struct ffp_frag_settings settings;
3753     const struct arbfp_ffp_desc *desc;
3754     unsigned int i;
3755
3756     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
3757
3758     if(isStateDirty(context, STATE_RENDER(WINED3DRS_FOGENABLE))) {
3759         if(!use_pshader && device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
3760             /* Reload fixed function constants since they collide with the pixel shader constants */
3761             for(i = 0; i < MAX_TEXTURES; i++) {
3762                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
3763             }
3764             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
3765             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
3766         } else if(use_pshader && !isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
3767             device->shader_backend->shader_select((IWineD3DDevice *)stateblock->wineD3DDevice, use_pshader, use_vshader);
3768         }
3769         return;
3770     }
3771
3772     if(!use_pshader) {
3773         /* Find or create a shader implementing the fixed function pipeline settings, then activate it */
3774         gen_ffp_frag_op(stateblock, &settings, FALSE);
3775         desc = (const struct arbfp_ffp_desc *)find_ffp_frag_shader(&priv->fragment_shaders, &settings);
3776         if(!desc) {
3777             struct arbfp_ffp_desc *new_desc = HeapAlloc(GetProcessHeap(), 0, sizeof(*new_desc));
3778             if (!new_desc)
3779             {
3780                 ERR("Out of memory\n");
3781                 return;
3782             }
3783             new_desc->num_textures_used = 0;
3784             for(i = 0; i < GL_LIMITS(texture_stages); i++) {
3785                 if(settings.op[i].cop == WINED3DTOP_DISABLE) break;
3786                 new_desc->num_textures_used = i;
3787             }
3788
3789             memcpy(&new_desc->parent.settings, &settings, sizeof(settings));
3790             new_desc->shader = gen_arbfp_ffp_shader(&settings, stateblock);
3791             add_ffp_frag_shader(&priv->fragment_shaders, &new_desc->parent);
3792             TRACE("Allocated fixed function replacement shader descriptor %p\n", new_desc);
3793             desc = new_desc;
3794         }
3795
3796         /* Now activate the replacement program. GL_FRAGMENT_PROGRAM_ARB is already active(however, note the
3797          * comment above the shader_select call below). If e.g. GLSL is active, the shader_select call will
3798          * deactivate it.
3799          */
3800         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader));
3801         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader)");
3802         priv->current_fprogram_id = desc->shader;
3803
3804         if(device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
3805             /* Reload fixed function constants since they collide with the pixel shader constants */
3806             for(i = 0; i < MAX_TEXTURES; i++) {
3807                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
3808             }
3809             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
3810             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
3811         }
3812         context->last_was_pshader = FALSE;
3813     } else {
3814         context->last_was_pshader = TRUE;
3815     }
3816
3817     /* Finally, select the shader. If a pixel shader is used, it will be set and enabled by the shader backend.
3818      * If this shader backend is arbfp(most likely), then it will simply overwrite the last fixed function replace-
3819      * ment shader. If the shader backend is not ARB, it currently is important that the opengl implementation
3820      * type overwrites GL_ARB_fragment_program. This is currently the case with GLSL. If we really want to use
3821      * atifs or nvrc pixel shaders with arb fragment programs we'd have to disable GL_FRAGMENT_PROGRAM_ARB here
3822      *
3823      * Don't call shader_select if the vertex shader is dirty, because it will be called later on by the vertex
3824      * shader handler
3825      */
3826     if(!isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
3827         device->shader_backend->shader_select((IWineD3DDevice *)stateblock->wineD3DDevice, use_pshader, use_vshader);
3828
3829         if (!isStateDirty(context, STATE_VERTEXSHADERCONSTANT) && (use_vshader || use_pshader)) {
3830             device->StateTable[STATE_VERTEXSHADERCONSTANT].apply(STATE_VERTEXSHADERCONSTANT, stateblock, context);
3831         }
3832     }
3833     if(use_pshader) {
3834         device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
3835     }
3836 }
3837
3838 /* We can't link the fog states to the fragment state directly since the vertex pipeline links them
3839  * to FOGENABLE. A different linking in different pipeline parts can't be expressed in the combined
3840  * state table, so we need to handle that with a forwarding function. The other invisible side effect
3841  * is that changing the fog start and fog end(which links to FOGENABLE in vertex) results in the
3842  * fragment_prog_arbfp function being called because FOGENABLE is dirty, which calls this function here
3843  */
3844 static void state_arbfp_fog(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3845     enum fogsource new_source;
3846
3847     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
3848
3849     if(!isStateDirty(context, STATE_PIXELSHADER)) {
3850         fragment_prog_arbfp(state, stateblock, context);
3851     }
3852
3853     if(!stateblock->renderState[WINED3DRS_FOGENABLE]) return;
3854
3855     if(stateblock->renderState[WINED3DRS_FOGTABLEMODE] == WINED3DFOG_NONE) {
3856         if(use_vs(stateblock)) {
3857             new_source = FOGSOURCE_VS;
3858         } else {
3859             if(stateblock->renderState[WINED3DRS_FOGVERTEXMODE] == WINED3DFOG_NONE || context->last_was_rhw) {
3860                 new_source = FOGSOURCE_COORD;
3861             } else {
3862                 new_source = FOGSOURCE_FFP;
3863             }
3864         }
3865     } else {
3866         new_source = FOGSOURCE_FFP;
3867     }
3868     if(new_source != context->fog_source) {
3869         context->fog_source = new_source;
3870         state_fogstartend(STATE_RENDER(WINED3DRS_FOGSTART), stateblock, context);
3871     }
3872 }
3873
3874 static void textransform(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
3875     if(!isStateDirty(context, STATE_PIXELSHADER)) {
3876         fragment_prog_arbfp(state, stateblock, context);
3877     }
3878 }
3879
3880 #undef GLINFO_LOCATION
3881
3882 static const struct StateEntryTemplate arbfp_fragmentstate_template[] = {
3883     {STATE_RENDER(WINED3DRS_TEXTUREFACTOR),               { STATE_RENDER(WINED3DRS_TEXTUREFACTOR),              state_texfactor_arbfp   }, WINED3D_GL_EXT_NONE             },
3884     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3885     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3886     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3887     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3888     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3889     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3890     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3891     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3892     {STATE_TEXTURESTAGE(0, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3893     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3894     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3895     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3896     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3897     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3898     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3899     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3900     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3901     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3902     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3903     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3904     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3905     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3906     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3907     {STATE_TEXTURESTAGE(1, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3908     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3909     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3910     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3911     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3912     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3913     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3914     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3915     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3916     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3917     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3918     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3919     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3920     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3921     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3922     {STATE_TEXTURESTAGE(2, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3923     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3924     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3925     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3926     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3927     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3928     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3929     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3930     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3931     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3932     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3933     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3934     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3935     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3936     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3937     {STATE_TEXTURESTAGE(3, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3938     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3939     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3940     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3941     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3942     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3943     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3944     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3945     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3946     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3947     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3948     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3949     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3950     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3951     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3952     {STATE_TEXTURESTAGE(4, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3953     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3954     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3955     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3956     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3957     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3958     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3959     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3960     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3961     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3962     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3963     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3964     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3965     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3966     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3967     {STATE_TEXTURESTAGE(5, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3968     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3969     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3970     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3971     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3972     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3973     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3974     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3975     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3976     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3977     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3978     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3979     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3980     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3981     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3982     {STATE_TEXTURESTAGE(6, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3983     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3984     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3985     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3986     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3987     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3988     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
3989     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3990     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3991     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3992     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3993     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3994     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3995     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3996     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3997     {STATE_TEXTURESTAGE(7, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
3998     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
3999     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
4000     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
4001     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
4002     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
4003     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
4004     {STATE_SAMPLER(0),                                    { STATE_SAMPLER(0),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4005     {STATE_SAMPLER(1),                                    { STATE_SAMPLER(1),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4006     {STATE_SAMPLER(2),                                    { STATE_SAMPLER(2),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4007     {STATE_SAMPLER(3),                                    { STATE_SAMPLER(3),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4008     {STATE_SAMPLER(4),                                    { STATE_SAMPLER(4),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4009     {STATE_SAMPLER(5),                                    { STATE_SAMPLER(5),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4010     {STATE_SAMPLER(6),                                    { STATE_SAMPLER(6),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4011     {STATE_SAMPLER(7),                                    { STATE_SAMPLER(7),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
4012     {STATE_PIXELSHADER,                                   { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
4013     {STATE_RENDER(WINED3DRS_FOGENABLE),                   { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
4014     {STATE_RENDER(WINED3DRS_FOGTABLEMODE),                { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
4015     {STATE_RENDER(WINED3DRS_FOGVERTEXMODE),               { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
4016     {STATE_RENDER(WINED3DRS_FOGSTART),                    { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
4017     {STATE_RENDER(WINED3DRS_FOGEND),                      { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
4018     {STATE_RENDER(WINED3DRS_SRGBWRITEENABLE),             { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
4019     {STATE_RENDER(WINED3DRS_FOGCOLOR),                    { STATE_RENDER(WINED3DRS_FOGCOLOR),                   state_fogcolor          }, WINED3D_GL_EXT_NONE             },
4020     {STATE_RENDER(WINED3DRS_FOGDENSITY),                  { STATE_RENDER(WINED3DRS_FOGDENSITY),                 state_fogdensity        }, WINED3D_GL_EXT_NONE             },
4021     {STATE_TEXTURESTAGE(0,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(0, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4022     {STATE_TEXTURESTAGE(1,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(1, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4023     {STATE_TEXTURESTAGE(2,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(2, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4024     {STATE_TEXTURESTAGE(3,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(3, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4025     {STATE_TEXTURESTAGE(4,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(4, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4026     {STATE_TEXTURESTAGE(5,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(5, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4027     {STATE_TEXTURESTAGE(6,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(6, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4028     {STATE_TEXTURESTAGE(7,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(7, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
4029     {STATE_RENDER(WINED3DRS_SPECULARENABLE),              { STATE_RENDER(WINED3DRS_SPECULARENABLE),             state_arb_specularenable}, WINED3D_GL_EXT_NONE             },
4030     {0 /* Terminate */,                                   { 0,                                                  0                       }, WINED3D_GL_EXT_NONE             },
4031 };
4032
/* Fragment pipeline descriptor of the ARB fragment program (arbfp) backend.
 * It bundles the arbfp_* callbacks, the shared ARB color fixup query and the
 * arbfp_fragmentstate_template state table into one struct fragment_pipeline.
 * The initializers are positional, so their order has to follow the member
 * order of struct fragment_pipeline, which is declared outside this file. */
4033 const struct fragment_pipeline arbfp_fragment_pipeline = {
4034     arbfp_enable,
4035     arbfp_get_caps,
4036     arbfp_alloc,
4037     arbfp_free,
4038     shader_arb_color_fixup_supported,
4039     arbfp_fragmentstate_template,
4040     TRUE /* We can disable projected textures */
4041 };
4042
/* From here on GLINFO_LOCATION resolves through a variable named "device", so
 * every function below that relies on it needs such a variable in scope.
 * NOTE(review): presumably consumed by GL_EXTCALL() and related macros - confirm
 * in wined3d_private.h. */
4043 #define GLINFO_LOCATION device->adapter->gl_info
4044
4045 struct arbfp_blit_priv {
4046     GLenum yuy2_rect_shader, yuy2_2d_shader;
4047     GLenum uyvy_rect_shader, uyvy_2d_shader;
4048     GLenum yv12_rect_shader, yv12_2d_shader;
4049 };
4050
4051 static HRESULT arbfp_blit_alloc(IWineD3DDevice *iface) {
4052     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
4053     device->blit_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct arbfp_blit_priv));
4054     if(!device->blit_priv) {
4055         ERR("Out of memory\n");
4056         return E_OUTOFMEMORY;
4057     }
4058     return WINED3D_OK;
4059 }
4060 static void arbfp_blit_free(IWineD3DDevice *iface) {
4061     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
4062     struct arbfp_blit_priv *priv = device->blit_priv;
4063
4064     ENTER_GL();
4065     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_rect_shader));
4066     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_2d_shader));
4067     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_rect_shader));
4068     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_2d_shader));
4069     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_rect_shader));
4070     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_2d_shader));
4071     checkGLcall("Delete yuv programs\n");
4072     LEAVE_GL();
4073 }
4074
4075 static BOOL gen_planar_yuv_read(SHADER_BUFFER *buffer, enum yuv_fixup yuv_fixup, GLenum textype, char *luminance)
4076 {
4077     char chroma;
4078     const char *tex, *texinstr;
4079
4080     if (yuv_fixup == YUV_FIXUP_UYVY) {
4081         chroma = 'x';
4082         *luminance = 'w';
4083     } else {
4084         chroma = 'w';
4085         *luminance = 'x';
4086     }
4087     switch(textype) {
4088         case GL_TEXTURE_2D:             tex = "2D";     texinstr = "TXP"; break;
4089         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   texinstr = "TEX"; break;
4090         default:
4091             /* This is more tricky than just replacing the texture type - we have to navigate
4092              * properly in the texture to find the correct chroma values
4093              */
4094             FIXME("Implement yuv correction for non-2d, non-rect textures\n");
4095             return FALSE;
4096     }
4097
4098     /* First we have to read the chroma values. This means we need at least two pixels(no filtering),
4099      * or 4 pixels(with filtering). To get the unmodified chromas, we have to rid ourselves of the
4100      * filtering when we sample the texture.
4101      *
4102      * These are the rules for reading the chroma:
4103      *
4104      * Even pixel: Cr
4105      * Even pixel: U
4106      * Odd pixel: V
4107      *
4108      * So we have to get the sampling x position in non-normalized coordinates in integers
4109      */
4110     if(textype != GL_TEXTURE_RECTANGLE_ARB) {
4111         shader_addline(buffer, "MUL texcrd.xy, fragment.texcoord[0], size.x;\n");
4112         shader_addline(buffer, "MOV texcrd.w, size.x;\n");
4113     } else {
4114         shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
4115     }
4116     /* We must not allow filtering between pixel x and x+1, this would mix U and V
4117      * Vertical filtering is ok. However, bear in mind that the pixel center is at
4118      * 0.5, so add 0.5.
4119      */
4120     shader_addline(buffer, "FLR texcrd.x, texcrd.x;\n");
4121     shader_addline(buffer, "ADD texcrd.x, texcrd.x, coef.y;\n");
4122
4123     /* Divide the x coordinate by 0.5 and get the fraction. This gives 0.25 and 0.75 for the
4124      * even and odd pixels respectively
4125      */
4126     shader_addline(buffer, "MUL texcrd2, texcrd, coef.y;\n");
4127     shader_addline(buffer, "FRC texcrd2, texcrd2;\n");
4128
4129     /* Sample Pixel 1 */
4130     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
4131
4132     /* Put the value into either of the chroma values */
4133     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
4134     shader_addline(buffer, "MUL chroma.x, luminance.%c, temp.x;\n", chroma);
4135     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
4136     shader_addline(buffer, "MUL chroma.y, luminance.%c, temp.x;\n", chroma);
4137
4138     /* Sample pixel 2. If we read an even pixel(SLT above returned 1), sample
4139      * the pixel right to the current one. Otherwise, sample the left pixel.
4140      * Bias and scale the SLT result to -1;1 and add it to the texcrd.x.
4141      */
4142     shader_addline(buffer, "MAD temp.x, temp.x, coef.z, -coef.x;\n");
4143     shader_addline(buffer, "ADD texcrd.x, texcrd, temp.x;\n");
4144     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
4145
4146     /* Put the value into the other chroma */
4147     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
4148     shader_addline(buffer, "MAD chroma.y, luminance.%c, temp.x, chroma.y;\n", chroma);
4149     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
4150     shader_addline(buffer, "MAD chroma.x, luminance.%c, temp.x, chroma.x;\n", chroma);
4151
4152     /* TODO: If filtering is enabled, sample a 2nd pair of pixels left or right of
4153      * the current one and lerp the two U and V values
4154      */
4155
4156     /* This gives the correctly filtered luminance value */
4157     shader_addline(buffer, "TEX luminance, fragment.texcoord[0], texture[0], %s;\n", tex);
4158
4159     return TRUE;
4160 }
4161
4162 static BOOL gen_yv12_read(SHADER_BUFFER *buffer, GLenum textype, char *luminance)
4163 {
4164     const char *tex;
4165
4166     switch(textype) {
4167         case GL_TEXTURE_2D:             tex = "2D";     break;
4168         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   break;
4169         default:
4170             FIXME("Implement yv12 correction for non-2d, non-rect textures\n");
4171             return FALSE;
4172     }
4173
4174     /* YV12 surfaces contain a WxH sized luminance plane, followed by a (W/2)x(H/2)
4175      * V and a (W/2)x(H/2) U plane, each with 8 bit per pixel. So the effective
4176      * bitdepth is 12 bits per pixel. Since the U and V planes have only half the
4177      * pitch of the luminance plane, the packing into the gl texture is a bit
4178      * unfortunate. If the whole texture is interpreted as luminance data it looks
4179      * approximately like this:
4180      *
4181      *        +----------------------------------+----
4182      *        |                                  |
4183      *        |                                  |
4184      *        |                                  |
4185      *        |                                  |
4186      *        |                                  |   2
4187      *        |            LUMINANCE             |   -
4188      *        |                                  |   3
4189      *        |                                  |
4190      *        |                                  |
4191      *        |                                  |
4192      *        |                                  |
4193      *        +----------------+-----------------+----
4194      *        |                |                 |
4195      *        |  U even rows   |  U odd rows     |
4196      *        |                |                 |   1
4197      *        +----------------+------------------   -
4198      *        |                |                 |   3
4199      *        |  V even rows   |  V odd rows     |
4200      *        |                |                 |
4201      *        +----------------+-----------------+----
4202      *        |                |                 |
4203      *        |     0.5        |       0.5       |
4204      *
4205      * So it appears as if there are 4 chroma images, but in fact the odd rows
4206      * in the chroma images are in the same row as the even ones. So its is
4207      * kinda tricky to read
4208      *
4209      * When reading from rectangle textures, keep in mind that the input y coordinates
4210      * go from 0 to d3d_height, whereas the opengl texture height is 1.5 * d3d_height
4211      */
4212     shader_addline(buffer, "PARAM yv12_coef = {%f, %f, %f, %f};\n",
4213                    2.0 / 3.0, 1.0 / 6.0, (2.0 / 3.0) + (1.0 / 6.0), 1.0 / 3.0);
4214
4215     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
4216     /* the chroma planes have only half the width */
4217     shader_addline(buffer, "MUL texcrd.x, texcrd.x, coef.y;\n");
4218
4219     /* The first value is between 2/3 and 5/6th of the texture's height, so scale+bias
4220      * the coordinate. Also read the right side of the image when reading odd lines
4221      *
4222      * Don't forget to clamp the y values in into the range, otherwise we'll get filtering
4223      * bleeding
4224      */
4225     if(textype == GL_TEXTURE_2D) {
4226
4227         shader_addline(buffer, "RCP chroma.w, size.y;\n");
4228
4229         shader_addline(buffer, "MUL texcrd2.y, texcrd.y, size.y;\n");
4230
4231         shader_addline(buffer, "FLR texcrd2.y, texcrd2.y;\n");
4232         shader_addline(buffer, "MAD texcrd.y, texcrd.y, yv12_coef.y, yv12_coef.x;\n");
4233
4234         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
4235         shader_addline(buffer, "ADD texcrd2.x, texcrd2.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
4236         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
4237         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
4238         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
4239
4240         /* clamp, keep the half pixel origin in mind */
4241         shader_addline(buffer, "MAD temp.y, coef.y, chroma.w, yv12_coef.x;\n");
4242         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
4243         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.z;\n");
4244         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
4245     } else {
4246         /* Read from [size - size+size/4] */
4247         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
4248         shader_addline(buffer, "MAD texcrd.y, texcrd.y, coef.w, size.y;\n");
4249
4250         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
4251         shader_addline(buffer, "ADD texcrd2.x, texcrd.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
4252         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
4253         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
4254         shader_addline(buffer, "MUL texcrd2.x, texcrd2.x, size.x;\n");
4255         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
4256
4257         /* Make sure to read exactly from the pixel center */
4258         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
4259         shader_addline(buffer, "ADD texcrd.y, texcrd.y, coef.y;\n");
4260
4261         /* Clamp */
4262         shader_addline(buffer, "MAD temp.y, size.y, coef.w, size.y;\n");
4263         shader_addline(buffer, "ADD temp.y, temp.y, -coef.y;\n");
4264         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
4265         shader_addline(buffer, "ADD temp.y, size.y, -coef.y;\n");
4266         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
4267     }
4268     /* Read the texture, put the result into the output register */
4269     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
4270     shader_addline(buffer, "MOV chroma.x, temp.w;\n");
4271
4272     /* The other chroma value is 1/6th of the texture lower, from 5/6th to 6/6th
4273      * No need to clamp because we're just reusing the already clamped value from above
4274      */
4275     if(textype == GL_TEXTURE_2D) {
4276         shader_addline(buffer, "ADD texcrd.y, texcrd.y, yv12_coef.y;\n");
4277     } else {
4278         shader_addline(buffer, "MAD texcrd.y, size.y, coef.w, texcrd.y;\n");
4279     }
4280     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
4281     shader_addline(buffer, "MOV chroma.y, temp.w;\n");
4282
4283     /* Sample the luminance value. It is in the top 2/3rd of the texture, so scale the y coordinate.
4284      * Clamp the y coordinate to prevent the chroma values from bleeding into the sampled luminance
4285      * values due to filtering
4286      */
4287     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
4288     if(textype == GL_TEXTURE_2D) {
4289         /* Multiply the y coordinate by 2/3 and clamp it */
4290         shader_addline(buffer, "MUL texcrd.y, texcrd.y, yv12_coef.x;\n");
4291         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.x;\n");
4292         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
4293         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
4294     } else {
4295         /* Reading from texture_rectangles is pretty straightforward, just use the unmodified
4296          * texture coordinate. It is still a good idea to clamp it though, since the opengl texture
4297          * is bigger
4298          */
4299         shader_addline(buffer, "ADD temp.x, size.y, -coef.y;\n");
4300         shader_addline(buffer, "MIN texcrd.y, texcrd.y, size.x;\n");
4301         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
4302     }
4303     *luminance = 'a';
4304
4305     return TRUE;
4306 }
4307
4308 static GLuint gen_yuv_shader(IWineD3DDeviceImpl *device, enum yuv_fixup yuv_fixup, GLenum textype)
4309 {
4310     GLenum shader;
4311     SHADER_BUFFER buffer;
4312     char luminance_component;
4313     struct arbfp_blit_priv *priv = device->blit_priv;
4314
4315     /* Shader header */
4316     shader_buffer_init(&buffer);
4317
4318     ENTER_GL();
4319     GL_EXTCALL(glGenProgramsARB(1, &shader));
4320     checkGLcall("GL_EXTCALL(glGenProgramsARB(1, &shader))");
4321     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
4322     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
4323     LEAVE_GL();
4324     if(!shader) {
4325         shader_buffer_free(&buffer);
4326         return 0;
4327     }
4328
4329     /* The YUY2 and UYVY formats contain two pixels packed into a 32 bit macropixel,
4330      * giving effectively 16 bit per pixel. The color consists of a luminance(Y) and
4331      * two chroma(U and V) values. Each macropixel has two luminance values, one for
4332      * each single pixel it contains, and one U and one V value shared between both
4333      * pixels.
4334      *
4335      * The data is loaded into an A8L8 texture. With YUY2, the luminance component
4336      * contains the luminance and alpha the chroma. With UYVY it is vice versa. Thus
4337      * take the format into account when generating the read swizzles
4338      *
4339      * Reading the Y value is straightforward - just sample the texture. The hardware
4340      * takes care of filtering in the horizontal and vertical direction.
4341      *
4342      * Reading the U and V values is harder. We have to avoid filtering horizontally,
4343      * because that would mix the U and V values of one pixel or two adjacent pixels.
4344      * Thus floor the texture coordinate and add 0.5 to get an unfiltered read,
4345      * regardless of the filtering setting. Vertical filtering works automatically
4346      * though - the U and V values of two rows are mixed nicely.
4347      *
4348      * Appart of avoiding filtering issues, the code has to know which value it just
4349      * read, and where it can find the other one. To determine this, it checks if
4350      * it sampled an even or odd pixel, and shifts the 2nd read accordingly.
4351      *
4352      * Handling horizontal filtering of U and V values requires reading a 2nd pair
4353      * of pixels, extracting U and V and mixing them. This is not implemented yet.
4354      *
4355      * An alternative implementation idea is to load the texture as A8R8G8B8 texture,
4356      * with width / 2. This way one read gives all 3 values, finding U and V is easy
4357      * in an unfiltered situation. Finding the luminance on the other hand requires
4358      * finding out if it is an odd or even pixel. The real drawback of this approach
4359      * is filtering. This would have to be emulated completely in the shader, reading
4360      * up two 2 packed pixels in up to 2 rows and interpolating both horizontally and
4361      * vertically. Beyond that it would require adjustments to the texture handling
4362      * code to deal with the width scaling
4363      */
4364     shader_addline(&buffer, "!!ARBfp1.0\n");
4365     shader_addline(&buffer, "TEMP luminance;\n");
4366     shader_addline(&buffer, "TEMP temp;\n");
4367     shader_addline(&buffer, "TEMP chroma;\n");
4368     shader_addline(&buffer, "TEMP texcrd;\n");
4369     shader_addline(&buffer, "TEMP texcrd2;\n");
4370     shader_addline(&buffer, "PARAM coef = {1.0, 0.5, 2.0, 0.25};\n");
4371     shader_addline(&buffer, "PARAM yuv_coef = {1.403, 0.344, 0.714, 1.770};\n");
4372     shader_addline(&buffer, "PARAM size = program.local[0];\n");
4373
4374     switch (yuv_fixup)
4375     {
4376         case YUV_FIXUP_UYVY:
4377         case YUV_FIXUP_YUY2:
4378             if (!gen_planar_yuv_read(&buffer, yuv_fixup, textype, &luminance_component))
4379             {
4380                 shader_buffer_free(&buffer);
4381                 return 0;
4382             }
4383             break;
4384
4385         case YUV_FIXUP_YV12:
4386             if (!gen_yv12_read(&buffer, textype, &luminance_component))
4387             {
4388                 shader_buffer_free(&buffer);
4389                 return 0;
4390             }
4391             break;
4392
4393         default:
4394             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
4395             shader_buffer_free(&buffer);
4396             return 0;
4397     }
4398
4399     /* Calculate the final result. Formula is taken from
4400      * http://www.fourcc.org/fccyvrgb.php. Note that the chroma
4401      * ranges from -0.5 to 0.5
4402      */
4403     shader_addline(&buffer, "SUB chroma.xy, chroma, coef.y;\n");
4404
4405     shader_addline(&buffer, "MAD result.color.x, chroma.x, yuv_coef.x, luminance.%c;\n", luminance_component);
4406     shader_addline(&buffer, "MAD temp.x, -chroma.y, yuv_coef.y, luminance.%c;\n", luminance_component);
4407     shader_addline(&buffer, "MAD result.color.y, -chroma.x, yuv_coef.z, temp.x;\n");
4408     shader_addline(&buffer, "MAD result.color.z, chroma.y, yuv_coef.w, luminance.%c;\n", luminance_component);
4409     shader_addline(&buffer, "END\n");
4410
4411     ENTER_GL();
4412     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(buffer.buffer), buffer.buffer));
4413
4414     if (glGetError() == GL_INVALID_OPERATION) {
4415         GLint pos;
4416         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
4417         FIXME("Fragment program error at position %d: %s\n", pos,
4418               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
4419     }
4420     shader_buffer_free(&buffer);
4421     LEAVE_GL();
4422
4423     switch (yuv_fixup)
4424     {
4425         case YUV_FIXUP_YUY2:
4426             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yuy2_rect_shader = shader;
4427             else priv->yuy2_2d_shader = shader;
4428             break;
4429
4430         case YUV_FIXUP_UYVY:
4431             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->uyvy_rect_shader = shader;
4432             else priv->uyvy_2d_shader = shader;
4433             break;
4434
4435         case YUV_FIXUP_YV12:
4436             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yv12_rect_shader = shader;
4437             else priv->yv12_2d_shader = shader;
4438             break;
4439     }
4440
4441     return shader;
4442 }
4443
4444 static HRESULT arbfp_blit_set(IWineD3DDevice *iface, const struct GlPixelFormatDesc *format_desc,
4445         GLenum textype, UINT width, UINT height)
4446 {
4447     GLenum shader;
4448     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
4449     float size[4] = {width, height, 1, 1};
4450     struct arbfp_blit_priv *priv = device->blit_priv;
4451     enum yuv_fixup yuv_fixup;
4452
4453     if (!is_yuv_fixup(format_desc->color_fixup))
4454     {
4455         TRACE("Fixup:\n");
4456         dump_color_fixup_desc(format_desc->color_fixup);
4457         /* Don't bother setting up a shader for unconverted formats */
4458         ENTER_GL();
4459         glEnable(textype);
4460         checkGLcall("glEnable(textype)");
4461         LEAVE_GL();
4462         return WINED3D_OK;
4463     }
4464
4465     yuv_fixup = get_yuv_fixup(format_desc->color_fixup);
4466
4467     switch(yuv_fixup)
4468     {
4469         case YUV_FIXUP_YUY2:
4470             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yuy2_rect_shader : priv->yuy2_2d_shader;
4471             break;
4472
4473         case YUV_FIXUP_UYVY:
4474             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->uyvy_rect_shader : priv->uyvy_2d_shader;
4475             break;
4476
4477         case YUV_FIXUP_YV12:
4478             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yv12_rect_shader : priv->yv12_2d_shader;
4479             break;
4480
4481         default:
4482             FIXME("Unsupported YUV fixup %#x, not setting a shader\n", yuv_fixup);
4483             ENTER_GL();
4484             glEnable(textype);
4485             checkGLcall("glEnable(textype)");
4486             LEAVE_GL();
4487             return E_NOTIMPL;
4488     }
4489
4490     if (!shader) shader = gen_yuv_shader(device, yuv_fixup, textype);
4491
4492     ENTER_GL();
4493     glEnable(GL_FRAGMENT_PROGRAM_ARB);
4494     checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
4495     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
4496     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
4497     GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, size));
4498     checkGLcall("glProgramLocalParameter4fvARB");
4499     LEAVE_GL();
4500
4501     return WINED3D_OK;
4502 }
4503
4504 static void arbfp_blit_unset(IWineD3DDevice *iface) {
4505     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
4506
4507     ENTER_GL();
4508     glDisable(GL_FRAGMENT_PROGRAM_ARB);
4509     checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
4510     glDisable(GL_TEXTURE_2D);
4511     checkGLcall("glDisable(GL_TEXTURE_2D)");
4512     if(GL_SUPPORT(ARB_TEXTURE_CUBE_MAP)) {
4513         glDisable(GL_TEXTURE_CUBE_MAP_ARB);
4514         checkGLcall("glDisable(GL_TEXTURE_CUBE_MAP_ARB)");
4515     }
4516     if(GL_SUPPORT(ARB_TEXTURE_RECTANGLE)) {
4517         glDisable(GL_TEXTURE_RECTANGLE_ARB);
4518         checkGLcall("glDisable(GL_TEXTURE_RECTANGLE_ARB)");
4519     }
4520     LEAVE_GL();
4521 }
4522
4523 static BOOL arbfp_blit_color_fixup_supported(struct color_fixup_desc fixup)
4524 {
4525     enum yuv_fixup yuv_fixup;
4526
4527     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
4528     {
4529         TRACE("Checking support for fixup:\n");
4530         dump_color_fixup_desc(fixup);
4531     }
4532
4533     if (is_identity_fixup(fixup))
4534     {
4535         TRACE("[OK]\n");
4536         return TRUE;
4537     }
4538
4539     /* We only support YUV conversions. */
4540     if (!is_yuv_fixup(fixup))
4541     {
4542         TRACE("[FAILED]\n");
4543         return FALSE;
4544     }
4545
4546     yuv_fixup = get_yuv_fixup(fixup);
4547     switch(yuv_fixup)
4548     {
4549         case YUV_FIXUP_YUY2:
4550         case YUV_FIXUP_UYVY:
4551         case YUV_FIXUP_YV12:
4552             TRACE("[OK]\n");
4553             return TRUE;
4554
4555         default:
4556             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
4557             TRACE("[FAILED]\n");
4558             return FALSE;
4559     }
4560 }
4561
4562 const struct blit_shader arbfp_blit = {
4563     arbfp_blit_alloc,
4564     arbfp_blit_free,
4565     arbfp_blit_set,
4566     arbfp_blit_unset,
4567     arbfp_blit_color_fixup_supported,
4568 };
4569
4570 #undef GLINFO_LOCATION