wined3d: Cleanup IWineD3DDeviceImpl_UpdateTexture().
[wine] / dlls / wined3d / arb_program_shader.c
1 /*
2  * Pixel and vertex shaders implementation using ARB_vertex_program
3  * and ARB_fragment_program GL extensions.
4  *
5  * Copyright 2002-2003 Jason Edmeades
6  * Copyright 2002-2003 Raphael Junqueira
7  * Copyright 2004 Christian Costa
8  * Copyright 2005 Oliver Stieber
9  * Copyright 2006 Ivan Gyurdiev
10  * Copyright 2006 Jason Green
11  * Copyright 2006 Henri Verbeet
12  * Copyright 2007-2008 Stefan Dösinger for CodeWeavers
13  * Copyright 2009 Henri Verbeet for CodeWeavers
14  *
15  * This library is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU Lesser General Public
17  * License as published by the Free Software Foundation; either
18  * version 2.1 of the License, or (at your option) any later version.
19  *
20  * This library is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23  * Lesser General Public License for more details.
24  *
25  * You should have received a copy of the GNU Lesser General Public
26  * License along with this library; if not, write to the Free Software
27  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
28  */
29
30 #include "config.h"
31
32 #include <math.h>
33 #include <stdio.h>
34
35 #include "wined3d_private.h"
36
37 WINE_DEFAULT_DEBUG_CHANNEL(d3d_shader);
38 WINE_DECLARE_DEBUG_CHANNEL(d3d_constants);
39 WINE_DECLARE_DEBUG_CHANNEL(d3d_caps);
40 WINE_DECLARE_DEBUG_CHANNEL(d3d);
41
42 #define GLINFO_LOCATION      (*gl_info)
43
44 /* GL locking for state handlers is done by the caller. */
45 static BOOL need_mova_const(IWineD3DBaseShader *shader, const struct wined3d_gl_info *gl_info)
46 {
47     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *) shader;
48     if(!This->baseShader.reg_maps.usesmova) return FALSE;
49     return !GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
50 }
51
52 /* Returns TRUE if result.clip from GL_NV_vertex_program2 should be used and FALSE otherwise */
53 static inline BOOL use_nv_clip(const struct wined3d_gl_info *gl_info)
54 {
55     return GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION) &&
56            !(GLINFO_LOCATION.quirks & WINED3D_QUIRK_NV_CLIP_BROKEN);
57 }
58
59 static BOOL need_helper_const(const struct wined3d_gl_info *gl_info)
60 {
61     if (!GL_SUPPORT(NV_VERTEX_PROGRAM) /* Need to init colors. */
62         || gl_info->quirks & WINED3D_QUIRK_ARB_VS_OFFSET_LIMIT /* Load the immval offset. */
63         || gl_info->quirks & WINED3D_QUIRK_SET_TEXCOORD_W /* Have to init texcoords. */
64         || (!use_nv_clip(gl_info)) /* Init the clip texcoord */)
65     {
66         return TRUE;
67     }
68     return FALSE;
69 }
70
71 static unsigned int reserved_vs_const(IWineD3DBaseShader *shader, const struct wined3d_gl_info *gl_info)
72 {
73     unsigned int ret = 1;
74     /* We use one PARAM for the pos fixup, and in some cases one to load
75      * some immediate values into the shader
76      */
77     if(need_helper_const(gl_info)) ret++;
78     if(need_mova_const(shader, gl_info)) ret++;
79     return ret;
80 }
81
82 static inline BOOL ffp_clip_emul(IWineD3DStateBlockImpl *stateblock)
83 {
84     return stateblock->lowest_disabled_stage < 7;
85 }
86
/* Internally used shader constants. Applications can use constants 0 to GL_LIMITS(vshader_constantsF) - 1,
 * so upload them above that.
 *
 * Both expansions are fully parenthesized so that the macros keep their value
 * when used inside a larger expression (e.g. multiplied or used as an operand
 * of a higher-precedence operator).
 */
#define ARB_SHADER_PRIVCONST_BASE (GL_LIMITS(vshader_constantsF) - 1)
#define ARB_SHADER_PRIVCONST_POS (ARB_SHADER_PRIVCONST_BASE + 0)
92
93 /* ARB_program_shader private data */
94
95 struct control_frame
96 {
97     struct                          list entry;
98     enum
99     {
100         IF,
101         IFC,
102         LOOP,
103         REP
104     } type;
105     BOOL                            muting;
106     BOOL                            outer_loop;
107     union
108     {
109         unsigned int                loop_no;
110         unsigned int                ifc_no;
111     };
112     struct wined3d_shader_loop_control loop_control;
113     BOOL                            had_else;
114 };
115
116 struct arb_ps_np2fixup_info
117 {
118     struct ps_np2fixup_info         super;
119     /* For ARB we need a offset value:
120      * With both GLSL and ARB mode the NP2 fixup information (the texture dimensions) are stored in a
121      * consecutive way (GLSL uses a uniform array). Since ARB doesn't know the notion of a "standalone"
122      * array we need an offset to the index inside the program local parameter array. */
123     UINT                            offset;
124 };
125
126 struct arb_ps_compile_args
127 {
128     struct ps_compile_args          super;
129     WORD                            bools;
130     WORD                            clip;  /* only a boolean, use a WORD for alignment */
131     unsigned char                   loop_ctrl[MAX_CONST_I][3];
132 };
133
134 struct stb_const_desc
135 {
136     unsigned char           texunit;
137     UINT                    const_num;
138 };
139
140 struct arb_ps_compiled_shader
141 {
142     struct arb_ps_compile_args      args;
143     struct arb_ps_np2fixup_info     np2fixup_info;
144     struct stb_const_desc           bumpenvmatconst[MAX_TEXTURES];
145     struct stb_const_desc           luminanceconst[MAX_TEXTURES];
146     UINT                            int_consts[MAX_CONST_I];
147     GLuint                          prgId;
148     UINT                            ycorrection;
149     unsigned char                   numbumpenvmatconsts;
150     char                            num_int_consts;
151 };
152
153 struct arb_vs_compile_args
154 {
155     struct vs_compile_args          super;
156     union
157     {
158         struct
159         {
160             WORD                    bools;
161             char                    clip_texcoord;
162             char                    clipplane_mask;
163         }                           boolclip;
164         DWORD                       boolclip_compare;
165     };
166     DWORD                           ps_signature;
167     union
168     {
169         unsigned char               vertex_samplers[4];
170         DWORD                       vertex_samplers_compare;
171     };
172     unsigned char                   loop_ctrl[MAX_CONST_I][3];
173 };
174
175 struct arb_vs_compiled_shader
176 {
177     struct arb_vs_compile_args      args;
178     GLuint                          prgId;
179     UINT                            int_consts[MAX_CONST_I];
180     char                            num_int_consts;
181     char                            need_color_unclamp;
182     UINT                            pos_fixup;
183 };
184
185 struct recorded_instruction
186 {
187     struct wined3d_shader_instruction ins;
188     struct list entry;
189 };
190
191 struct shader_arb_ctx_priv
192 {
193     char addr_reg[20];
194     enum
195     {
196         /* plain GL_ARB_vertex_program or GL_ARB_fragment_program */
197         ARB,
198         /* GL_NV_vertex_progam2_option or GL_NV_fragment_program_option */
199         NV2,
200         /* GL_NV_vertex_program3 or GL_NV_fragment_program2 */
201         NV3
202     } target_version;
203
204     const struct arb_vs_compile_args    *cur_vs_args;
205     const struct arb_ps_compile_args    *cur_ps_args;
206     const struct arb_ps_compiled_shader *compiled_fprog;
207     const struct arb_vs_compiled_shader *compiled_vprog;
208     struct arb_ps_np2fixup_info         *cur_np2fixup_info;
209     struct list                         control_frames;
210     struct list                         record;
211     BOOL                                recording;
212     BOOL                                muted;
213     unsigned int                        num_loops, loop_depth, num_ifcs;
214     int                                 aL;
215
216     unsigned int                        vs_clipplanes;
217     BOOL                                footer_written;
218     BOOL                                in_main_func;
219
220     /* For 3.0 vertex shaders */
221     const char                          *vs_output[MAX_REG_OUTPUT];
222     /* For 2.x and earlier vertex shaders */
223     const char                          *texcrd_output[8], *color_output[2], *fog_output;
224
225     /* 3.0 pshader input for compatibility with fixed function */
226     const char                          *ps_input[MAX_REG_INPUT];
227 };
228
229 struct ps_signature
230 {
231     struct wined3d_shader_signature_element *sig;
232     DWORD                               idx;
233     struct wine_rb_entry                entry;
234 };
235
236 struct arb_pshader_private {
237     struct arb_ps_compiled_shader   *gl_shaders;
238     UINT                            num_gl_shaders, shader_array_size;
239     BOOL                            has_signature_idx;
240     DWORD                           input_signature_idx;
241     DWORD                           clipplane_emulation;
242     BOOL                            clamp_consts;
243 };
244
245 struct arb_vshader_private {
246     struct arb_vs_compiled_shader   *gl_shaders;
247     UINT                            num_gl_shaders, shader_array_size;
248 };
249
250 struct shader_arb_priv
251 {
252     GLuint                  current_vprogram_id;
253     GLuint                  current_fprogram_id;
254     const struct arb_ps_compiled_shader *compiled_fprog;
255     const struct arb_vs_compiled_shader *compiled_vprog;
256     GLuint                  depth_blt_vprogram_id;
257     GLuint                  depth_blt_fprogram_id[tex_type_count];
258     BOOL                    use_arbfp_fixed_func;
259     struct wine_rb_tree     fragment_shaders;
260     BOOL                    last_ps_const_clamped;
261     BOOL                    last_vs_color_unclamp;
262
263     struct wine_rb_tree     signature_tree;
264     DWORD ps_sig_number;
265 };
266
267 /********************************************************
268  * ARB_[vertex/fragment]_program helper functions follow
269  ********************************************************/
270
271 /* Loads floating point constants into the currently set ARB_vertex/fragment_program.
272  * When constant_list == NULL, it will load all the constants.
273  *
274  * @target_type should be either GL_VERTEX_PROGRAM_ARB (for vertex shaders)
275  *  or GL_FRAGMENT_PROGRAM_ARB (for pixel shaders)
276  */
277 /* GL locking is done by the caller */
278 static unsigned int shader_arb_load_constantsF(IWineD3DBaseShaderImpl *This, const struct wined3d_gl_info *gl_info,
279         GLuint target_type, unsigned int max_constants, const float *constants, char *dirty_consts)
280 {
281     local_constant* lconst;
282     DWORD i, j;
283     unsigned int ret;
284
285     if (TRACE_ON(d3d_constants))
286     {
287         for(i = 0; i < max_constants; i++) {
288             if(!dirty_consts[i]) continue;
289             TRACE_(d3d_constants)("Loading constants %i: %f, %f, %f, %f\n", i,
290                         constants[i * 4 + 0], constants[i * 4 + 1],
291                         constants[i * 4 + 2], constants[i * 4 + 3]);
292         }
293     }
294
295     i = 0;
296
297     /* In 1.X pixel shaders constants are implicitly clamped in the range [-1;1] */
298     if (target_type == GL_FRAGMENT_PROGRAM_ARB && This->baseShader.reg_maps.shader_version.major == 1)
299     {
300         float lcl_const[4];
301         /* ps 1.x supports only 8 constants, clamp only those. When switching between 1.x and higher
302          * shaders, the first 8 constants are marked dirty for reload
303          */
304         for(; i < min(8, max_constants); i++) {
305             if(!dirty_consts[i]) continue;
306             dirty_consts[i] = 0;
307
308             j = 4 * i;
309             if (constants[j + 0] > 1.0f) lcl_const[0] = 1.0f;
310             else if (constants[j + 0] < -1.0f) lcl_const[0] = -1.0f;
311             else lcl_const[0] = constants[j + 0];
312
313             if (constants[j + 1] > 1.0f) lcl_const[1] = 1.0f;
314             else if (constants[j + 1] < -1.0f) lcl_const[1] = -1.0f;
315             else lcl_const[1] = constants[j + 1];
316
317             if (constants[j + 2] > 1.0f) lcl_const[2] = 1.0f;
318             else if (constants[j + 2] < -1.0f) lcl_const[2] = -1.0f;
319             else lcl_const[2] = constants[j + 2];
320
321             if (constants[j + 3] > 1.0f) lcl_const[3] = 1.0f;
322             else if (constants[j + 3] < -1.0f) lcl_const[3] = -1.0f;
323             else lcl_const[3] = constants[j + 3];
324
325             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, lcl_const));
326         }
327
328         /* If further constants are dirty, reload them without clamping.
329          *
330          * The alternative is not to touch them, but then we cannot reset the dirty constant count
331          * to zero. That's bad for apps that only use PS 1.x shaders, because in that case the code
332          * above would always re-check the first 8 constants since max_constant remains at the init
333          * value
334          */
335     }
336
337     if(GL_SUPPORT(EXT_GPU_PROGRAM_PARAMETERS)) {
338         /* TODO: Benchmark if we're better of with finding the dirty constants ourselves,
339          * or just reloading *all* constants at once
340          *
341         GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, i, max_constants, constants + (i * 4)));
342          */
343         for(; i < max_constants; i++) {
344             if(!dirty_consts[i]) continue;
345
346             /* Find the next block of dirty constants */
347             dirty_consts[i] = 0;
348             j = i;
349             for(i++; (i < max_constants) && dirty_consts[i]; i++) {
350                 dirty_consts[i] = 0;
351             }
352
353             GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, j, i - j, constants + (j * 4)));
354         }
355     } else {
356         for(; i < max_constants; i++) {
357             if(dirty_consts[i]) {
358                 dirty_consts[i] = 0;
359                 GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, constants + (i * 4)));
360             }
361         }
362     }
363     checkGLcall("glProgramEnvParameter4fvARB()");
364
365     /* Load immediate constants */
366     if(This->baseShader.load_local_constsF) {
367         if (TRACE_ON(d3d_shader)) {
368             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
369                 GLfloat* values = (GLfloat*)lconst->value;
370                 TRACE_(d3d_constants)("Loading local constants %i: %f, %f, %f, %f\n", lconst->idx,
371                         values[0], values[1], values[2], values[3]);
372             }
373         }
374         /* Immediate constants are clamped for 1.X shaders at loading times */
375         ret = 0;
376         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
377             dirty_consts[lconst->idx] = 1; /* Dirtify so the non-immediate constant overwrites it next time */
378             ret = max(ret, lconst->idx + 1);
379             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, lconst->idx, (GLfloat*)lconst->value));
380         }
381         checkGLcall("glProgramEnvParameter4fvARB()");
382         return ret; /* The loaded immediate constants need reloading for the next shader */
383     } else {
384         return 0; /* No constants are dirty now */
385     }
386 }
387
388 /**
389  * Loads the texture dimensions for NP2 fixup into the currently set ARB_[vertex/fragment]_programs.
390  */
391 static void shader_arb_load_np2fixup_constants(
392     IWineD3DDevice* device,
393     char usePixelShader,
394     char useVertexShader) {
395
396     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl *) device;
397     const struct shader_arb_priv* const priv = (const struct shader_arb_priv *) deviceImpl->shader_priv;
398     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
399     const struct wined3d_gl_info *gl_info = &deviceImpl->adapter->gl_info;
400
401     if (!usePixelShader) {
402         /* NP2 texcoord fixup is (currently) only done for pixelshaders. */
403         return;
404     }
405
406     if (priv->compiled_fprog && priv->compiled_fprog->np2fixup_info.super.active) {
407         const struct arb_ps_np2fixup_info* const fixup = &priv->compiled_fprog->np2fixup_info;
408         UINT i;
409         WORD active = fixup->super.active;
410         GLfloat np2fixup_constants[4 * MAX_FRAGMENT_SAMPLERS];
411
412         for (i = 0; active; active >>= 1, ++i) {
413             const unsigned char idx = fixup->super.idx[i];
414             const IWineD3DTextureImpl* const tex = (const IWineD3DTextureImpl*) stateBlock->textures[i];
415             GLfloat* tex_dim = &np2fixup_constants[(idx >> 1) * 4];
416
417             if (!(active & 1)) continue;
418
419             if (!tex) {
420                 FIXME("Nonexistent texture is flagged for NP2 texcoord fixup\n");
421                 continue;
422             }
423
424             if (idx % 2) {
425                 tex_dim[2] = tex->baseTexture.pow2Matrix[0]; tex_dim[3] = tex->baseTexture.pow2Matrix[5];
426             } else {
427                 tex_dim[0] = tex->baseTexture.pow2Matrix[0]; tex_dim[1] = tex->baseTexture.pow2Matrix[5];
428             }
429         }
430
431         for (i = 0; i < fixup->super.num_consts; ++i) {
432             GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB,
433                                                    fixup->offset + i, &np2fixup_constants[i * 4]));
434         }
435     }
436 }
437
438 /* GL locking is done by the caller. */
439 static inline void shader_arb_ps_local_constants(IWineD3DDeviceImpl* deviceImpl)
440 {
441     const struct wined3d_context *context = context_get_current();
442     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
443     const struct wined3d_gl_info *gl_info = context->gl_info;
444     unsigned char i;
445     struct shader_arb_priv *priv = deviceImpl->shader_priv;
446     const struct arb_ps_compiled_shader *gl_shader = priv->compiled_fprog;
447
448     for(i = 0; i < gl_shader->numbumpenvmatconsts; i++)
449     {
450         int texunit = gl_shader->bumpenvmatconst[i].texunit;
451
452         /* The state manager takes care that this function is always called if the bump env matrix changes */
453         const float *data = (const float *)&stateBlock->textureState[texunit][WINED3DTSS_BUMPENVMAT00];
454         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->bumpenvmatconst[i].const_num, data));
455
456         if (gl_shader->luminanceconst[i].const_num != WINED3D_CONST_NUM_UNUSED)
457         {
458             /* WINED3DTSS_BUMPENVLSCALE and WINED3DTSS_BUMPENVLOFFSET are next to each other.
459              * point gl to the scale, and load 4 floats. x = scale, y = offset, z and w are junk, we
460              * don't care about them. The pointers are valid for sure because the stateblock is bigger.
461              * (they're WINED3DTSS_TEXTURETRANSFORMFLAGS and WINED3DTSS_ADDRESSW, so most likely 0 or NaN
462             */
463             const float *scale = (const float *)&stateBlock->textureState[texunit][WINED3DTSS_BUMPENVLSCALE];
464             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->luminanceconst[i].const_num, scale));
465         }
466     }
467     checkGLcall("Load bumpmap consts");
468
469     if(gl_shader->ycorrection != WINED3D_CONST_NUM_UNUSED)
470     {
471         /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
472         * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
473         * ycorrection.z: 1.0
474         * ycorrection.w: 0.0
475         */
476         float val[4];
477         val[0] = context->render_offscreen ? 0.0f
478                 : ((IWineD3DSurfaceImpl *) deviceImpl->render_targets[0])->currentDesc.Height;
479         val[1] = context->render_offscreen ? 1.0f : -1.0f;
480         val[2] = 1.0f;
481         val[3] = 0.0f;
482         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->ycorrection, val));
483         checkGLcall("y correction loading");
484     }
485
486     if(gl_shader->num_int_consts == 0) return;
487
488     for(i = 0; i < MAX_CONST_I; i++)
489     {
490         if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
491         {
492             float val[4];
493             val[0] = stateBlock->pixelShaderConstantI[4 * i];
494             val[1] = stateBlock->pixelShaderConstantI[4 * i + 1];
495             val[2] = stateBlock->pixelShaderConstantI[4 * i + 2];
496             val[3] = -1.0f;
497
498             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->int_consts[i], val));
499         }
500     }
501     checkGLcall("Load ps int consts");
502 }
503
504 /* GL locking is done by the caller. */
505 static inline void shader_arb_vs_local_constants(IWineD3DDeviceImpl* deviceImpl)
506 {
507     IWineD3DStateBlockImpl* stateBlock;
508     const struct wined3d_gl_info *gl_info = &deviceImpl->adapter->gl_info;
509     unsigned char i;
510     struct shader_arb_priv *priv = deviceImpl->shader_priv;
511     const struct arb_vs_compiled_shader *gl_shader = priv->compiled_vprog;
512
513     /* Upload the position fixup */
514     GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->pos_fixup, deviceImpl->posFixup));
515
516     if(gl_shader->num_int_consts == 0) return;
517
518     stateBlock = deviceImpl->stateBlock;
519
520     for(i = 0; i < MAX_CONST_I; i++)
521     {
522         if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
523         {
524             float val[4];
525             val[0] = stateBlock->vertexShaderConstantI[4 * i];
526             val[1] = stateBlock->vertexShaderConstantI[4 * i + 1];
527             val[2] = stateBlock->vertexShaderConstantI[4 * i + 2];
528             val[3] = -1.0f;
529
530             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->int_consts[i], val));
531         }
532     }
533     checkGLcall("Load vs int consts");
534 }
535
536 /**
537  * Loads the app-supplied constants into the currently set ARB_[vertex/fragment]_programs.
538  *
539  * We only support float constants in ARB at the moment, so don't
540  * worry about the Integers or Booleans
541  */
542 /* GL locking is done by the caller (state handler) */
543 static void shader_arb_load_constants(const struct wined3d_context *context, char usePixelShader, char useVertexShader)
544 {
545     IWineD3DDeviceImpl *device = ((IWineD3DSurfaceImpl *)context->surface)->resource.wineD3DDevice;
546     IWineD3DStateBlockImpl* stateBlock = device->stateBlock;
547     const struct wined3d_gl_info *gl_info = context->gl_info;
548
549     if (useVertexShader) {
550         IWineD3DBaseShaderImpl* vshader = (IWineD3DBaseShaderImpl*) stateBlock->vertexShader;
551
552         /* Load DirectX 9 float constants for vertex shader */
553         device->highest_dirty_vs_const = shader_arb_load_constantsF(vshader, gl_info, GL_VERTEX_PROGRAM_ARB,
554                 device->highest_dirty_vs_const, stateBlock->vertexShaderConstantF, context->vshader_const_dirty);
555         shader_arb_vs_local_constants(device);
556     }
557
558     if (usePixelShader) {
559         IWineD3DBaseShaderImpl* pshader = (IWineD3DBaseShaderImpl*) stateBlock->pixelShader;
560
561         /* Load DirectX 9 float constants for pixel shader */
562         device->highest_dirty_ps_const = shader_arb_load_constantsF(pshader, gl_info, GL_FRAGMENT_PROGRAM_ARB,
563                 device->highest_dirty_ps_const, stateBlock->pixelShaderConstantF, context->pshader_const_dirty);
564         shader_arb_ps_local_constants(device);
565     }
566 }
567
568 static void shader_arb_update_float_vertex_constants(IWineD3DDevice *iface, UINT start, UINT count)
569 {
570     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
571     struct wined3d_context *context = context_get_current();
572
573     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
574      * context. On a context switch the old context will be fully dirtified */
575     if (!context || ((IWineD3DSurfaceImpl *)context->surface)->resource.wineD3DDevice != This) return;
576
577     memset(context->vshader_const_dirty + start, 1, sizeof(*context->vshader_const_dirty) * count);
578     This->highest_dirty_vs_const = max(This->highest_dirty_vs_const, start + count);
579 }
580
581 static void shader_arb_update_float_pixel_constants(IWineD3DDevice *iface, UINT start, UINT count)
582 {
583     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
584     struct wined3d_context *context = context_get_current();
585
586     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
587      * context. On a context switch the old context will be fully dirtified */
588     if (!context || ((IWineD3DSurfaceImpl *)context->surface)->resource.wineD3DDevice != This) return;
589
590     memset(context->pshader_const_dirty + start, 1, sizeof(*context->pshader_const_dirty) * count);
591     This->highest_dirty_ps_const = max(This->highest_dirty_ps_const, start + count);
592 }
593
594 static DWORD *local_const_mapping(IWineD3DBaseShaderImpl *This)
595 {
596     DWORD *ret;
597     DWORD idx = 0;
598     const local_constant *lconst;
599
600     if(This->baseShader.load_local_constsF || list_empty(&This->baseShader.constantsF)) return NULL;
601
602     ret = HeapAlloc(GetProcessHeap(), 0, sizeof(DWORD) * This->baseShader.limits.constant_float);
603     if(!ret) {
604         ERR("Out of memory\n");
605         return NULL;
606     }
607
608     LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
609         ret[lconst->idx] = idx++;
610     }
611     return ret;
612 }
613
614 /* Generate the variable & register declarations for the ARB_vertex_program output target */
615 static DWORD shader_generate_arb_declarations(IWineD3DBaseShader *iface, const shader_reg_maps *reg_maps,
616         struct wined3d_shader_buffer *buffer, const struct wined3d_gl_info *gl_info, DWORD *lconst_map,
617         DWORD *num_clipplanes, struct shader_arb_ctx_priv *ctx)
618 {
619     IWineD3DBaseShaderImpl* This = (IWineD3DBaseShaderImpl*) iface;
620     DWORD i, next_local = 0;
621     char pshader = shader_is_pshader_version(reg_maps->shader_version.type);
622     unsigned max_constantsF;
623     const local_constant *lconst;
624     DWORD map;
625
626     /* In pixel shaders, all private constants are program local, we don't need anything
627      * from program.env. Thus we can advertise the full set of constants in pixel shaders.
628      * If we need a private constant the GL implementation will squeeze it in somewhere
629      *
630      * With vertex shaders we need the posFixup and on some GL implementations 4 helper
631      * immediate values. The posFixup is loaded using program.env for now, so always
632      * subtract one from the number of constants. If the shader uses indirect addressing,
633      * account for the helper const too because we have to declare all availabke d3d constants
634      * and don't know which are actually used.
635      */
636     if(pshader) {
637         max_constantsF = GL_LIMITS(pshader_constantsF);
638     } else {
639         if(This->baseShader.reg_maps.usesrelconstF) {
640             DWORD highest_constf = 0, clip_limit;
641             max_constantsF = GL_LIMITS(vshader_constantsF) - reserved_vs_const(iface, gl_info);
642             max_constantsF -= count_bits(This->baseShader.reg_maps.integer_constants);
643
644             for(i = 0; i < This->baseShader.limits.constant_float; i++)
645             {
646                 DWORD idx = i >> 5;
647                 DWORD shift = i & 0x1f;
648                 if(reg_maps->constf[idx] & (1 << shift)) highest_constf = i;
649             }
650
651             if(use_nv_clip(gl_info) && ctx->target_version >= NV2)
652             {
653                 clip_limit = GL_LIMITS(clipplanes);
654             }
655             else
656             {
657                 unsigned int mask = ctx->cur_vs_args->boolclip.clipplane_mask;
658                 clip_limit = min(count_bits(mask), 4);
659             }
660             *num_clipplanes = min(clip_limit, max_constantsF - highest_constf - 1);
661             max_constantsF -= *num_clipplanes;
662             if(*num_clipplanes < clip_limit)
663             {
664                 WARN("Only %u clipplanes out of %u enabled\n", *num_clipplanes, GL_LIMITS(clipplanes));
665             }
666         }
667         else
668         {
669             if(ctx->target_version >= NV2) *num_clipplanes = GL_LIMITS(clipplanes);
670             else *num_clipplanes = min(GL_LIMITS(clipplanes), 4);
671             max_constantsF = GL_LIMITS(vshader_constantsF);
672         }
673     }
674
675     for (i = 0, map = reg_maps->temporary; map; map >>= 1, ++i)
676     {
677         if (map & 1) shader_addline(buffer, "TEMP R%u;\n", i);
678     }
679
680     for (i = 0, map = reg_maps->address; map; map >>= 1, ++i)
681     {
682         if (map & 1) shader_addline(buffer, "ADDRESS A%u;\n", i);
683     }
684
685     if (pshader && reg_maps->shader_version.major == 1 && reg_maps->shader_version.minor <= 3)
686     {
687         for (i = 0, map = reg_maps->texcoord; map; map >>= 1, ++i)
688         {
689             if (map & 1) shader_addline(buffer, "TEMP T%u;\n", i);
690         }
691     }
692
693     /* Load local constants using the program-local space,
694      * this avoids reloading them each time the shader is used
695      */
696     if(lconst_map) {
697         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
698             shader_addline(buffer, "PARAM C%u = program.local[%u];\n", lconst->idx,
699                            lconst_map[lconst->idx]);
700             next_local = max(next_local, lconst_map[lconst->idx] + 1);
701         }
702     }
703
704     /* we use the array-based constants array if the local constants are marked for loading,
705      * because then we use indirect addressing, or when the local constant list is empty,
706      * because then we don't know if we're using indirect addressing or not. If we're hardcoding
707      * local constants do not declare the loaded constants as an array because ARB compilers usually
708      * do not optimize unused constants away
709      */
710     if(This->baseShader.reg_maps.usesrelconstF) {
711         /* Need to PARAM the environment parameters (constants) so we can use relative addressing */
712         shader_addline(buffer, "PARAM C[%d] = { program.env[0..%d] };\n",
713                     max_constantsF, max_constantsF - 1);
714     } else {
715         for(i = 0; i < max_constantsF; i++) {
716             DWORD idx, mask;
717             idx = i >> 5;
718             mask = 1 << (i & 0x1f);
719             if(!shader_constant_is_local(This, i) && (This->baseShader.reg_maps.constf[idx] & mask)) {
720                 shader_addline(buffer, "PARAM C%d = program.env[%d];\n",i, i);
721             }
722         }
723     }
724
725     return next_local;
726 }
727
/* Names indexed by a 4 bit shift value. Entries 1 - 4 (x2 ... x16) and
 * 12 - 15 (d16 ... d2) name a component of "coefmul" / "coefdiv"; all other
 * entries hold the placeholder "dummy". */
static const char * const shift_tab[] =
{
    /*  0 (none) */ "dummy",
    /*  1 (x2)   */ "coefmul.x",
    /*  2 (x4)   */ "coefmul.y",
    /*  3 (x8)   */ "coefmul.z",
    /*  4 (x16)  */ "coefmul.w",
    /*  5 (x32)  */ "dummy",
    /*  6 (x64)  */ "dummy",
    /*  7 (x128) */ "dummy",
    /*  8 (d256) */ "dummy",
    /*  9 (d128) */ "dummy",
    /* 10 (d64)  */ "dummy",
    /* 11 (d32)  */ "dummy",
    /* 12 (d16)  */ "coefdiv.w",
    /* 13 (d8)   */ "coefdiv.z",
    /* 14 (d4)   */ "coefdiv.y",
    /* 15 (d2)   */ "coefdiv.x"
};
746
747 static void shader_arb_get_write_mask(const struct wined3d_shader_instruction *ins,
748         const struct wined3d_shader_dst_param *dst, char *write_mask)
749 {
750     char *ptr = write_mask;
751
752     if (dst->write_mask != WINED3DSP_WRITEMASK_ALL)
753     {
754         *ptr++ = '.';
755         if (dst->write_mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
756         if (dst->write_mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
757         if (dst->write_mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
758         if (dst->write_mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
759     }
760
761     *ptr = '\0';
762 }
763
764 static void shader_arb_get_swizzle(const struct wined3d_shader_src_param *param, BOOL fixup, char *swizzle_str)
765 {
766     /* For registers of type WINED3DDECLTYPE_D3DCOLOR, data is stored as "bgra",
767      * but addressed as "rgba". To fix this we need to swap the register's x
768      * and z components. */
769     const char *swizzle_chars = fixup ? "zyxw" : "xyzw";
770     char *ptr = swizzle_str;
771
772     /* swizzle bits fields: wwzzyyxx */
773     DWORD swizzle = param->swizzle;
774     DWORD swizzle_x = swizzle & 0x03;
775     DWORD swizzle_y = (swizzle >> 2) & 0x03;
776     DWORD swizzle_z = (swizzle >> 4) & 0x03;
777     DWORD swizzle_w = (swizzle >> 6) & 0x03;
778
779     /* If the swizzle is the default swizzle (ie, "xyzw"), we don't need to
780      * generate a swizzle string. Unless we need to our own swizzling. */
781     if (swizzle != WINED3DSP_NOSWIZZLE || fixup)
782     {
783         *ptr++ = '.';
784         if (swizzle_x == swizzle_y && swizzle_x == swizzle_z && swizzle_x == swizzle_w) {
785             *ptr++ = swizzle_chars[swizzle_x];
786         } else {
787             *ptr++ = swizzle_chars[swizzle_x];
788             *ptr++ = swizzle_chars[swizzle_y];
789             *ptr++ = swizzle_chars[swizzle_z];
790             *ptr++ = swizzle_chars[swizzle_w];
791         }
792     }
793
794     *ptr = '\0';
795 }
796
797 static void shader_arb_request_a0(const struct wined3d_shader_instruction *ins, const char *src)
798 {
799     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
800     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
801
802     if(strcmp(priv->addr_reg, src) == 0) return;
803
804     strcpy(priv->addr_reg, src);
805     shader_addline(buffer, "ARL A0.x, %s;\n", src);
806 }
807
808 static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
809         const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr);
810
811 static void shader_arb_get_register_name(const struct wined3d_shader_instruction *ins,
812         const struct wined3d_shader_register *reg, char *register_name, BOOL *is_color)
813 {
814     /* oPos, oFog and oPts in D3D */
815     static const char * const rastout_reg_names[] = {"TMP_OUT", "result.fogcoord", "result.pointsize"};
816     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
817     BOOL pshader = shader_is_pshader_version(This->baseShader.reg_maps.shader_version.type);
818     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
819
820     *is_color = FALSE;
821
822     switch (reg->type)
823     {
824         case WINED3DSPR_TEMP:
825             sprintf(register_name, "R%u", reg->idx);
826             break;
827
828         case WINED3DSPR_INPUT:
829             if (pshader)
830             {
831                 if(This->baseShader.reg_maps.shader_version.major < 3)
832                 {
833                     if (reg->idx == 0) strcpy(register_name, "fragment.color.primary");
834                     else strcpy(register_name, "fragment.color.secondary");
835                 }
836                 else
837                 {
838                     if(reg->rel_addr)
839                     {
840                         char rel_reg[50];
841                         shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
842
843                         if(strcmp(rel_reg, "**aL_emul**") == 0)
844                         {
845                             DWORD idx = ctx->aL + reg->idx;
846                             if(idx < MAX_REG_INPUT)
847                             {
848                                 strcpy(register_name, ctx->ps_input[idx]);
849                             }
850                             else
851                             {
852                                 ERR("Pixel shader input register out of bounds: %u\n", idx);
853                                 sprintf(register_name, "out_of_bounds_%u", idx);
854                             }
855                         }
856                         else if(This->baseShader.reg_maps.input_registers & 0x0300)
857                         {
858                             /* There are two ways basically:
859                              *
860                              * 1) Use the unrolling code that is used for loop emulation and unroll the loop.
861                              *    That means trouble if the loop also contains a breakc or if the control values
862                              *    aren't local constants.
863                              * 2) Generate an if block that checks if aL.y < 8, == 8 or == 9 and selects the
864                              *    source dynamically. The trouble is that we cannot simply read aL.y because it
865                              *    is an ADDRESS register. We could however push it, load .zw with a value and use
866                              *    ADAC to load the condition code register and pop it again afterwards
867                              */
868                             FIXME("Relative input register addressing with more than 8 registers\n");
869
870                             /* This is better than nothing for now */
871                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
872                         }
873                         else if(ctx->cur_ps_args->super.vp_mode != vertexshader)
874                         {
875                             /* This is problematic because we'd have to consult the ctx->ps_input strings
876                              * for where to find the varying. Some may be "0.0", others can be texcoords or
877                              * colors. This needs either a pipeline replacement to make the vertex shader feed
878                              * proper varyings, or loop unrolling
879                              *
880                              * For now use the texcoords and hope for the best
881                              */
882                             FIXME("Non-vertex shader varying input with indirect addressing\n");
883                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
884                         }
885                         else
886                         {
887                             /* D3D supports indirect addressing only with aL in loop registers. The loop instruction
888                              * pulls GL_NV_fragment_program2 in
889                              */
890                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
891                         }
892                     }
893                     else
894                     {
895                         if(reg->idx < MAX_REG_INPUT)
896                         {
897                             strcpy(register_name, ctx->ps_input[reg->idx]);
898                         }
899                         else
900                         {
901                             ERR("Pixel shader input register out of bounds: %u\n", reg->idx);
902                             sprintf(register_name, "out_of_bounds_%u", reg->idx);
903                         }
904                     }
905                 }
906             }
907             else
908             {
909                 if (ctx->cur_vs_args->super.swizzle_map & (1 << reg->idx)) *is_color = TRUE;
910                 sprintf(register_name, "vertex.attrib[%u]", reg->idx);
911             }
912             break;
913
914         case WINED3DSPR_CONST:
915             if (!pshader && reg->rel_addr)
916             {
917                 BOOL aL = FALSE;
918                 char rel_reg[50];
919                 UINT rel_offset = ((IWineD3DVertexShaderImpl *)This)->rel_offset;
920                 if(This->baseShader.reg_maps.shader_version.major < 2) {
921                     sprintf(rel_reg, "A0.x");
922                 } else {
923                     shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
924                     if(ctx->target_version == ARB) {
925                         if(strcmp(rel_reg, "**aL_emul**") == 0) {
926                             aL = TRUE;
927                         } else {
928                             shader_arb_request_a0(ins, rel_reg);
929                             sprintf(rel_reg, "A0.x");
930                         }
931                     }
932                 }
933                 if(aL)
934                     sprintf(register_name, "C[%u]", ctx->aL + reg->idx);
935                 else if (reg->idx >= rel_offset)
936                     sprintf(register_name, "C[%s + %u]", rel_reg, reg->idx - rel_offset);
937                 else
938                     sprintf(register_name, "C[%s - %u]", rel_reg, -reg->idx + rel_offset);
939             }
940             else
941             {
942                 if (This->baseShader.reg_maps.usesrelconstF)
943                     sprintf(register_name, "C[%u]", reg->idx);
944                 else
945                     sprintf(register_name, "C%u", reg->idx);
946             }
947             break;
948
949         case WINED3DSPR_TEXTURE: /* case WINED3DSPR_ADDR: */
950             if (pshader) {
951                 if(This->baseShader.reg_maps.shader_version.major == 1 &&
952                    This->baseShader.reg_maps.shader_version.minor <= 3) {
953                     /* In ps <= 1.3, Tx is a temporary register as destination to all instructions,
954                      * and as source to most instructions. For some instructions it is the texcoord
955                      * input. Those instructions know about the special use
956                      */
957                     sprintf(register_name, "T%u", reg->idx);
958                 } else {
959                     /* in ps 1.4 and 2.x Tx is always a (read-only) varying */
960                     sprintf(register_name, "fragment.texcoord[%u]", reg->idx);
961                 }
962             }
963             else
964             {
965                 if(This->baseShader.reg_maps.shader_version.major == 1 || ctx->target_version >= NV2)
966                 {
967                     sprintf(register_name, "A%u", reg->idx);
968                 }
969                 else
970                 {
971                     sprintf(register_name, "A%u_SHADOW", reg->idx);
972                 }
973             }
974             break;
975
976         case WINED3DSPR_COLOROUT:
977             if(ctx->cur_ps_args->super.srgb_correction && reg->idx == 0)
978             {
979                 strcpy(register_name, "TMP_COLOR");
980             }
981             else
982             {
983                 if(ctx->cur_ps_args->super.srgb_correction) FIXME("sRGB correction on higher render targets\n");
984                 if(This->baseShader.reg_maps.highest_render_target > 0)
985                 {
986                     sprintf(register_name, "result.color[%u]", reg->idx);
987                 }
988                 else
989                 {
990                     strcpy(register_name, "result.color");
991                 }
992             }
993             break;
994
995         case WINED3DSPR_RASTOUT:
996             if(reg->idx == 1) sprintf(register_name, "%s", ctx->fog_output);
997             else sprintf(register_name, "%s", rastout_reg_names[reg->idx]);
998             break;
999
1000         case WINED3DSPR_DEPTHOUT:
1001             strcpy(register_name, "result.depth");
1002             break;
1003
1004         case WINED3DSPR_ATTROUT:
1005         /* case WINED3DSPR_OUTPUT: */
1006             if (pshader) sprintf(register_name, "oD[%u]", reg->idx);
1007             else strcpy(register_name, ctx->color_output[reg->idx]);
1008             break;
1009
1010         case WINED3DSPR_TEXCRDOUT:
1011             if (pshader)
1012             {
1013                 sprintf(register_name, "oT[%u]", reg->idx);
1014             }
1015             else
1016             {
1017                 if(This->baseShader.reg_maps.shader_version.major < 3)
1018                 {
1019                     strcpy(register_name, ctx->texcrd_output[reg->idx]);
1020                 }
1021                 else
1022                 {
1023                     strcpy(register_name, ctx->vs_output[reg->idx]);
1024                 }
1025             }
1026             break;
1027
1028         case WINED3DSPR_LOOP:
1029             if(ctx->target_version >= NV2)
1030             {
1031                 /* Pshader has an implicitly declared loop index counter A0.x that cannot be renamed */
1032                 if(pshader) sprintf(register_name, "A0.x");
1033                 else sprintf(register_name, "aL.y");
1034             }
1035             else
1036             {
1037                 /* Unfortunately this code cannot return the value of ctx->aL here. An immediate value
1038                  * would be valid, but if aL is used for indexing(its only use), there's likely an offset,
1039                  * thus the result would be something like C[15 + 30], which is not valid in the ARB program
1040                  * grammar. So return a marker for the emulated aL and intercept it in constant and varying
1041                  * indexing
1042                  */
1043                 sprintf(register_name, "**aL_emul**");
1044             }
1045
1046             break;
1047
1048         case WINED3DSPR_CONSTINT:
1049             sprintf(register_name, "I%u", reg->idx);
1050             break;
1051
1052         case WINED3DSPR_MISCTYPE:
1053             if(reg->idx == 0)
1054             {
1055                 sprintf(register_name, "vpos");
1056             }
1057             else if(reg->idx == 1)
1058             {
1059                 sprintf(register_name, "fragment.facing.x");
1060             }
1061             else
1062             {
1063                 FIXME("Unknown MISCTYPE register index %u\n", reg->idx);
1064             }
1065             break;
1066
1067         default:
1068             FIXME("Unhandled register type %#x[%u]\n", reg->type, reg->idx);
1069             sprintf(register_name, "unrecognized_register[%u]", reg->idx);
1070             break;
1071     }
1072 }
1073
1074 static void shader_arb_get_dst_param(const struct wined3d_shader_instruction *ins,
1075         const struct wined3d_shader_dst_param *wined3d_dst, char *str)
1076 {
1077     char register_name[255];
1078     char write_mask[6];
1079     BOOL is_color;
1080
1081     shader_arb_get_register_name(ins, &wined3d_dst->reg, register_name, &is_color);
1082     strcpy(str, register_name);
1083
1084     shader_arb_get_write_mask(ins, wined3d_dst, write_mask);
1085     strcat(str, write_mask);
1086 }
1087
1088 static const char *shader_arb_get_fixup_swizzle(enum fixup_channel_source channel_source)
1089 {
1090     switch(channel_source)
1091     {
1092         case CHANNEL_SOURCE_ZERO: return "0";
1093         case CHANNEL_SOURCE_ONE: return "1";
1094         case CHANNEL_SOURCE_X: return "x";
1095         case CHANNEL_SOURCE_Y: return "y";
1096         case CHANNEL_SOURCE_Z: return "z";
1097         case CHANNEL_SOURCE_W: return "w";
1098         default:
1099             FIXME("Unhandled channel source %#x\n", channel_source);
1100             return "undefined";
1101     }
1102 }
1103
1104 static void gen_color_correction(struct wined3d_shader_buffer *buffer, const char *reg,
1105         DWORD dst_mask, const char *one, const char *two, struct color_fixup_desc fixup)
1106 {
1107     DWORD mask;
1108
1109     if (is_yuv_fixup(fixup))
1110     {
1111         enum yuv_fixup yuv_fixup = get_yuv_fixup(fixup);
1112         FIXME("YUV fixup (%#x) not supported\n", yuv_fixup);
1113         return;
1114     }
1115
1116     mask = 0;
1117     if (fixup.x_source != CHANNEL_SOURCE_X) mask |= WINED3DSP_WRITEMASK_0;
1118     if (fixup.y_source != CHANNEL_SOURCE_Y) mask |= WINED3DSP_WRITEMASK_1;
1119     if (fixup.z_source != CHANNEL_SOURCE_Z) mask |= WINED3DSP_WRITEMASK_2;
1120     if (fixup.w_source != CHANNEL_SOURCE_W) mask |= WINED3DSP_WRITEMASK_3;
1121     mask &= dst_mask;
1122
1123     if (mask)
1124     {
1125         shader_addline(buffer, "SWZ %s, %s, %s, %s, %s, %s;\n", reg, reg,
1126                 shader_arb_get_fixup_swizzle(fixup.x_source), shader_arb_get_fixup_swizzle(fixup.y_source),
1127                 shader_arb_get_fixup_swizzle(fixup.z_source), shader_arb_get_fixup_swizzle(fixup.w_source));
1128     }
1129
1130     mask = 0;
1131     if (fixup.x_sign_fixup) mask |= WINED3DSP_WRITEMASK_0;
1132     if (fixup.y_sign_fixup) mask |= WINED3DSP_WRITEMASK_1;
1133     if (fixup.z_sign_fixup) mask |= WINED3DSP_WRITEMASK_2;
1134     if (fixup.w_sign_fixup) mask |= WINED3DSP_WRITEMASK_3;
1135     mask &= dst_mask;
1136
1137     if (mask)
1138     {
1139         char reg_mask[6];
1140         char *ptr = reg_mask;
1141
1142         if (mask != WINED3DSP_WRITEMASK_ALL)
1143         {
1144             *ptr++ = '.';
1145             if (mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
1146             if (mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
1147             if (mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
1148             if (mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
1149         }
1150         *ptr = '\0';
1151
1152         shader_addline(buffer, "MAD %s%s, %s, %s, -%s;\n", reg, reg_mask, reg, two, one);
1153     }
1154 }
1155
1156 static const char *shader_arb_get_modifier(const struct wined3d_shader_instruction *ins)
1157 {
1158     DWORD mod;
1159     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1160     if (!ins->dst_count) return "";
1161
1162     mod = ins->dst[0].modifiers;
1163
1164     /* Silently ignore PARTIALPRECISION if its not supported */
1165     if(priv->target_version == ARB) mod &= ~WINED3DSPDM_PARTIALPRECISION;
1166
1167     if(mod & WINED3DSPDM_MSAMPCENTROID)
1168     {
1169         FIXME("Unhandled modifier WINED3DSPDM_MSAMPCENTROID\n");
1170         mod &= ~WINED3DSPDM_MSAMPCENTROID;
1171     }
1172
1173     switch(mod)
1174     {
1175         case WINED3DSPDM_SATURATE | WINED3DSPDM_PARTIALPRECISION:
1176             return "H_SAT";
1177
1178         case WINED3DSPDM_SATURATE:
1179             return "_SAT";
1180
1181         case WINED3DSPDM_PARTIALPRECISION:
1182             return "H";
1183
1184         case 0:
1185             return "";
1186
1187         default:
1188             FIXME("Unknown modifiers 0x%08x\n", mod);
1189             return "";
1190     }
1191 }
1192
1193 #define TEX_PROJ        0x1
1194 #define TEX_BIAS        0x2
1195 #define TEX_LOD         0x4
1196 #define TEX_DERIV       0x10
1197
1198 static void shader_hw_sample(const struct wined3d_shader_instruction *ins, DWORD sampler_idx,
1199         const char *dst_str, const char *coord_reg, WORD flags, const char *dsx, const char *dsy)
1200 {
1201     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1202     DWORD sampler_type = ins->ctx->reg_maps->sampler_type[sampler_idx];
1203     const char *tex_type;
1204     BOOL np2_fixup = FALSE;
1205     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1206     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) This->baseShader.device;
1207     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1208     const char *mod;
1209     BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
1210
1211     /* D3D vertex shader sampler IDs are vertex samplers(0-3), not global d3d samplers */
1212     if(!pshader) sampler_idx += MAX_FRAGMENT_SAMPLERS;
1213
1214     switch(sampler_type) {
1215         case WINED3DSTT_1D:
1216             tex_type = "1D";
1217             break;
1218
1219         case WINED3DSTT_2D:
1220             if(device->stateBlock->textures[sampler_idx] &&
1221                IWineD3DBaseTexture_GetTextureDimensions(device->stateBlock->textures[sampler_idx]) == GL_TEXTURE_RECTANGLE_ARB) {
1222                 tex_type = "RECT";
1223             } else {
1224                 tex_type = "2D";
1225             }
1226             if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
1227             {
1228                 if (priv->cur_np2fixup_info->super.active & (1 << sampler_idx))
1229                 {
1230                     if (flags) FIXME("Only ordinary sampling from NP2 textures is supported.\n");
1231                     else np2_fixup = TRUE;
1232                 }
1233             }
1234             break;
1235
1236         case WINED3DSTT_VOLUME:
1237             tex_type = "3D";
1238             break;
1239
1240         case WINED3DSTT_CUBE:
1241             tex_type = "CUBE";
1242             break;
1243
1244         default:
1245             ERR("Unexpected texture type %d\n", sampler_type);
1246             tex_type = "";
1247     }
1248
1249     /* TEX, TXL, TXD and TXP do not support the "H" modifier,
1250      * so don't use shader_arb_get_modifier
1251      */
1252     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) mod = "_SAT";
1253     else mod = "";
1254
1255     /* Fragment samplers always have indentity mapping */
1256     if(sampler_idx >= MAX_FRAGMENT_SAMPLERS)
1257     {
1258         sampler_idx = priv->cur_vs_args->vertex_samplers[sampler_idx - MAX_FRAGMENT_SAMPLERS];
1259     }
1260
1261     if (flags & TEX_DERIV)
1262     {
1263         if(flags & TEX_PROJ) FIXME("Projected texture sampling with custom derivatives\n");
1264         if(flags & TEX_BIAS) FIXME("Biased texture sampling with custom derivatives\n");
1265         shader_addline(buffer, "TXD%s %s, %s, %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg,
1266                        dsx, dsy,sampler_idx, tex_type);
1267     }
1268     else if(flags & TEX_LOD)
1269     {
1270         if(flags & TEX_PROJ) FIXME("Projected texture sampling with explicit lod\n");
1271         if(flags & TEX_BIAS) FIXME("Biased texture sampling with explicit lod\n");
1272         shader_addline(buffer, "TXL%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg,
1273                        sampler_idx, tex_type);
1274     }
1275     else if (flags & TEX_BIAS)
1276     {
1277         /* Shouldn't be possible, but let's check for it */
1278         if(flags & TEX_PROJ) FIXME("Biased and Projected texture sampling\n");
1279         /* TXB takes the 4th component of the source vector automatically, as d3d. Nothing more to do */
1280         shader_addline(buffer, "TXB%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1281     }
1282     else if (flags & TEX_PROJ)
1283     {
1284         shader_addline(buffer, "TXP%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1285     }
1286     else
1287     {
1288         if (np2_fixup)
1289         {
1290             const unsigned char idx = priv->cur_np2fixup_info->super.idx[sampler_idx];
1291             shader_addline(buffer, "MUL TA, np2fixup[%u].%s, %s;\n", idx >> 1,
1292                            (idx % 2) ? "zwxy" : "xyzw", coord_reg);
1293
1294             shader_addline(buffer, "TEX%s %s, TA, texture[%u], %s;\n", mod, dst_str, sampler_idx, tex_type);
1295         }
1296         else
1297             shader_addline(buffer, "TEX%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1298     }
1299
1300     if (pshader)
1301     {
1302         gen_color_correction(buffer, dst_str, ins->dst[0].write_mask,
1303                 "one", "coefmul.x", priv->cur_ps_args->super.color_fixup[sampler_idx]);
1304     }
1305 }
1306
1307 static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
1308         const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr)
1309 {
1310     /* Generate a line that does the input modifier computation and return the input register to use */
1311     BOOL is_color = FALSE;
1312     char regstr[256];
1313     char swzstr[20];
1314     int insert_line;
1315     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1316     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1317
1318     /* Assume a new line will be added */
1319     insert_line = 1;
1320
1321     /* Get register name */
1322     shader_arb_get_register_name(ins, &src->reg, regstr, &is_color);
1323     shader_arb_get_swizzle(src, is_color, swzstr);
1324
1325     switch (src->modifiers)
1326     {
1327     case WINED3DSPSM_NONE:
1328         sprintf(outregstr, "%s%s", regstr, swzstr);
1329         insert_line = 0;
1330         break;
1331     case WINED3DSPSM_NEG:
1332         sprintf(outregstr, "-%s%s", regstr, swzstr);
1333         insert_line = 0;
1334         break;
1335     case WINED3DSPSM_BIAS:
1336         shader_addline(buffer, "ADD T%c, %s, -coefdiv.x;\n", 'A' + tmpreg, regstr);
1337         break;
1338     case WINED3DSPSM_BIASNEG:
1339         shader_addline(buffer, "ADD T%c, -%s, coefdiv.x;\n", 'A' + tmpreg, regstr);
1340         break;
1341     case WINED3DSPSM_SIGN:
1342         shader_addline(buffer, "MAD T%c, %s, coefmul.x, -one.x;\n", 'A' + tmpreg, regstr);
1343         break;
1344     case WINED3DSPSM_SIGNNEG:
1345         shader_addline(buffer, "MAD T%c, %s, -coefmul.x, one.x;\n", 'A' + tmpreg, regstr);
1346         break;
1347     case WINED3DSPSM_COMP:
1348         shader_addline(buffer, "SUB T%c, one.x, %s;\n", 'A' + tmpreg, regstr);
1349         break;
1350     case WINED3DSPSM_X2:
1351         shader_addline(buffer, "ADD T%c, %s, %s;\n", 'A' + tmpreg, regstr, regstr);
1352         break;
1353     case WINED3DSPSM_X2NEG:
1354         shader_addline(buffer, "ADD T%c, -%s, -%s;\n", 'A' + tmpreg, regstr, regstr);
1355         break;
1356     case WINED3DSPSM_DZ:
1357         shader_addline(buffer, "RCP T%c, %s.z;\n", 'A' + tmpreg, regstr);
1358         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1359         break;
1360     case WINED3DSPSM_DW:
1361         shader_addline(buffer, "RCP T%c, %s.w;\n", 'A' + tmpreg, regstr);
1362         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1363         break;
1364     case WINED3DSPSM_ABS:
1365         if(ctx->target_version >= NV2) {
1366             sprintf(outregstr, "|%s%s|", regstr, swzstr);
1367             insert_line = 0;
1368         } else {
1369             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
1370         }
1371         break;
1372     case WINED3DSPSM_ABSNEG:
1373         if(ctx->target_version >= NV2) {
1374             sprintf(outregstr, "-|%s%s|", regstr, swzstr);
1375         } else {
1376             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
1377             sprintf(outregstr, "-T%c%s", 'A' + tmpreg, swzstr);
1378         }
1379         insert_line = 0;
1380         break;
1381     default:
1382         sprintf(outregstr, "%s%s", regstr, swzstr);
1383         insert_line = 0;
1384     }
1385
1386     /* Return modified or original register, with swizzle */
1387     if (insert_line)
1388         sprintf(outregstr, "T%c%s", 'A' + tmpreg, swzstr);
1389 }
1390
1391 static void pshader_hw_bem(const struct wined3d_shader_instruction *ins)
1392 {
1393     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1394     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1395     char dst_name[50];
1396     char src_name[2][50];
1397     DWORD sampler_code = dst->reg.idx;
1398
1399     shader_arb_get_dst_param(ins, dst, dst_name);
1400
1401     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1402      *
1403      * Keep in mind that src_name[1] can be "TB" and src_name[0] can be "TA" because modifiers like _x2 are valid
1404      * with bem. So delay loading the first parameter until after the perturbation calculation which needs two
1405      * temps is done.
1406      */
1407     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1408     shader_addline(buffer, "SWZ TA, bumpenvmat%d, x, z, 0, 0;\n", sampler_code);
1409     shader_addline(buffer, "DP3 TC.r, TA, %s;\n", src_name[1]);
1410     shader_addline(buffer, "SWZ TA, bumpenvmat%d, y, w, 0, 0;\n", sampler_code);
1411     shader_addline(buffer, "DP3 TC.g, TA, %s;\n", src_name[1]);
1412
1413     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1414     shader_addline(buffer, "ADD %s, %s, TC;\n", dst_name, src_name[0]);
1415 }
1416
1417 static DWORD negate_modifiers(DWORD mod, char *extra_char)
1418 {
1419     *extra_char = ' ';
1420     switch(mod)
1421     {
1422         case WINED3DSPSM_NONE:      return WINED3DSPSM_NEG;
1423         case WINED3DSPSM_NEG:       return WINED3DSPSM_NONE;
1424         case WINED3DSPSM_BIAS:      return WINED3DSPSM_BIASNEG;
1425         case WINED3DSPSM_BIASNEG:   return WINED3DSPSM_BIAS;
1426         case WINED3DSPSM_SIGN:      return WINED3DSPSM_SIGNNEG;
1427         case WINED3DSPSM_SIGNNEG:   return WINED3DSPSM_SIGN;
1428         case WINED3DSPSM_COMP:      *extra_char = '-'; return WINED3DSPSM_COMP;
1429         case WINED3DSPSM_X2:        return WINED3DSPSM_X2NEG;
1430         case WINED3DSPSM_X2NEG:     return WINED3DSPSM_X2;
1431         case WINED3DSPSM_DZ:        *extra_char = '-'; return WINED3DSPSM_DZ;
1432         case WINED3DSPSM_DW:        *extra_char = '-'; return WINED3DSPSM_DW;
1433         case WINED3DSPSM_ABS:       return WINED3DSPSM_ABSNEG;
1434         case WINED3DSPSM_ABSNEG:    return WINED3DSPSM_ABS;
1435     }
1436     FIXME("Unknown modifier %u\n", mod);
1437     return mod;
1438 }
1439
1440 static void pshader_hw_cnd(const struct wined3d_shader_instruction *ins)
1441 {
1442     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1443     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1444     char dst_name[50];
1445     char src_name[3][50];
1446     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1447             ins->ctx->reg_maps->shader_version.minor);
1448     BOOL is_color;
1449
1450     shader_arb_get_dst_param(ins, dst, dst_name);
1451     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1452
1453     /* The coissue flag changes the semantic of the cnd instruction in <= 1.3 shaders */
1454     if (shader_version <= WINED3D_SHADER_VERSION(1, 3) && ins->coissue)
1455     {
1456         shader_addline(buffer, "MOV%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[1]);
1457     } else {
1458         struct wined3d_shader_src_param src0_copy = ins->src[0];
1459         char extra_neg;
1460
1461         /* src0 may have a negate srcmod set, so we can't blindly add "-" to the name */
1462         src0_copy.modifiers = negate_modifiers(src0_copy.modifiers, &extra_neg);
1463
1464         shader_arb_get_src_param(ins, &src0_copy, 0, src_name[0]);
1465         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1466         shader_addline(buffer, "ADD TA, %c%s, coefdiv.x;\n", extra_neg, src_name[0]);
1467         /* No modifiers supported on CMP */
1468         shader_addline(buffer, "CMP %s, TA, %s, %s;\n", dst_name, src_name[1], src_name[2]);
1469
1470         /* _SAT on CMP doesn't make much sense, but it is not a pure NOP */
1471         if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
1472         {
1473             shader_arb_get_register_name(ins, &dst->reg, src_name[0], &is_color);
1474             shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, dst_name);
1475         }
1476     }
1477 }
1478
1479 static void pshader_hw_cmp(const struct wined3d_shader_instruction *ins)
1480 {
1481     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1482     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1483     char dst_name[50];
1484     char src_name[3][50];
1485     BOOL is_color;
1486
1487     shader_arb_get_dst_param(ins, dst, dst_name);
1488
1489     /* Generate input register names (with modifiers) */
1490     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1491     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1492     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1493
1494     /* No modifiers are supported on CMP */
1495     shader_addline(buffer, "CMP %s, %s, %s, %s;\n", dst_name,
1496                    src_name[0], src_name[2], src_name[1]);
1497
1498     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
1499     {
1500         shader_arb_get_register_name(ins, &dst->reg, src_name[0], &is_color);
1501         shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name[0]);
1502     }
1503 }
1504
1505 /** Process the WINED3DSIO_DP2ADD instruction in ARB.
1506  * dst = dot2(src0, src1) + src2 */
1507 static void pshader_hw_dp2add(const struct wined3d_shader_instruction *ins)
1508 {
1509     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1510     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1511     char dst_name[50];
1512     char src_name[3][50];
1513     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1514
1515     shader_arb_get_dst_param(ins, dst, dst_name);
1516     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1517     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1518
1519     if(ctx->target_version >= NV3)
1520     {
1521         /* GL_NV_fragment_program2 has a 1:1 matching instruction */
1522         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1523         shader_addline(buffer, "DP2A%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1524                        dst_name, src_name[0], src_name[1], src_name[2]);
1525     }
1526     else if(ctx->target_version >= NV2)
1527     {
1528         /* dst.x = src2.?, src0.x, src1.x + src0.y * src1.y
1529          * dst.y = src2.?, src0.x, src1.z + src0.y * src1.w
1530          * dst.z = src2.?, src0.x, src1.x + src0.y * src1.y
1531          * dst.z = src2.?, src0.x, src1.z + src0.y * src1.w
1532          *
1533          * Make sure that src1.zw = src1.xy, then we get a classic dp2add
1534          *
1535          * .xyxy and other swizzles that we could get with this are not valid in
1536          * plain ARBfp, but luckily the NV extension grammar lifts this limitation.
1537          */
1538         struct wined3d_shader_src_param tmp_param = ins->src[1];
1539         DWORD swizzle = tmp_param.swizzle & 0xf; /* Selects .xy */
1540         tmp_param.swizzle = swizzle | (swizzle << 4); /* Creates .xyxy */
1541
1542         shader_arb_get_src_param(ins, &tmp_param, 1, src_name[1]);
1543
1544         shader_addline(buffer, "X2D%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1545                        dst_name, src_name[2], src_name[0], src_name[1]);
1546     }
1547     else
1548     {
1549         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1550         /* Emulate a DP2 with a DP3 and 0.0. Don't use the dest as temp register, it could be src[1] or src[2]
1551         * src_name[0] can be TA, but TA is a private temp for modifiers, so it is save to overwrite
1552         */
1553         shader_addline(buffer, "MOV TA, %s;\n", src_name[0]);
1554         shader_addline(buffer, "MOV TA.z, 0.0;\n");
1555         shader_addline(buffer, "DP3 TA, TA, %s;\n", src_name[1]);
1556         shader_addline(buffer, "ADD%s %s, TA, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[2]);
1557     }
1558 }
1559
1560 /* Map the opcode 1-to-1 to the GL code */
1561 static void shader_hw_map2gl(const struct wined3d_shader_instruction *ins)
1562 {
1563     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1564     const char *instruction;
1565     char arguments[256], dst_str[50];
1566     unsigned int i;
1567     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1568
1569     switch (ins->handler_idx)
1570     {
1571         case WINED3DSIH_ABS: instruction = "ABS"; break;
1572         case WINED3DSIH_ADD: instruction = "ADD"; break;
1573         case WINED3DSIH_CRS: instruction = "XPD"; break;
1574         case WINED3DSIH_DP3: instruction = "DP3"; break;
1575         case WINED3DSIH_DP4: instruction = "DP4"; break;
1576         case WINED3DSIH_DST: instruction = "DST"; break;
1577         case WINED3DSIH_FRC: instruction = "FRC"; break;
1578         case WINED3DSIH_LIT: instruction = "LIT"; break;
1579         case WINED3DSIH_LRP: instruction = "LRP"; break;
1580         case WINED3DSIH_MAD: instruction = "MAD"; break;
1581         case WINED3DSIH_MAX: instruction = "MAX"; break;
1582         case WINED3DSIH_MIN: instruction = "MIN"; break;
1583         case WINED3DSIH_MOV: instruction = "MOV"; break;
1584         case WINED3DSIH_MUL: instruction = "MUL"; break;
1585         case WINED3DSIH_SGE: instruction = "SGE"; break;
1586         case WINED3DSIH_SLT: instruction = "SLT"; break;
1587         case WINED3DSIH_SUB: instruction = "SUB"; break;
1588         case WINED3DSIH_MOVA:instruction = "ARR"; break;
1589         case WINED3DSIH_DSX: instruction = "DDX"; break;
1590         default: instruction = "";
1591             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1592             break;
1593     }
1594
1595     /* Note that shader_arb_add_dst_param() adds spaces. */
1596     arguments[0] = '\0';
1597     shader_arb_get_dst_param(ins, dst, dst_str);
1598     for (i = 0; i < ins->src_count; ++i)
1599     {
1600         char operand[100];
1601         strcat(arguments, ", ");
1602         shader_arb_get_src_param(ins, &ins->src[i], i, operand);
1603         strcat(arguments, operand);
1604     }
1605     shader_addline(buffer, "%s%s %s%s;\n", instruction, shader_arb_get_modifier(ins), dst_str, arguments);
1606 }
1607
1608 static void shader_hw_nop(const struct wined3d_shader_instruction *ins)
1609 {
1610     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1611     shader_addline(buffer, "NOP;\n");
1612 }
1613
1614 static void shader_hw_mov(const struct wined3d_shader_instruction *ins)
1615 {
1616     IWineD3DBaseShaderImpl *shader = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1617     BOOL pshader = shader_is_pshader_version(shader->baseShader.reg_maps.shader_version.type);
1618     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1619
1620     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1621     char src0_param[256];
1622
1623     if(ins->handler_idx == WINED3DSIH_MOVA) {
1624         char write_mask[6];
1625
1626         if(ctx->target_version >= NV2) {
1627             shader_hw_map2gl(ins);
1628             return;
1629         }
1630         shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1631         shader_arb_get_write_mask(ins, &ins->dst[0], write_mask);
1632
1633         /* This implements the mova formula used in GLSL. The first two instructions
1634          * prepare the sign() part. Note that it is fine to have my_sign(0.0) = 1.0
1635          * in this case:
1636          * mova A0.x, 0.0
1637          *
1638          * A0.x = arl(floor(abs(0.0) + 0.5) * 1.0) = floor(0.5) = 0.0 since arl does a floor
1639          *
1640          * The ARL is performed when A0 is used - the requested component is read from A0_SHADOW into
1641          * A0.x. We can use the overwritten component of A0_shadow as temporary storage for the sign.
1642          */
1643         shader_addline(buffer, "SGE A0_SHADOW%s, %s, mova_const.y;\n", write_mask, src0_param);
1644         shader_addline(buffer, "MAD A0_SHADOW%s, A0_SHADOW, mova_const.z, -mova_const.w;\n", write_mask);
1645
1646         shader_addline(buffer, "ABS TA%s, %s;\n", write_mask, src0_param);
1647         shader_addline(buffer, "ADD TA%s, TA, mova_const.x;\n", write_mask);
1648         shader_addline(buffer, "FLR TA%s, TA;\n", write_mask);
1649         if (((IWineD3DVertexShaderImpl *)shader)->rel_offset)
1650         {
1651             shader_addline(buffer, "ADD TA%s, TA, helper_const.z;\n", write_mask);
1652         }
1653         shader_addline(buffer, "MUL A0_SHADOW%s, TA, A0_SHADOW;\n", write_mask);
1654
1655         ((struct shader_arb_ctx_priv *)ins->ctx->backend_data)->addr_reg[0] = '\0';
1656     } else if (ins->ctx->reg_maps->shader_version.major == 1
1657           && !shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)
1658           && ins->dst[0].reg.type == WINED3DSPR_ADDR)
1659     {
1660         src0_param[0] = '\0';
1661         if (((IWineD3DVertexShaderImpl *)shader)->rel_offset)
1662         {
1663             shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1664             shader_addline(buffer, "ADD TA.x, %s, helper_const.z;\n", src0_param);
1665             shader_addline(buffer, "ARL A0.x, TA.x;\n");
1666         }
1667         else
1668         {
1669             /* Apple's ARB_vertex_program implementation does not accept an ARL source argument
1670              * with more than one component. Thus replicate the first source argument over all
1671              * 4 components. For example, .xyzw -> .x (or better: .xxxx), .zwxy -> .z, etc) */
1672             struct wined3d_shader_src_param tmp_src = ins->src[0];
1673             tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1674             shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1675             shader_addline(buffer, "ARL A0.x, %s;\n", src0_param);
1676         }
1677     }
1678     else if(ins->dst[0].reg.type == WINED3DSPR_COLOROUT && ins->dst[0].reg.idx == 0 && pshader)
1679     {
1680         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) shader;
1681         if(ctx->cur_ps_args->super.srgb_correction && ps->color0_mov)
1682         {
1683             shader_addline(buffer, "#mov handled in srgb write code\n");
1684             return;
1685         }
1686         shader_hw_map2gl(ins);
1687     }
1688     else
1689     {
1690         shader_hw_map2gl(ins);
1691     }
1692 }
1693
1694 static void pshader_hw_texkill(const struct wined3d_shader_instruction *ins)
1695 {
1696     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1697     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1698     char reg_dest[40];
1699
1700     /* No swizzles are allowed in d3d's texkill. PS 1.x ignores the 4th component as documented,
1701      * but >= 2.0 honors it(undocumented, but tested by the d3d9 testsuit)
1702      */
1703     shader_arb_get_dst_param(ins, dst, reg_dest);
1704
1705     if (ins->ctx->reg_maps->shader_version.major >= 2)
1706     {
1707         const char *kilsrc = "TA";
1708         BOOL is_color;
1709
1710         shader_arb_get_register_name(ins, &dst->reg, reg_dest, &is_color);
1711         if(dst->write_mask == WINED3DSP_WRITEMASK_ALL)
1712         {
1713             kilsrc = reg_dest;
1714         }
1715         else
1716         {
1717             /* Sigh. KIL doesn't support swizzles/writemasks. KIL passes a writemask, but ".xy" for example
1718              * is not valid as a swizzle in ARB (needs ".xyyy"). Use SWZ to load the register properly, and set
1719              * masked out components to 0(won't kill)
1720              */
1721             char x = '0', y = '0', z = '0', w = '0';
1722             if(dst->write_mask & WINED3DSP_WRITEMASK_0) x = 'x';
1723             if(dst->write_mask & WINED3DSP_WRITEMASK_1) y = 'y';
1724             if(dst->write_mask & WINED3DSP_WRITEMASK_2) z = 'z';
1725             if(dst->write_mask & WINED3DSP_WRITEMASK_3) w = 'w';
1726             shader_addline(buffer, "SWZ TA, %s, %c, %c, %c, %c;\n", reg_dest, x, y, z, w);
1727         }
1728         shader_addline(buffer, "KIL %s;\n", kilsrc);
1729     } else {
1730         /* ARB fp doesn't like swizzles on the parameter of the KIL instruction. To mask the 4th component,
1731          * copy the register into our general purpose TMP variable, overwrite .w and pass TMP to KIL
1732          *
1733          * ps_1_3 shaders use the texcoord incarnation of the Tx register. ps_1_4 shaders can use the same,
1734          * or pass in any temporary register(in shader phase 2)
1735          */
1736         if(ins->ctx->reg_maps->shader_version.minor <= 3) {
1737             sprintf(reg_dest, "fragment.texcoord[%u]", dst->reg.idx);
1738         } else {
1739             shader_arb_get_dst_param(ins, dst, reg_dest);
1740         }
1741         shader_addline(buffer, "SWZ TA, %s, x, y, z, 1;\n", reg_dest);
1742         shader_addline(buffer, "KIL TA;\n");
1743     }
1744 }
1745
1746 static void pshader_hw_tex(const struct wined3d_shader_instruction *ins)
1747 {
1748     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1749     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1750     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1751     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1752             ins->ctx->reg_maps->shader_version.minor);
1753     struct wined3d_shader_src_param src;
1754
1755     char reg_dest[40];
1756     char reg_coord[40];
1757     DWORD reg_sampler_code;
1758     DWORD myflags = 0;
1759
1760     /* All versions have a destination register */
1761     shader_arb_get_dst_param(ins, dst, reg_dest);
1762
1763     /* 1.0-1.4: Use destination register number as texture code.
1764        2.0+: Use provided sampler number as texure code. */
1765     if (shader_version < WINED3D_SHADER_VERSION(2,0))
1766         reg_sampler_code = dst->reg.idx;
1767     else
1768         reg_sampler_code = ins->src[1].reg.idx;
1769
1770     /* 1.0-1.3: Use the texcoord varying.
1771        1.4+: Use provided coordinate source register. */
1772     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1773         sprintf(reg_coord, "fragment.texcoord[%u]", reg_sampler_code);
1774     else {
1775         /* TEX is the only instruction that can handle DW and DZ natively */
1776         src = ins->src[0];
1777         if(src.modifiers == WINED3DSPSM_DW) src.modifiers = WINED3DSPSM_NONE;
1778         if(src.modifiers == WINED3DSPSM_DZ) src.modifiers = WINED3DSPSM_NONE;
1779         shader_arb_get_src_param(ins, &src, 0, reg_coord);
1780     }
1781
1782     /* projection flag:
1783      * 1.1, 1.2, 1.3: Use WINED3DTSS_TEXTURETRANSFORMFLAGS
1784      * 1.4: Use WINED3DSPSM_DZ or WINED3DSPSM_DW on src[0]
1785      * 2.0+: Use WINED3DSI_TEXLD_PROJECT on the opcode
1786      */
1787     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1788     {
1789         DWORD flags = 0;
1790         if(reg_sampler_code < MAX_TEXTURES) {
1791             flags = deviceImpl->stateBlock->textureState[reg_sampler_code][WINED3DTSS_TEXTURETRANSFORMFLAGS];
1792         }
1793         if (flags & WINED3DTTFF_PROJECTED) {
1794             myflags |= TEX_PROJ;
1795         }
1796     }
1797     else if (shader_version < WINED3D_SHADER_VERSION(2,0))
1798     {
1799         DWORD src_mod = ins->src[0].modifiers;
1800         if (src_mod == WINED3DSPSM_DZ) {
1801             /* TXP cannot handle DZ natively, so move the z coordinate to .w. reg_coord is a read-only
1802              * varying register, so we need a temp reg
1803              */
1804             shader_addline(ins->ctx->buffer, "SWZ TA, %s, x, y, z, z;\n", reg_coord);
1805             strcpy(reg_coord, "TA");
1806             myflags |= TEX_PROJ;
1807         } else if(src_mod == WINED3DSPSM_DW) {
1808             myflags |= TEX_PROJ;
1809         }
1810     } else {
1811         if (ins->flags & WINED3DSI_TEXLD_PROJECT) myflags |= TEX_PROJ;
1812         if (ins->flags & WINED3DSI_TEXLD_BIAS) myflags |= TEX_BIAS;
1813     }
1814     shader_hw_sample(ins, reg_sampler_code, reg_dest, reg_coord, myflags, NULL, NULL);
1815 }
1816
1817 static void pshader_hw_texcoord(const struct wined3d_shader_instruction *ins)
1818 {
1819     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1820     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1821     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1822             ins->ctx->reg_maps->shader_version.minor);
1823     char dst_str[50];
1824
1825     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1826     {
1827         DWORD reg = dst->reg.idx;
1828
1829         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1830         shader_addline(buffer, "MOV_SAT %s, fragment.texcoord[%u];\n", dst_str, reg);
1831     } else {
1832         char reg_src[40];
1833
1834         shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src);
1835         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1836         shader_addline(buffer, "MOV %s, %s;\n", dst_str, reg_src);
1837    }
1838 }
1839
1840 static void pshader_hw_texreg2ar(const struct wined3d_shader_instruction *ins)
1841 {
1842      struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1843      IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1844      IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1845      DWORD flags;
1846
1847      DWORD reg1 = ins->dst[0].reg.idx;
1848      char dst_str[50];
1849      char src_str[50];
1850
1851      /* Note that texreg2ar treats Tx as a temporary register, not as a varying */
1852      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1853      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1854      /* Move .x first in case src_str is "TA" */
1855      shader_addline(buffer, "MOV TA.y, %s.x;\n", src_str);
1856      shader_addline(buffer, "MOV TA.x, %s.w;\n", src_str);
1857      flags = reg1 < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg1][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1858      shader_hw_sample(ins, reg1, dst_str, "TA", flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1859 }
1860
1861 static void pshader_hw_texreg2gb(const struct wined3d_shader_instruction *ins)
1862 {
1863      struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1864
1865      DWORD reg1 = ins->dst[0].reg.idx;
1866      char dst_str[50];
1867      char src_str[50];
1868
1869      /* Note that texreg2gb treats Tx as a temporary register, not as a varying */
1870      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1871      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1872      shader_addline(buffer, "MOV TA.x, %s.y;\n", src_str);
1873      shader_addline(buffer, "MOV TA.y, %s.z;\n", src_str);
1874      shader_hw_sample(ins, reg1, dst_str, "TA", 0, NULL, NULL);
1875 }
1876
1877 static void pshader_hw_texreg2rgb(const struct wined3d_shader_instruction *ins)
1878 {
1879     DWORD reg1 = ins->dst[0].reg.idx;
1880     char dst_str[50];
1881     char src_str[50];
1882
1883     /* Note that texreg2rg treats Tx as a temporary register, not as a varying */
1884     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1885     shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1886     shader_hw_sample(ins, reg1, dst_str, src_str, 0, NULL, NULL);
1887 }
1888
1889 static void pshader_hw_texbem(const struct wined3d_shader_instruction *ins)
1890 {
1891     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1892     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1893     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1894     char reg_coord[40], dst_reg[50], src_reg[50];
1895     DWORD reg_dest_code;
1896
1897     /* All versions have a destination register. The Tx where the texture coordinates come
1898      * from is the varying incarnation of the texture register
1899      */
1900     reg_dest_code = dst->reg.idx;
1901     shader_arb_get_dst_param(ins, &ins->dst[0], dst_reg);
1902     shader_arb_get_src_param(ins, &ins->src[0], 0, src_reg);
1903     sprintf(reg_coord, "fragment.texcoord[%u]", reg_dest_code);
1904
1905     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1906      * The Tx in which the perturbation map is stored is the tempreg incarnation of the texture register
1907      *
1908      * GL_NV_fragment_program_option could handle this in one instruction via X2D:
1909      * X2D TA.xy, fragment.texcoord, T%u, bumpenvmat%u.xzyw
1910      *
1911      * However, the NV extensions are never enabled for <= 2.0 shaders because of the performance penalty that
1912      * comes with it, and texbem is an 1.x only instruction. No 1.x instruction forces us to enable the NV
1913      * extension.
1914      */
1915     shader_addline(buffer, "SWZ TB, bumpenvmat%d, x, z, 0, 0;\n", reg_dest_code);
1916     shader_addline(buffer, "DP3 TA.x, TB, %s;\n", src_reg);
1917     shader_addline(buffer, "SWZ TB, bumpenvmat%d, y, w, 0, 0;\n", reg_dest_code);
1918     shader_addline(buffer, "DP3 TA.y, TB, %s;\n", src_reg);
1919
1920     /* with projective textures, texbem only divides the static texture coord, not the displacement,
1921      * so we can't let the GL handle this.
1922      */
1923     if (((IWineD3DDeviceImpl*) This->baseShader.device)->stateBlock->textureState[reg_dest_code][WINED3DTSS_TEXTURETRANSFORMFLAGS]
1924             & WINED3DTTFF_PROJECTED) {
1925         shader_addline(buffer, "RCP TB.w, %s.w;\n", reg_coord);
1926         shader_addline(buffer, "MUL TB.xy, %s, TB.w;\n", reg_coord);
1927         shader_addline(buffer, "ADD TA.xy, TA, TB;\n");
1928     } else {
1929         shader_addline(buffer, "ADD TA.xy, TA, %s;\n", reg_coord);
1930     }
1931
1932     shader_hw_sample(ins, reg_dest_code, dst_reg, "TA", 0, NULL, NULL);
1933
1934     if (ins->handler_idx == WINED3DSIH_TEXBEML)
1935     {
1936         /* No src swizzles are allowed, so this is ok */
1937         shader_addline(buffer, "MAD TA, %s.z, luminance%d.x, luminance%d.y;\n",
1938                        src_reg, reg_dest_code, reg_dest_code);
1939         shader_addline(buffer, "MUL %s, %s, TA;\n", dst_reg, dst_reg);
1940     }
1941 }
1942
1943 static void pshader_hw_texm3x2pad(const struct wined3d_shader_instruction *ins)
1944 {
1945     DWORD reg = ins->dst[0].reg.idx;
1946     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1947     char src0_name[50], dst_name[50];
1948     BOOL is_color;
1949     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1950
1951     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1952     /* The next instruction will be a texm3x2tex or texm3x2depth that writes to the uninitialized
1953      * T<reg+1> register. Use this register to store the calculated vector
1954      */
1955     tmp_reg.idx = reg + 1;
1956     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1957     shader_addline(buffer, "DP3 %s.x, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1958 }
1959
1960 static void pshader_hw_texm3x2tex(const struct wined3d_shader_instruction *ins)
1961 {
1962     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1963     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1964     DWORD flags;
1965     DWORD reg = ins->dst[0].reg.idx;
1966     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1967     char dst_str[50];
1968     char src0_name[50];
1969     char dst_reg[50];
1970     BOOL is_color;
1971
1972     /* We know that we're writing to the uninitialized T<reg> register, so use it for temporary storage */
1973     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1974
1975     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1976     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1977     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1978     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1979     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1980 }
1981
1982 static void pshader_hw_texm3x3pad(const struct wined3d_shader_instruction *ins)
1983 {
1984     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1985     DWORD reg = ins->dst[0].reg.idx;
1986     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
1987     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1988     char src0_name[50], dst_name[50];
1989     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1990     BOOL is_color;
1991
1992     /* There are always 2 texm3x3pad instructions followed by one texm3x3[tex,vspec, ...] instruction, with
1993      * incrementing ins->dst[0].register_idx numbers. So the pad instruction already knows the final destination
1994      * register, and this register is uninitialized(otherwise the assembler complains that it is 'redeclared')
1995      */
1996     tmp_reg.idx = reg + 2 - current_state->current_row;
1997     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1998
1999     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2000     shader_addline(buffer, "DP3 %s.%c, fragment.texcoord[%u], %s;\n",
2001                    dst_name, 'x' + current_state->current_row, reg, src0_name);
2002     current_state->texcoord_w[current_state->current_row++] = reg;
2003 }
2004
2005 static void pshader_hw_texm3x3tex(const struct wined3d_shader_instruction *ins)
2006 {
2007     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
2008     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
2009     DWORD flags;
2010     DWORD reg = ins->dst[0].reg.idx;
2011     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2012     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
2013     char dst_str[50];
2014     char src0_name[50], dst_name[50];
2015     BOOL is_color;
2016
2017     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2018     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2019     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
2020
2021     /* Sample the texture using the calculated coordinates */
2022     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2023     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
2024     shader_hw_sample(ins, reg, dst_str, dst_name, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2025     current_state->current_row = 0;
2026 }
2027
2028 static void pshader_hw_texm3x3vspec(const struct wined3d_shader_instruction *ins)
2029 {
2030     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
2031     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
2032     DWORD flags;
2033     DWORD reg = ins->dst[0].reg.idx;
2034     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2035     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
2036     char dst_str[50];
2037     char src0_name[50];
2038     char dst_reg[50];
2039     BOOL is_color;
2040
2041     /* Get the dst reg without writemask strings. We know this register is uninitialized, so we can use all
2042      * components for temporary data storage
2043      */
2044     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
2045     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2046     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
2047
2048     /* Construct the eye-ray vector from w coordinates */
2049     shader_addline(buffer, "MOV TB.x, fragment.texcoord[%u].w;\n", current_state->texcoord_w[0]);
2050     shader_addline(buffer, "MOV TB.y, fragment.texcoord[%u].w;\n", current_state->texcoord_w[1]);
2051     shader_addline(buffer, "MOV TB.z, fragment.texcoord[%u].w;\n", reg);
2052
2053     /* Calculate reflection vector
2054      */
2055     shader_addline(buffer, "DP3 %s.w, %s, TB;\n", dst_reg, dst_reg);
2056     /* The .w is ignored when sampling, so I can use TB.w to calculate dot(N, N) */
2057     shader_addline(buffer, "DP3 TB.w, %s, %s;\n", dst_reg, dst_reg);
2058     shader_addline(buffer, "RCP TB.w, TB.w;\n");
2059     shader_addline(buffer, "MUL %s.w, %s.w, TB.w;\n", dst_reg, dst_reg);
2060     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
2061     shader_addline(buffer, "MAD %s, coefmul.x, %s, -TB;\n", dst_reg, dst_reg);
2062
2063     /* Sample the texture using the calculated coordinates */
2064     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2065     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
2066     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2067     current_state->current_row = 0;
2068 }
2069
2070 static void pshader_hw_texm3x3spec(const struct wined3d_shader_instruction *ins)
2071 {
2072     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
2073     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
2074     DWORD flags;
2075     DWORD reg = ins->dst[0].reg.idx;
2076     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
2077     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2078     char dst_str[50];
2079     char src0_name[50];
2080     char src1_name[50];
2081     char dst_reg[50];
2082     BOOL is_color;
2083
2084     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
2085     shader_arb_get_src_param(ins, &ins->src[0], 1, src1_name);
2086     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
2087     /* Note: dst_reg.xy is input here, generated by two texm3x3pad instructions */
2088     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
2089
2090     /* Calculate reflection vector.
2091      *
2092      *                   dot(N, E)
2093      * dst_reg.xyz = 2 * --------- * N - E
2094      *                   dot(N, N)
2095      *
2096      * Which normalizes the normal vector
2097      */
2098     shader_addline(buffer, "DP3 %s.w, %s, %s;\n", dst_reg, dst_reg, src1_name);
2099     shader_addline(buffer, "DP3 TC.w, %s, %s;\n", dst_reg, dst_reg);
2100     shader_addline(buffer, "RCP TC.w, TC.w;\n");
2101     shader_addline(buffer, "MUL %s.w, %s.w, TC.w;\n", dst_reg, dst_reg);
2102     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
2103     shader_addline(buffer, "MAD %s, coefmul.x, %s, -%s;\n", dst_reg, dst_reg, src1_name);
2104
2105     /* Sample the texture using the calculated coordinates */
2106     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2107     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
2108     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
2109     current_state->current_row = 0;
2110 }
2111
2112 static void pshader_hw_texdepth(const struct wined3d_shader_instruction *ins)
2113 {
2114     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2115     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2116     char dst_name[50];
2117
2118     /* texdepth has an implicit destination, the fragment depth value. It's only parameter,
2119      * which is essentially an input, is the destination register because it is the first
2120      * parameter. According to the msdn, this must be register r5, but let's keep it more flexible
2121      * here(writemasks/swizzles are not valid on texdepth)
2122      */
2123     shader_arb_get_dst_param(ins, dst, dst_name);
2124
2125     /* According to the msdn, the source register(must be r5) is unusable after
2126      * the texdepth instruction, so we're free to modify it
2127      */
2128     shader_addline(buffer, "MIN %s.y, %s.y, one.y;\n", dst_name, dst_name);
2129
2130     /* How to deal with the special case dst_name.g == 0? if r != 0, then
2131      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
2132      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
2133      */
2134     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
2135     shader_addline(buffer, "MUL TA.x, %s.x, %s.y;\n", dst_name, dst_name);
2136     shader_addline(buffer, "MIN TA.x, TA.x, one.x;\n");
2137     shader_addline(buffer, "MAX result.depth, TA.x, 0.0;\n");
2138 }
2139
2140 /** Process the WINED3DSIO_TEXDP3TEX instruction in ARB:
2141  * Take a 3-component dot product of the TexCoord[dstreg] and src,
2142  * then perform a 1D texture lookup from stage dstregnum, place into dst. */
2143 static void pshader_hw_texdp3tex(const struct wined3d_shader_instruction *ins)
2144 {
2145     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2146     DWORD sampler_idx = ins->dst[0].reg.idx;
2147     char src0[50];
2148     char dst_str[50];
2149
2150     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2151     shader_addline(buffer, "MOV TB, 0.0;\n");
2152     shader_addline(buffer, "DP3 TB.x, fragment.texcoord[%u], %s;\n", sampler_idx, src0);
2153
2154     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2155     shader_hw_sample(ins, sampler_idx, dst_str, "TB", 0 /* Only one coord, can't be projected */, NULL, NULL);
2156 }
2157
2158 /** Process the WINED3DSIO_TEXDP3 instruction in ARB:
2159  * Take a 3-component dot product of the TexCoord[dstreg] and src. */
2160 static void pshader_hw_texdp3(const struct wined3d_shader_instruction *ins)
2161 {
2162     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2163     char src0[50];
2164     char dst_str[50];
2165     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2166
2167     /* Handle output register */
2168     shader_arb_get_dst_param(ins, dst, dst_str);
2169     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2170     shader_addline(buffer, "DP3 %s, fragment.texcoord[%u], %s;\n", dst_str, dst->reg.idx, src0);
2171 }
2172
2173 /** Process the WINED3DSIO_TEXM3X3 instruction in ARB
2174  * Perform the 3rd row of a 3x3 matrix multiply */
2175 static void pshader_hw_texm3x3(const struct wined3d_shader_instruction *ins)
2176 {
2177     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2178     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2179     char dst_str[50], dst_name[50];
2180     char src0[50];
2181     BOOL is_color;
2182
2183     shader_arb_get_dst_param(ins, dst, dst_str);
2184     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2185     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2186     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2187     shader_addline(buffer, "MOV %s, %s;\n", dst_str, dst_name);
2188 }
2189
2190 /** Process the WINED3DSIO_TEXM3X2DEPTH instruction in ARB:
2191  * Last row of a 3x2 matrix multiply, use the result to calculate the depth:
2192  * Calculate tmp0.y = TexCoord[dstreg] . src.xyz;  (tmp0.x has already been calculated)
2193  * depth = (tmp0.y == 0.0) ? 1.0 : tmp0.x / tmp0.y
2194  */
2195 static void pshader_hw_texm3x2depth(const struct wined3d_shader_instruction *ins)
2196 {
2197     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2198     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2199     char src0[50], dst_name[50];
2200     BOOL is_color;
2201
2202     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2203     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2204     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2205
2206     /* How to deal with the special case dst_name.g == 0? if r != 0, then
2207      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
2208      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
2209      */
2210     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
2211     shader_addline(buffer, "MUL %s.x, %s.x, %s.y;\n", dst_name, dst_name, dst_name);
2212     shader_addline(buffer, "MIN %s.x, %s.x, one.x;\n", dst_name, dst_name);
2213     shader_addline(buffer, "MAX result.depth, %s.x, 0.0;\n", dst_name);
2214 }
2215
2216 /** Handles transforming all WINED3DSIO_M?x? opcodes for
2217     Vertex/Pixel shaders to ARB_vertex_program codes */
2218 static void shader_hw_mnxn(const struct wined3d_shader_instruction *ins)
2219 {
2220     int i;
2221     int nComponents = 0;
2222     struct wined3d_shader_dst_param tmp_dst = {{0}};
2223     struct wined3d_shader_src_param tmp_src[2] = {{{0}}};
2224     struct wined3d_shader_instruction tmp_ins;
2225
2226     memset(&tmp_ins, 0, sizeof(tmp_ins));
2227
2228     /* Set constants for the temporary argument */
2229     tmp_ins.ctx = ins->ctx;
2230     tmp_ins.dst_count = 1;
2231     tmp_ins.dst = &tmp_dst;
2232     tmp_ins.src_count = 2;
2233     tmp_ins.src = tmp_src;
2234
2235     switch(ins->handler_idx)
2236     {
2237         case WINED3DSIH_M4x4:
2238             nComponents = 4;
2239             tmp_ins.handler_idx = WINED3DSIH_DP4;
2240             break;
2241         case WINED3DSIH_M4x3:
2242             nComponents = 3;
2243             tmp_ins.handler_idx = WINED3DSIH_DP4;
2244             break;
2245         case WINED3DSIH_M3x4:
2246             nComponents = 4;
2247             tmp_ins.handler_idx = WINED3DSIH_DP3;
2248             break;
2249         case WINED3DSIH_M3x3:
2250             nComponents = 3;
2251             tmp_ins.handler_idx = WINED3DSIH_DP3;
2252             break;
2253         case WINED3DSIH_M3x2:
2254             nComponents = 2;
2255             tmp_ins.handler_idx = WINED3DSIH_DP3;
2256             break;
2257         default:
2258             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2259             break;
2260     }
2261
2262     tmp_dst = ins->dst[0];
2263     tmp_src[0] = ins->src[0];
2264     tmp_src[1] = ins->src[1];
2265     for (i = 0; i < nComponents; i++) {
2266         tmp_dst.write_mask = WINED3DSP_WRITEMASK_0 << i;
2267         shader_hw_map2gl(&tmp_ins);
2268         ++tmp_src[1].reg.idx;
2269     }
2270 }
2271
2272 static void shader_hw_scalar_op(const struct wined3d_shader_instruction *ins)
2273 {
2274     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2275     const char *instruction;
2276
2277     char dst[50];
2278     char src[50];
2279
2280     switch(ins->handler_idx)
2281     {
2282         case WINED3DSIH_RSQ:  instruction = "RSQ"; break;
2283         case WINED3DSIH_RCP:  instruction = "RCP"; break;
2284         case WINED3DSIH_EXP:  instruction = "EX2"; break;
2285         case WINED3DSIH_EXPP: instruction = "EXP"; break;
2286         default: instruction = "";
2287             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2288             break;
2289     }
2290
2291     shader_arb_get_dst_param(ins, &ins->dst[0], dst); /* Destination */
2292     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2293     if (ins->src[0].swizzle == WINED3DSP_NOSWIZZLE)
2294     {
2295         /* Dx sdk says .x is used if no swizzle is given, but our test shows that
2296          * .w is used
2297          */
2298         strcat(src, ".w");
2299     }
2300
2301     shader_addline(buffer, "%s%s %s, %s;\n", instruction, shader_arb_get_modifier(ins), dst, src);
2302 }
2303
2304 static void shader_hw_nrm(const struct wined3d_shader_instruction *ins)
2305 {
2306     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2307     char dst_name[50];
2308     char src_name[50];
2309     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2310     BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
2311
2312     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2313     shader_arb_get_src_param(ins, &ins->src[0], 1 /* Use TB */, src_name);
2314
2315     if(pshader && priv->target_version >= NV3)
2316     {
2317         shader_addline(buffer, "NRM%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
2318     }
2319     else
2320     {
2321         shader_addline(buffer, "DP3 TA, %s, %s;\n", src_name, src_name);
2322         shader_addline(buffer, "RSQ TA, TA.x;\n");
2323         /* dst.w = src[0].w * 1 / (src.x^2 + src.y^2 + src.z^2)^(1/2) according to msdn*/
2324         shader_addline(buffer, "MUL%s %s, %s, TA;\n", shader_arb_get_modifier(ins), dst_name,
2325                     src_name);
2326     }
2327 }
2328
2329 static void shader_hw_lrp(const struct wined3d_shader_instruction *ins)
2330 {
2331     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2332     char dst_name[50];
2333     char src_name[3][50];
2334
2335     /* ARB_fragment_program has a convenient LRP instruction */
2336     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
2337         shader_hw_map2gl(ins);
2338         return;
2339     }
2340
2341     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2342     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
2343     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
2344     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
2345
2346     shader_addline(buffer, "SUB TA, %s, %s;\n", src_name[1], src_name[2]);
2347     shader_addline(buffer, "MAD%s %s, %s, TA, %s;\n", shader_arb_get_modifier(ins),
2348                    dst_name, src_name[0], src_name[2]);
2349 }
2350
2351 static void shader_hw_sincos(const struct wined3d_shader_instruction *ins)
2352 {
2353     /* This instruction exists in ARB, but the d3d instruction takes two extra parameters which
2354      * must contain fixed constants. So we need a separate function to filter those constants and
2355      * can't use map2gl
2356      */
2357     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2358     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2359     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2360     char dst_name[50];
2361     char src_name0[50], src_name1[50], src_name2[50];
2362     BOOL is_color;
2363
2364     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2365     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
2366         shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2367         /* No modifiers are supported on SCS */
2368         shader_addline(buffer, "SCS %s, %s;\n", dst_name, src_name0);
2369
2370         if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
2371         {
2372             shader_arb_get_register_name(ins, &dst->reg, src_name0, &is_color);
2373             shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name0);
2374         }
2375     } else if(priv->target_version >= NV2) {
2376         shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
2377
2378         /* Sincos writemask must be .x, .y or .xy */
2379         if(dst->write_mask & WINED3DSP_WRITEMASK_0)
2380             shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
2381         if(dst->write_mask & WINED3DSP_WRITEMASK_1)
2382             shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
2383     } else {
2384         /* Approximate sine and cosine with a taylor series, as per math textbook. The application passes 8
2385          * helper constants(D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2.
2386          *
2387          * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
2388          * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
2389          *
2390          * The constants we get are:
2391          *
2392          *  +1   +1,     -1     -1     +1      +1      -1       -1
2393          *      ---- ,  ---- , ---- , ----- , ----- , ----- , ------
2394          *      1!*2    2!*4   3!*8   4!*16   5!*32   6!*64   7!*128
2395          *
2396          * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2):
2397          *
2398          * (x/2)^2 = x^2 / 4
2399          * (x/2)^3 = x^3 / 8
2400          * (x/2)^4 = x^4 / 16
2401          * (x/2)^5 = x^5 / 32
2402          * etc
2403          *
2404          * To get the final result:
2405          * sin(x) = 2 * sin(x/2) * cos(x/2)
2406          * cos(x) = cos(x/2)^2 - sin(x/2)^2
2407          * (from sin(x+y) and cos(x+y) rules)
2408          *
2409          * As per MSDN, dst.z is undefined after the operation, and so is
2410          * dst.x and dst.y if they're masked out by the writemask. Ie
2411          * sincos dst.y, src1, c0, c1
2412          * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler
2413          * vsa.exe also stops with an error if the dest register is the same register as the source
2414          * register. This means we can use dest.xyz as temporary storage. The assembler vsa.exe output also
2415          * indicates that sincos consumes 8 instruction slots in vs_2_0(and, strangely, in vs_3_0).
2416          */
2417         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2418         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2);
2419         shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
2420
2421         shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0);  /* x ^ 2 */
2422         shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0);           /* x ^ 3 */
2423         shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0);           /* x ^ 4 */
2424         shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0);           /* x ^ 5 */
2425         shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0);           /* x ^ 6 */
2426         shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0);           /* x ^ 7 */
2427
2428         /* sin(x/2)
2429          *
2430          * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to
2431          * properly merge that with MULs in the code above?
2432          * The swizzles .yz and xw however fit into the .yzxw swizzle added to ps_2_0. Maybe
2433          * we can merge the sine and cosine MAD rows to calculate them together.
2434          */
2435         shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */
2436         shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */
2437         shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */
2438         shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */
2439
2440         /* cos(x/2) */
2441         shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */
2442         shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */
2443         shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */
2444
2445         if(dst->write_mask & WINED3DSP_WRITEMASK_0) {
2446             /* cos x */
2447             shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n");
2448             shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name);
2449         }
2450         if(dst->write_mask & WINED3DSP_WRITEMASK_1) {
2451             /* sin x */
2452             shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name);
2453             shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name);
2454         }
2455     }
2456 }
2457
2458 static void shader_hw_sgn(const struct wined3d_shader_instruction *ins)
2459 {
2460     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2461     char dst_name[50];
2462     char src_name[50];
2463     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
2464
2465     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2466     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
2467
2468     /* SGN is only valid in vertex shaders */
2469     if(ctx->target_version >= NV2) {
2470         shader_addline(buffer, "SSG%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
2471         return;
2472     }
2473
2474     /* If SRC > 0.0, -SRC < SRC = TRUE, otherwise false.
2475      * if SRC < 0.0,  SRC < -SRC = TRUE. If neither is true, src = 0.0
2476      */
2477     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) {
2478         shader_addline(buffer, "SLT %s, -%s, %s;\n", dst_name, src_name, src_name);
2479     } else {
2480         /* src contains TA? Write to the dest first. This won't overwrite our destination.
2481          * Then use TA, and calculate the final result
2482          *
2483          * Not reading from TA? Store the first result in TA to avoid overwriting the
2484          * destination if src reg = dst reg
2485          */
2486         if(strstr(src_name, "TA"))
2487         {
2488             shader_addline(buffer, "SLT %s,  %s, -%s;\n", dst_name, src_name, src_name);
2489             shader_addline(buffer, "SLT TA, -%s, %s;\n", src_name, src_name);
2490             shader_addline(buffer, "ADD %s, %s, -TA;\n", dst_name, dst_name);
2491         }
2492         else
2493         {
2494             shader_addline(buffer, "SLT TA, -%s, %s;\n", src_name, src_name);
2495             shader_addline(buffer, "SLT %s,  %s, -%s;\n", dst_name, src_name, src_name);
2496             shader_addline(buffer, "ADD %s, TA, -%s;\n", dst_name, dst_name);
2497         }
2498     }
2499 }
2500
2501 static void shader_hw_dsy(const struct wined3d_shader_instruction *ins)
2502 {
2503     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2504     char src[50];
2505     char dst[50];
2506     char dst_name[50];
2507     BOOL is_color;
2508
2509     shader_arb_get_dst_param(ins, &ins->dst[0], dst);
2510     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2511     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2512
2513     shader_addline(buffer, "DDY %s, %s;\n", dst, src);
2514     shader_addline(buffer, "MUL%s %s, %s, ycorrection.y;\n", shader_arb_get_modifier(ins), dst, dst_name);
2515 }
2516
2517 static DWORD abs_modifier(DWORD mod, BOOL *need_abs)
2518 {
2519     *need_abs = FALSE;
2520
2521     switch(mod)
2522     {
2523         case WINED3DSPSM_NONE:      return WINED3DSPSM_ABS;
2524         case WINED3DSPSM_NEG:       return WINED3DSPSM_ABS;
2525         case WINED3DSPSM_BIAS:      *need_abs = TRUE; return WINED3DSPSM_BIAS;
2526         case WINED3DSPSM_BIASNEG:   *need_abs = TRUE; return WINED3DSPSM_BIASNEG;
2527         case WINED3DSPSM_SIGN:      *need_abs = TRUE; return WINED3DSPSM_SIGN;
2528         case WINED3DSPSM_SIGNNEG:   *need_abs = TRUE; return WINED3DSPSM_SIGNNEG;
2529         case WINED3DSPSM_COMP:      *need_abs = TRUE; return WINED3DSPSM_COMP;
2530         case WINED3DSPSM_X2:        *need_abs = TRUE; return WINED3DSPSM_X2;
2531         case WINED3DSPSM_X2NEG:     *need_abs = TRUE; return WINED3DSPSM_X2NEG;
2532         case WINED3DSPSM_DZ:        *need_abs = TRUE; return WINED3DSPSM_DZ;
2533         case WINED3DSPSM_DW:        *need_abs = TRUE; return WINED3DSPSM_DW;
2534         case WINED3DSPSM_ABS:       return WINED3DSPSM_ABS;
2535         case WINED3DSPSM_ABSNEG:    return WINED3DSPSM_ABS;
2536     }
2537     FIXME("Unknown modifier %u\n", mod);
2538     return mod;
2539 }
2540
2541 static void shader_hw_log_pow(const struct wined3d_shader_instruction *ins)
2542 {
2543     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2544     char src0[50], src1[50], dst[50];
2545     struct wined3d_shader_src_param src0_copy = ins->src[0];
2546     BOOL need_abs = FALSE;
2547     const char *instr;
2548     BOOL arg2 = FALSE;
2549
2550     switch(ins->handler_idx)
2551     {
2552         case WINED3DSIH_LOG:  instr = "LG2"; break;
2553         case WINED3DSIH_LOGP: instr = "LOG"; break;
2554         case WINED3DSIH_POW:  instr = "POW"; arg2 = TRUE; break;
2555         default:
2556             ERR("Unexpected instruction %d\n", ins->handler_idx);
2557             return;
2558     }
2559
2560     /* LOG, LOGP and POW operate on the absolute value of the input */
2561     src0_copy.modifiers = abs_modifier(src0_copy.modifiers, &need_abs);
2562
2563     shader_arb_get_dst_param(ins, &ins->dst[0], dst);
2564     shader_arb_get_src_param(ins, &src0_copy, 0, src0);
2565     if(arg2) shader_arb_get_src_param(ins, &ins->src[1], 1, src1);
2566
2567     if(need_abs)
2568     {
2569         shader_addline(buffer, "ABS TA, %s;\n", src0);
2570         if(arg2)
2571         {
2572             shader_addline(buffer, "%s%s %s, TA, %s;\n", instr, shader_arb_get_modifier(ins), dst, src1);
2573         }
2574         else
2575         {
2576             shader_addline(buffer, "%s%s %s, TA;\n", instr, shader_arb_get_modifier(ins), dst);
2577         }
2578     }
2579     else if(arg2)
2580     {
2581         shader_addline(buffer, "%s%s %s, %s, %s;\n", instr, shader_arb_get_modifier(ins), dst, src0, src1);
2582     }
2583     else
2584     {
2585         shader_addline(buffer, "%s%s %s, %s;\n", instr, shader_arb_get_modifier(ins), dst, src0);
2586     }
2587 }
2588
2589 static void shader_hw_loop(const struct wined3d_shader_instruction *ins)
2590 {
2591     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2592     char src_name[50];
2593     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2594
2595     /* src0 is aL */
2596     shader_arb_get_src_param(ins, &ins->src[1], 0, src_name);
2597
2598     if(vshader)
2599     {
2600         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2601         struct list *e = list_head(&priv->control_frames);
2602         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2603
2604         if(priv->loop_depth > 1) shader_addline(buffer, "PUSHA aL;\n");
2605         /* The constant loader makes sure to load -1 into iX.w */
2606         shader_addline(buffer, "ARLC aL, %s.xywz;\n", src_name);
2607         shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", control_frame->loop_no);
2608         shader_addline(buffer, "loop_%u_start:\n", control_frame->loop_no);
2609     }
2610     else
2611     {
2612         shader_addline(buffer, "LOOP %s;\n", src_name);
2613     }
2614 }
2615
2616 static void shader_hw_rep(const struct wined3d_shader_instruction *ins)
2617 {
2618     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2619     char src_name[50];
2620     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2621
2622     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
2623
2624     /* The constant loader makes sure to load -1 into iX.w */
2625     if(vshader)
2626     {
2627         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2628         struct list *e = list_head(&priv->control_frames);
2629         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2630
2631         if(priv->loop_depth > 1) shader_addline(buffer, "PUSHA aL;\n");
2632
2633         shader_addline(buffer, "ARLC aL, %s.xywz;\n", src_name);
2634         shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", control_frame->loop_no);
2635         shader_addline(buffer, "loop_%u_start:\n", control_frame->loop_no);
2636     }
2637     else
2638     {
2639         shader_addline(buffer, "REP %s;\n", src_name);
2640     }
2641 }
2642
2643 static void shader_hw_endloop(const struct wined3d_shader_instruction *ins)
2644 {
2645     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2646     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2647
2648     if(vshader)
2649     {
2650         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2651         struct list *e = list_head(&priv->control_frames);
2652         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2653
2654         shader_addline(buffer, "ARAC aL.xy, aL;\n");
2655         shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", control_frame->loop_no);
2656         shader_addline(buffer, "loop_%u_end:\n", control_frame->loop_no);
2657
2658         if(priv->loop_depth > 1) shader_addline(buffer, "POPA aL;\n");
2659     }
2660     else
2661     {
2662         shader_addline(buffer, "ENDLOOP;\n");
2663     }
2664 }
2665
2666 static void shader_hw_endrep(const struct wined3d_shader_instruction *ins)
2667 {
2668     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2669     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2670
2671     if(vshader)
2672     {
2673         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2674         struct list *e = list_head(&priv->control_frames);
2675         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2676
2677         shader_addline(buffer, "ARAC aL.xy, aL;\n");
2678         shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", control_frame->loop_no);
2679         shader_addline(buffer, "loop_%u_end:\n", control_frame->loop_no);
2680
2681         if(priv->loop_depth > 1) shader_addline(buffer, "POPA aL;\n");
2682     }
2683     else
2684     {
2685         shader_addline(buffer, "ENDREP;\n");
2686     }
2687 }
2688
2689 static const struct control_frame *find_last_loop(const struct shader_arb_ctx_priv *priv)
2690 {
2691     struct control_frame *control_frame;
2692
2693     LIST_FOR_EACH_ENTRY(control_frame, &priv->control_frames, struct control_frame, entry)
2694     {
2695         if(control_frame->type == LOOP || control_frame->type == REP) return control_frame;
2696     }
2697     ERR("Could not find loop for break\n");
2698     return NULL;
2699 }
2700
2701 static void shader_hw_break(const struct wined3d_shader_instruction *ins)
2702 {
2703     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2704     const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
2705     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2706
2707     if(vshader)
2708     {
2709         shader_addline(buffer, "BRA loop_%u_end;\n", control_frame->loop_no);
2710     }
2711     else
2712     {
2713         shader_addline(buffer, "BRK;\n");
2714     }
2715 }
2716
2717 static const char *get_compare(COMPARISON_TYPE flags)
2718 {
2719     switch (flags)
2720     {
2721         case COMPARISON_GT: return "GT";
2722         case COMPARISON_EQ: return "EQ";
2723         case COMPARISON_GE: return "GE";
2724         case COMPARISON_LT: return "LT";
2725         case COMPARISON_NE: return "NE";
2726         case COMPARISON_LE: return "LE";
2727         default:
2728             FIXME("Unrecognized comparison value: %u\n", flags);
2729             return "(\?\?)";
2730     }
2731 }
2732
2733 static COMPARISON_TYPE invert_compare(COMPARISON_TYPE flags)
2734 {
2735     switch (flags)
2736     {
2737         case COMPARISON_GT: return COMPARISON_LE;
2738         case COMPARISON_EQ: return COMPARISON_NE;
2739         case COMPARISON_GE: return COMPARISON_LT;
2740         case COMPARISON_LT: return COMPARISON_GE;
2741         case COMPARISON_NE: return COMPARISON_EQ;
2742         case COMPARISON_LE: return COMPARISON_GT;
2743         default:
2744             FIXME("Unrecognized comparison value: %u\n", flags);
2745             return -1;
2746     }
2747 }
2748
2749 static void shader_hw_breakc(const struct wined3d_shader_instruction *ins)
2750 {
2751     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2752     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2753     const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
2754     char src_name0[50];
2755     char src_name1[50];
2756     const char *comp = get_compare(ins->flags);
2757
2758     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2759     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2760
2761     if(vshader)
2762     {
2763         /* SUBC CC, src0, src1" works only in pixel shaders, so use TA to throw
2764          * away the subtraction result
2765          */
2766         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2767         shader_addline(buffer, "BRA loop_%u_end (%s.x);\n", control_frame->loop_no, comp);
2768     }
2769     else
2770     {
2771         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2772         shader_addline(buffer, "BRK (%s.x);\n", comp);
2773     }
2774 }
2775
2776 static void shader_hw_ifc(const struct wined3d_shader_instruction *ins)
2777 {
2778     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2779     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2780     struct list *e = list_head(&priv->control_frames);
2781     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2782     const char *comp;
2783     char src_name0[50];
2784     char src_name1[50];
2785     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2786
2787     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2788     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2789
2790     if(vshader)
2791     {
2792         /* Invert the flag. We jump to the else label if the condition is NOT true */
2793         comp = get_compare(invert_compare(ins->flags));
2794         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2795         shader_addline(buffer, "BRA ifc_%u_else (%s.x);\n", control_frame->ifc_no, comp);
2796     }
2797     else
2798     {
2799         comp = get_compare(ins->flags);
2800         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2801         shader_addline(buffer, "IF %s.x;\n", comp);
2802     }
2803 }
2804
2805 static void shader_hw_else(const struct wined3d_shader_instruction *ins)
2806 {
2807     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2808     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2809     struct list *e = list_head(&priv->control_frames);
2810     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2811     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2812
2813     if(vshader)
2814     {
2815         shader_addline(buffer, "BRA ifc_%u_endif;\n", control_frame->ifc_no);
2816         shader_addline(buffer, "ifc_%u_else:\n", control_frame->ifc_no);
2817         control_frame->had_else = TRUE;
2818     }
2819     else
2820     {
2821         shader_addline(buffer, "ELSE;\n");
2822     }
2823 }
2824
2825 static void shader_hw_endif(const struct wined3d_shader_instruction *ins)
2826 {
2827     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2828     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2829     struct list *e = list_head(&priv->control_frames);
2830     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2831     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2832
2833     if(vshader)
2834     {
2835         if(control_frame->had_else)
2836         {
2837             shader_addline(buffer, "ifc_%u_endif:\n", control_frame->ifc_no);
2838         }
2839         else
2840         {
2841             shader_addline(buffer, "#No else branch. else is endif\n");
2842             shader_addline(buffer, "ifc_%u_else:\n", control_frame->ifc_no);
2843         }
2844     }
2845     else
2846     {
2847         shader_addline(buffer, "ENDIF;\n");
2848     }
2849 }
2850
2851 static void shader_hw_texldd(const struct wined3d_shader_instruction *ins)
2852 {
2853     DWORD sampler_idx = ins->src[1].reg.idx;
2854     char reg_dest[40];
2855     char reg_src[3][40];
2856     DWORD flags = TEX_DERIV;
2857
2858     shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
2859     shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src[0]);
2860     shader_arb_get_src_param(ins, &ins->src[2], 1, reg_src[1]);
2861     shader_arb_get_src_param(ins, &ins->src[3], 2, reg_src[2]);
2862
2863     if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
2864     if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;
2865
2866     shader_hw_sample(ins, sampler_idx, reg_dest, reg_src[0], flags, reg_src[1], reg_src[2]);
2867 }
2868
2869 static void shader_hw_texldl(const struct wined3d_shader_instruction *ins)
2870 {
2871     DWORD sampler_idx = ins->src[1].reg.idx;
2872     char reg_dest[40];
2873     char reg_coord[40];
2874     DWORD flags = TEX_LOD;
2875
2876     shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
2877     shader_arb_get_src_param(ins, &ins->src[0], 0, reg_coord);
2878
2879     if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
2880     if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;
2881
2882     shader_hw_sample(ins, sampler_idx, reg_dest, reg_coord, flags, NULL, NULL);
2883 }
2884
2885 static void shader_hw_label(const struct wined3d_shader_instruction *ins)
2886 {
2887     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2888     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2889
2890     priv->in_main_func = FALSE;
2891     /* Call instructions activate the NV extensions, not labels and rets. If there is an uncalled
2892      * subroutine, don't generate a label that will make GL complain
2893      */
2894     if(priv->target_version == ARB) return;
2895
2896     shader_addline(buffer, "l%u:\n", ins->src[0].reg.idx);
2897 }
2898
2899 static void vshader_add_footer(IWineD3DVertexShaderImpl *This, struct wined3d_shader_buffer *buffer,
2900         const struct arb_vs_compile_args *args, struct shader_arb_ctx_priv *priv_ctx)
2901 {
2902     const shader_reg_maps *reg_maps = &This->baseShader.reg_maps;
2903     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)This->baseShader.device;
2904     const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
2905     unsigned int i;
2906
2907     /* The D3DRS_FOGTABLEMODE render state defines if the shader-generated fog coord is used
2908      * or if the fragment depth is used. If the fragment depth is used(FOGTABLEMODE != NONE),
2909      * the fog frag coord is thrown away. If the fog frag coord is used, but not written by
2910      * the shader, it is set to 0.0(fully fogged, since start = 1.0, end = 0.0)
2911      */
2912     if(args->super.fog_src == VS_FOG_Z) {
2913         shader_addline(buffer, "MOV result.fogcoord, TMP_OUT.z;\n");
2914     } else if (!reg_maps->fog) {
2915         /* posFixup.x is always 1.0, so we can savely use it */
2916         shader_addline(buffer, "ADD result.fogcoord, posFixup.x, -posFixup.x;\n");
2917     }
2918
2919     /* Write the final position.
2920      *
2921      * OpenGL coordinates specify the center of the pixel while d3d coords specify
2922      * the corner. The offsets are stored in z and w in posFixup. posFixup.y contains
2923      * 1.0 or -1.0 to turn the rendering upside down for offscreen rendering. PosFixup.x
2924      * contains 1.0 to allow a mad, but arb vs swizzles are too restricted for that.
2925      */
2926     shader_addline(buffer, "MUL TA, posFixup, TMP_OUT.w;\n");
2927     shader_addline(buffer, "ADD TMP_OUT.x, TMP_OUT.x, TA.z;\n");
2928     shader_addline(buffer, "MAD TMP_OUT.y, TMP_OUT.y, posFixup.y, TA.w;\n");
2929
2930     if(use_nv_clip(gl_info) && priv_ctx->target_version >= NV2)
2931     {
2932         for(i = 0; i < priv_ctx->vs_clipplanes; i++)
2933         {
2934             shader_addline(buffer, "DP4 result.clip[%u].x, TMP_OUT, state.clip[%u].plane;\n", i, i);
2935         }
2936     }
2937     else if(args->boolclip.clip_texcoord)
2938     {
2939         unsigned int cur_clip = 0;
2940         char component[4] = {'x', 'y', 'z', 'w'};
2941
2942         for(i = 0; i < GL_LIMITS(clipplanes); i++)
2943         {
2944             if(args->boolclip.clipplane_mask & (1 << i))
2945             {
2946                 shader_addline(buffer, "DP4 TA.%c, TMP_OUT, state.clip[%u].plane;\n",
2947                                component[cur_clip++], i);
2948             }
2949         }
2950         switch(cur_clip)
2951         {
2952             case 0:
2953                 shader_addline(buffer, "MOV TA, -helper_const.w;\n");
2954                 break;
2955             case 1:
2956                 shader_addline(buffer, "MOV TA.yzw, -helper_const.w;\n");
2957                 break;
2958             case 2:
2959                 shader_addline(buffer, "MOV TA.zw, -helper_const.w;\n");
2960                 break;
2961             case 3:
2962                 shader_addline(buffer, "MOV TA.w, -helper_const.w;\n");
2963                 break;
2964         }
2965         shader_addline(buffer, "MOV result.texcoord[%u], TA;\n",
2966                        args->boolclip.clip_texcoord - 1);
2967     }
2968
2969     /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection in state.c
2970      * and the glsl equivalent
2971      */
2972     if(need_helper_const(gl_info)) {
2973         shader_addline(buffer, "MAD TMP_OUT.z, TMP_OUT.z, helper_const.x, -TMP_OUT.w;\n");
2974     } else {
2975         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, TMP_OUT.z;\n");
2976         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, -TMP_OUT.w;\n");
2977     }
2978
2979     shader_addline(buffer, "MOV result.position, TMP_OUT;\n");
2980
2981     priv_ctx->footer_written = TRUE;
2982 }
2983
2984 static void shader_hw_ret(const struct wined3d_shader_instruction *ins)
2985 {
2986     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
2987     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2988     IWineD3DBaseShaderImpl *shader = (IWineD3DBaseShaderImpl *) ins->ctx->shader;
2989     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2990
2991     if(priv->target_version == ARB) return;
2992
2993     if(vshader)
2994     {
2995         if(priv->in_main_func) vshader_add_footer((IWineD3DVertexShaderImpl *) shader, buffer, priv->cur_vs_args, priv);
2996     }
2997
2998     shader_addline(buffer, "RET;\n");
2999 }
3000
3001 static void shader_hw_call(const struct wined3d_shader_instruction *ins)
3002 {
3003     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
3004     shader_addline(buffer, "CAL l%u;\n", ins->src[0].reg.idx);
3005 }
3006
3007 /* GL locking is done by the caller */
3008 static GLuint create_arb_blt_vertex_program(const struct wined3d_gl_info *gl_info)
3009 {
3010     GLuint program_id = 0;
3011     GLint pos;
3012
3013     const char *blt_vprogram =
3014         "!!ARBvp1.0\n"
3015         "PARAM c[1] = { { 1, 0.5 } };\n"
3016         "MOV result.position, vertex.position;\n"
3017         "MOV result.color, c[0].x;\n"
3018         "MOV result.texcoord[0], vertex.texcoord[0];\n"
3019         "END\n";
3020
3021     GL_EXTCALL(glGenProgramsARB(1, &program_id));
3022     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, program_id));
3023     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3024             strlen(blt_vprogram), blt_vprogram));
3025     checkGLcall("glProgramStringARB()");
3026
3027     glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
3028     if (pos != -1)
3029     {
3030         FIXME("Vertex program error at position %d: %s\n", pos,
3031             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3032     }
3033     else
3034     {
3035         GLint native;
3036
3037         GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
3038         checkGLcall("glGetProgramivARB()");
3039         if (!native) WARN("Program exceeds native resource limits.\n");
3040     }
3041
3042     return program_id;
3043 }
3044
3045 /* GL locking is done by the caller */
3046 static GLuint create_arb_blt_fragment_program(const struct wined3d_gl_info *gl_info, enum tex_types tex_type)
3047 {
3048     GLuint program_id = 0;
3049     GLint pos;
3050
3051     static const char * const blt_fprograms[tex_type_count] =
3052     {
3053         /* tex_1d */
3054         NULL,
3055         /* tex_2d */
3056         "!!ARBfp1.0\n"
3057         "TEMP R0;\n"
3058         "TEX R0.x, fragment.texcoord[0], texture[0], 2D;\n"
3059         "MOV result.depth.z, R0.x;\n"
3060         "END\n",
3061         /* tex_3d */
3062         NULL,
3063         /* tex_cube */
3064         "!!ARBfp1.0\n"
3065         "TEMP R0;\n"
3066         "TEX R0.x, fragment.texcoord[0], texture[0], CUBE;\n"
3067         "MOV result.depth.z, R0.x;\n"
3068         "END\n",
3069         /* tex_rect */
3070         "!!ARBfp1.0\n"
3071         "TEMP R0;\n"
3072         "TEX R0.x, fragment.texcoord[0], texture[0], RECT;\n"
3073         "MOV result.depth.z, R0.x;\n"
3074         "END\n",
3075     };
3076
3077     if (!blt_fprograms[tex_type])
3078     {
3079         FIXME("tex_type %#x not supported\n", tex_type);
3080         tex_type = tex_2d;
3081     }
3082
3083     GL_EXTCALL(glGenProgramsARB(1, &program_id));
3084     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, program_id));
3085     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3086             strlen(blt_fprograms[tex_type]), blt_fprograms[tex_type]));
3087     checkGLcall("glProgramStringARB()");
3088
3089     glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
3090     if (pos != -1)
3091     {
3092         FIXME("Fragment program error at position %d: %s\n", pos,
3093             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3094     }
3095     else
3096     {
3097         GLint native;
3098
3099         GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
3100         checkGLcall("glGetProgramivARB()");
3101         if (!native) WARN("Program exceeds native resource limits.\n");
3102     }
3103
3104     return program_id;
3105 }
3106
3107 static void arbfp_add_sRGB_correction(struct wined3d_shader_buffer *buffer, const char *fragcolor,
3108         const char *tmp1, const char *tmp2, const char *tmp3, const char *tmp4, BOOL condcode)
3109 {
3110     /* Perform sRGB write correction. See GLX_EXT_framebuffer_sRGB */
3111
3112     if(condcode)
3113     {
3114         /* Sigh. MOVC CC doesn't work, so use one of the temps as dummy dest */
3115         shader_addline(buffer, "SUBC %s, %s.x, srgb_consts1.y;\n", tmp1, fragcolor);
3116         /* Calculate the > 0.0031308 case */
3117         shader_addline(buffer, "POW %s.x (GE), %s.x, srgb_consts1.z;\n", fragcolor, fragcolor);
3118         shader_addline(buffer, "POW %s.y (GE), %s.y, srgb_consts1.z;\n", fragcolor, fragcolor);
3119         shader_addline(buffer, "POW %s.z (GE), %s.z, srgb_consts1.z;\n", fragcolor, fragcolor);
3120         shader_addline(buffer, "MUL %s.xyz (GE), %s, srgb_consts1.w;\n", fragcolor, fragcolor);
3121         shader_addline(buffer, "SUB %s.xyz (GE), %s, srgb_consts2.x;\n", fragcolor, fragcolor);
3122         /* Calculate the < case */
3123         shader_addline(buffer, "MUL %s.xyz (LT), srgb_consts1.x, %s;\n", fragcolor, fragcolor);
3124     }
3125     else
3126     {
3127         /* Calculate the > 0.0031308 case */
3128         shader_addline(buffer, "POW %s.x, %s.x, srgb_consts1.z;\n", tmp1, fragcolor);
3129         shader_addline(buffer, "POW %s.y, %s.y, srgb_consts1.z;\n", tmp1, fragcolor);
3130         shader_addline(buffer, "POW %s.z, %s.z, srgb_consts1.z;\n", tmp1, fragcolor);
3131         shader_addline(buffer, "MUL %s, %s, srgb_consts1.w;\n", tmp1, tmp1);
3132         shader_addline(buffer, "SUB %s, %s, srgb_consts2.x;\n", tmp1, tmp1);
3133         /* Calculate the < case */
3134         shader_addline(buffer, "MUL %s, srgb_consts1.x, %s;\n", tmp2, fragcolor);
3135         /* Get 1.0 / 0.0 masks for > 0.0031308 and < 0.0031308 */
3136         shader_addline(buffer, "SLT %s, srgb_consts1.y, %s;\n", tmp3, fragcolor);
3137         shader_addline(buffer, "SGE %s, srgb_consts1.y, %s;\n", tmp4, fragcolor);
3138         /* Store the components > 0.0031308 in the destination */
3139         shader_addline(buffer, "MUL %s.xyz, %s, %s;\n", fragcolor, tmp1, tmp3);
3140         /* Add the components that are < 0.0031308 */
3141         shader_addline(buffer, "MAD %s.xyz, %s, %s, %s;\n", fragcolor, tmp2, tmp4, fragcolor);
3142         /* Move everything into result.color at once. Nvidia hardware cannot handle partial
3143         * result.color writes(.rgb first, then .a), or handle overwriting already written
3144         * components. The assembler uses a temporary register in this case, which is usually
3145         * not allocated from one of our registers that were used earlier.
3146         */
3147     }
3148     /* [0.0;1.0] clamping. Not needed, this is done implicitly */
3149 }
3150
3151 static const DWORD *find_loop_control_values(IWineD3DBaseShaderImpl *This, DWORD idx)
3152 {
3153     const local_constant *constant;
3154
3155     LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsI, local_constant, entry)
3156     {
3157         if (constant->idx == idx)
3158         {
3159             return constant->value;
3160         }
3161     }
3162     return NULL;
3163 }
3164
3165 static void init_ps_input(const IWineD3DPixelShaderImpl *This, const struct arb_ps_compile_args *args,
3166                           struct shader_arb_ctx_priv *priv)
3167 {
3168     const char *texcoords[8] =
3169     {
3170         "fragment.texcoord[0]", "fragment.texcoord[1]", "fragment.texcoord[2]", "fragment.texcoord[3]",
3171         "fragment.texcoord[4]", "fragment.texcoord[5]", "fragment.texcoord[6]", "fragment.texcoord[7]"
3172     };
3173     unsigned int i;
3174     const struct wined3d_shader_signature_element *sig = This->input_signature;
3175     const char *semantic_name;
3176     DWORD semantic_idx;
3177
3178     switch(args->super.vp_mode)
3179     {
3180         case pretransformed:
3181         case fixedfunction:
3182             /* The pixelshader has to collect the varyings on its own. In any case properly load
3183              * color0 and color1. In the case of pretransformed vertices also load texcoords. Set
3184              * other attribs to 0.0.
3185              *
3186              * For fixedfunction this behavior is correct, according to the tests. For pretransformed
3187              * we'd either need a replacement shader that can load other attribs like BINORMAL, or
3188              * load the texcoord attrib pointers to match the pixel shader signature
3189              */
3190             for(i = 0; i < MAX_REG_INPUT; i++)
3191             {
3192                 semantic_name = sig[i].semantic_name;
3193                 semantic_idx = sig[i].semantic_idx;
3194                 if(semantic_name == NULL) continue;
3195
3196                 if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3197                 {
3198                     if(semantic_idx == 0) priv->ps_input[i] = "fragment.color.primary";
3199                     else if(semantic_idx == 1) priv->ps_input[i] = "fragment.color.secondary";
3200                     else priv->ps_input[i] = "0.0";
3201                 }
3202                 else if(args->super.vp_mode == fixedfunction)
3203                 {
3204                     priv->ps_input[i] = "0.0";
3205                 }
3206                 else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3207                 {
3208                     if(semantic_idx < 8) priv->ps_input[i] = texcoords[semantic_idx];
3209                     else priv->ps_input[i] = "0.0";
3210                 }
3211                 else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3212                 {
3213                     if(semantic_idx == 0) priv->ps_input[i] = "fragment.fogcoord";
3214                     else priv->ps_input[i] = "0.0";
3215                 }
3216                 else
3217                 {
3218                     priv->ps_input[i] = "0.0";
3219                 }
3220
3221                 TRACE("v%u, semantic %s%u is %s\n", i, semantic_name, semantic_idx, priv->ps_input[i]);
3222             }
3223             break;
3224
3225         case vertexshader:
3226             /* That one is easy. The vertex shaders provide v0-v7 in fragment.texcoord and v8 and v9 in
3227              * fragment.color
3228              */
3229             for(i = 0; i < 8; i++)
3230             {
3231                 priv->ps_input[i] = texcoords[i];
3232             }
3233             priv->ps_input[8] = "fragment.color.primary";
3234             priv->ps_input[9] = "fragment.color.secondary";
3235             break;
3236     }
3237 }
3238
3239 /* GL locking is done by the caller */
/* Generates the ARB fragment program text for a D3D pixel shader into 'buffer',
 * uploads it to GL and returns the GL program id, or 0 if the GL assembler
 * reported an error.
 *
 * This:     pixel shader to translate; its register maps and byte code are read.
 * buffer:   text buffer the program is written to.
 * args:     compile arguments (sRGB correction, fog mode, NP2 fixup mask, clip
 *           plane emulation, vertex processing mode).
 * compiled: receives the locations of the shader dependent constants (bump env
 *           matrices, luminance parameters, integer constants, ycorrection and
 *           the NP2 fixup info).
 *
 * The new program object is left bound to GL_FRAGMENT_PROGRAM_ARB. The map
 * returned by local_const_mapping() is released with HeapFree() before
 * returning.
 */
3240 static GLuint shader_arb_generate_pshader(IWineD3DPixelShaderImpl *This, struct wined3d_shader_buffer *buffer,
3241         const struct arb_ps_compile_args *args, struct arb_ps_compiled_shader *compiled)
3242 {
3243     const shader_reg_maps* reg_maps = &This->baseShader.reg_maps;
3244     CONST DWORD *function = This->baseShader.function;
3245     const struct wined3d_gl_info *gl_info = &((IWineD3DDeviceImpl *)This->baseShader.device)->adapter->gl_info;
3246     const local_constant *lconst;
3247     GLuint retval;
3248     char fragcolor[16];
3249     DWORD *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This), next_local, cur;
3250     struct shader_arb_ctx_priv priv_ctx;
         /* NOTE(review): dcl_tmp is assigned here and in the switch below, but it is not
          * read anywhere in this function - confirm whether it can be removed. */
3251     BOOL dcl_tmp = args->super.srgb_correction, dcl_td = FALSE;
3252     BOOL want_nv_prog = FALSE;
3253     struct arb_pshader_private *shader_priv = This->baseShader.backend_data;
3254     GLint errPos;
3255     DWORD map;
3256
         /* Names of the four scratch registers handed to arbfp_add_sRGB_correction() */
3257     char srgbtmp[4][4];
3258     unsigned int i, found = 0;
3259
         /* Collect up to four of the shader's own temporaries (R<i>) as scratch registers.
          * Skipped are registers the shader does not use, the color0 register when
          * color0_mov is set, and R0 for shader versions below 2. */
3260     for (i = 0, map = reg_maps->temporary; map; map >>= 1, ++i)
3261     {
3262         if (!(map & 1)
3263                 || (This->color0_mov && i == This->color0_reg)
3264                 || (reg_maps->shader_version.major < 2 && i == 0))
3265             continue;
3266
3267         sprintf(srgbtmp[found], "R%u", i);
3268         ++found;
3269         if (found == 4) break;
3270     }
3271
         /* Fill the remaining slots with the helper temporaries TA, TB, TC and TD. TD is
          * only needed, and therefore only declared, when no shader temporary was found. */
3272     switch(found) {
3273         case 4: dcl_tmp = FALSE; break;
3274         case 0:
3275             sprintf(srgbtmp[0], "TA");
3276             sprintf(srgbtmp[1], "TB");
3277             sprintf(srgbtmp[2], "TC");
3278             sprintf(srgbtmp[3], "TD");
3279             dcl_td = TRUE;
3280             break;
3281         case 1:
3282             sprintf(srgbtmp[1], "TA");
3283             sprintf(srgbtmp[2], "TB");
3284             sprintf(srgbtmp[3], "TC");
3285             break;
3286         case 2:
3287             sprintf(srgbtmp[2], "TA");
3288             sprintf(srgbtmp[3], "TB");
3289             break;
3290         case 3:
3291             sprintf(srgbtmp[3], "TA");
3292             break;
3293     }
3294
3295     /*  Create the hw ARB shader */
3296     memset(&priv_ctx, 0, sizeof(priv_ctx));
3297     priv_ctx.cur_ps_args = args;
3298     priv_ctx.compiled_fprog = compiled;
3299     priv_ctx.cur_np2fixup_info = &compiled->np2fixup_info;
3300     init_ps_input(This, args, &priv_ctx);
3301     list_init(&priv_ctx.control_frames);
3302
3303     /* Avoid enabling NV_fragment_program* if we do not need it.
3304      *
3305      * Enabling GL_NV_fragment_program_option causes the driver to occupy a temporary register,
3306      * and it slows down the shader execution noticeably(about 5%). Usually our instruction emulation
3307      * is faster than what we gain from using higher native instructions. There are some things though
3308      * that cannot be emulated. In that case enable the extensions.
3309      * If the extension is enabled, instruction handlers that support both ways will use it.
3310      *
3311      * Testing shows no performance difference between OPTION NV_fragment_program2 and NV_fragment_program.
3312      * So enable the best we can get.
3313      */
3314     if(reg_maps->usesdsx || reg_maps->usesdsy || reg_maps->loop_depth > 0 || reg_maps->usestexldd ||
3315        reg_maps->usestexldl || reg_maps->usesfacing || reg_maps->usesifc || reg_maps->usescall)
3316     {
3317         want_nv_prog = TRUE;
3318     }
3319
         /* Program header: pick the target version and emit the matching OPTION line */
3320     shader_addline(buffer, "!!ARBfp1.0\n");
3321     if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM2)) {
3322         shader_addline(buffer, "OPTION NV_fragment_program2;\n");
3323         priv_ctx.target_version = NV3;
3324     } else if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION)) {
3325         shader_addline(buffer, "OPTION NV_fragment_program;\n");
3326         priv_ctx.target_version = NV2;
3327     } else {
3328         if(want_nv_prog)
3329         {
3330             /* This is an error - either we're advertising the wrong shader version, or aren't enforcing some
3331              * limits properly
3332              */
3333             ERR("The shader requires instructions that are not available in plain GL_ARB_fragment_program\n");
3334             ERR("Try GLSL\n");
3335         }
3336         priv_ctx.target_version = ARB;
3337     }
3338
3339     if(This->baseShader.reg_maps.highest_render_target > 0)
3340     {
3341         shader_addline(buffer, "OPTION ARB_draw_buffers;\n");
3342     }
3343
         /* The fog OPTION lines are only emitted for shader versions below 3 */
3344     if (reg_maps->shader_version.major < 3)
3345     {
3346         switch(args->super.fog) {
3347             case FOG_OFF:
3348                 break;
3349             case FOG_LINEAR:
3350                 shader_addline(buffer, "OPTION ARB_fog_linear;\n");
3351                 break;
3352             case FOG_EXP:
3353                 shader_addline(buffer, "OPTION ARB_fog_exp;\n");
3354                 break;
3355             case FOG_EXP2:
3356                 shader_addline(buffer, "OPTION ARB_fog_exp2;\n");
3357                 break;
3358         }
3359     }
3360
3361     /* For now always declare the temps. At least the Nvidia assembler optimizes completely
3362      * unused temps away(but occupies them for the whole shader if they're used once). Always
3363      * declaring them avoids tricky bookkeeping work
3364      */
3365     shader_addline(buffer, "TEMP TA;\n");      /* Used for modifiers */
3366     shader_addline(buffer, "TEMP TB;\n");      /* Used for modifiers */
3367     shader_addline(buffer, "TEMP TC;\n");      /* Used for modifiers */
3368     if(dcl_td) shader_addline(buffer, "TEMP TD;\n"); /* Used for sRGB writing */
3369     shader_addline(buffer, "PARAM coefdiv = { 0.5, 0.25, 0.125, 0.0625 };\n");
3370     shader_addline(buffer, "PARAM coefmul = { 2, 4, 8, 16 };\n");
3371     shader_addline(buffer, "PARAM one = { 1.0, 1.0, 1.0, 1.0 };\n");
3372
         /* Choose the register the color ends up in. Shader versions below 2 use R0.
          * Otherwise result.color is used directly, unless sRGB correction has to
          * post-process the value: then it is either the register named by color0_reg
          * or a newly declared TMP_COLOR. Anything other than result.color is copied to
          * result.color at the end of the program. */
3373     if (reg_maps->shader_version.major < 2)
3374     {
3375         strcpy(fragcolor, "R0");
3376     } else {
3377         if(args->super.srgb_correction) {
3378             if(This->color0_mov) {
3379                 sprintf(fragcolor, "R%u", This->color0_reg);
3380             } else {
3381                 shader_addline(buffer, "TEMP TMP_COLOR;\n");
3382                 strcpy(fragcolor, "TMP_COLOR");
3383             }
3384         } else {
3385             strcpy(fragcolor, "result.color");
3386         }
3387     }
3388
3389     if(args->super.srgb_correction) {
3390         shader_addline(buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
3391                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
3392         shader_addline(buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
3393                        srgb_sub_high, 0.0, 0.0, 0.0);
3394     }
3395
3396     /* Base Declarations */
         /* The return value is used below as the first free program.local index */
3397     next_local = shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION,
3398             lconst_map, NULL, &priv_ctx);
3399
3400     for (i = 0, map = reg_maps->bumpmat; map; map >>= 1, ++i)
3401     {
3402         if (!(map & 1)) continue;
3403
3404         cur = compiled->numbumpenvmatconsts;
3405         compiled->bumpenvmatconst[cur].const_num = WINED3D_CONST_NUM_UNUSED;
3406         compiled->bumpenvmatconst[cur].texunit = i;
3407         compiled->luminanceconst[cur].const_num = WINED3D_CONST_NUM_UNUSED;
3408         compiled->luminanceconst[cur].texunit = i;
3409
3410         /* We can fit the constants into the constant limit for sure because texbem, texbeml, bem and beml are only supported
3411          * in 1.x shaders, and GL_ARB_fragment_program has a constant limit of 24 constants. So in the worst case we're loading
3412          * 8 shader constants, 8 bump matrices and 8 luminance parameters and are perfectly fine. (No NP2 fixup on bumpmapped
3413          * textures due to conditional NP2 restrictions)
3414          *
3415          * Use local constants to load the bump env parameters, not program.env. This avoids collisions with d3d constants of
3416          * shaders in newer shader models. Since the bump env parameters have to share their space with NP2 fixup constants,
3417          * their location is shader dependent anyway and they cannot be loaded globally.
3418          */
3419         compiled->bumpenvmatconst[cur].const_num = next_local++;
3420         shader_addline(buffer, "PARAM bumpenvmat%d = program.local[%d];\n",
3421                        i, compiled->bumpenvmatconst[cur].const_num);
3422         compiled->numbumpenvmatconsts = cur + 1;
3423
3424         if (!(reg_maps->luminanceparams & (1 << i))) continue;
3425
3426         compiled->luminanceconst[cur].const_num = next_local++;
3427         shader_addline(buffer, "PARAM luminance%d = program.local[%d];\n",
3428                        i, compiled->luminanceconst[cur].const_num);
3429     }
3430
         /* Integer constants are only declared for the NV targets. A constant with known
          * loop control values is written as a literal; any other one is bound to a
          * program.local slot whose index is recorded in compiled->int_consts[]. */
3431     for(i = 0; i < MAX_CONST_I; i++)
3432     {
3433         compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
3434         if (reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
3435         {
3436             const DWORD *control_values = find_loop_control_values((IWineD3DBaseShaderImpl *) This, i);
3437
3438             if(control_values)
3439             {
3440                 shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
3441                                 control_values[0], control_values[1], control_values[2]);
3442             }
3443             else
3444             {
3445                 compiled->int_consts[i] = next_local;
3446                 compiled->num_int_consts++;
3447                 shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
3448             }
3449         }
3450     }
3451
         /* Shaders that read vpos or use dsy get a ycorrection constant; its program.local
          * index is recorded in compiled->ycorrection. */
3452     if(reg_maps->vpos || reg_maps->usesdsy)
3453     {
3454         compiled->ycorrection = next_local;
3455         shader_addline(buffer, "PARAM ycorrection = program.local[%u];\n", next_local++);
3456
3457         if(reg_maps->vpos)
3458         {
3459             shader_addline(buffer, "TEMP vpos;\n");
3460             /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
3461              * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
3462              * ycorrection.z: 1.0
3463              * ycorrection.w: 0.0
3464              */
3465             shader_addline(buffer, "MAD vpos, fragment.position, ycorrection.zyww, ycorrection.wxww;\n");
3466             shader_addline(buffer, "FLR vpos.xy, vpos;\n");
3467         }
3468     }
3469     else
3470     {
3471         compiled->ycorrection = WINED3D_CONST_NUM_UNUSED;
3472     }
3473
3474     /* Load constants to fixup NP2 texcoords if there are still free constants left:
3475      * Constants (texture dimensions) for the NP2 fixup are loaded as local program parameters. This will consume
3476      * at most 8 (MAX_FRAGMENT_SAMPLERS / 2) parameters, which is highly unlikely, since the application had to
3477      * use 16 NP2 textures at the same time. In case that we run out of constants the fixup is simply not
3478      * applied / activated. This will probably result in wrong rendering of the texture, but will save us from
3479      * shader compilation errors and the subsequent errors when drawing with this shader. */
         /* NOTE(review): the comment above and the max_lconsts limit below speak of local
          * program parameters, while the PARAM statement binds program.env[] - confirm
          * against the code that uploads the NP2 fixup constants which one is intended. */
3480     if (priv_ctx.cur_ps_args->super.np2_fixup) {
3481
3482         struct arb_ps_np2fixup_info* const fixup = priv_ctx.cur_np2fixup_info;
             /* Note: this 'map' shadows the function level 'map' inside this block */
3483         const WORD map = priv_ctx.cur_ps_args->super.np2_fixup;
3484         const UINT max_lconsts = gl_info->ps_arb_max_local_constants;
3485
3486         fixup->offset = next_local;
3487         fixup->super.active = 0;
3488
             /* Two samplers share one constant, hence the (cur >> 1) */
3489         cur = 0;
3490         for (i = 0; i < MAX_FRAGMENT_SAMPLERS; ++i) {
3491             if (!(map & (1 << i))) continue;
3492
3493             if (fixup->offset + (cur >> 1) < max_lconsts) {
3494                 fixup->super.active |= (1 << i);
3495                 fixup->super.idx[i] = cur++;
3496             } else {
3497                 FIXME("No free constant found to load NP2 fixup data into shader. "
3498                       "Sampling from this texture will probably look wrong.\n");
3499                 break;
3500             }
3501         }
3502
3503         fixup->super.num_consts = (cur + 1) >> 1;
3504         if (fixup->super.num_consts) {
3505             shader_addline(buffer, "PARAM np2fixup[%u] = { program.env[%u..%u] };\n",
3506                            fixup->super.num_consts, fixup->offset, fixup->super.num_consts + fixup->offset - 1);
3507         }
3508
3509         next_local += fixup->super.num_consts;
3510     }
3511
         /* Clip plane emulation: emit a KIL on the texcoord named by clipplane_emulation */
3512     if (shader_priv->clipplane_emulation != ~0U && args->clip)
3513     {
3514         shader_addline(buffer, "KIL fragment.texcoord[%u];\n", shader_priv->clipplane_emulation);
3515     }
3516
3517     /* Base Shader Body */
3518     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
3519
         /* The condition code variant of the correction is used for the NV targets */
3520     if(args->super.srgb_correction) {
3521         arbfp_add_sRGB_correction(buffer, fragcolor, srgbtmp[0], srgbtmp[1], srgbtmp[2], srgbtmp[3],
3522                                   priv_ctx.target_version >= NV2);
3523     }
3524
         /* Copy the color to result.color unless it was written there directly */
3525     if(strcmp(fragcolor, "result.color")) {
3526         shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
3527     }
3528     shader_addline(buffer, "END\n");
3529
3530     /* TODO: change to resource.glObjectHandle or something like that */
3531     GL_EXTCALL(glGenProgramsARB(1, &retval));
3532
3533     TRACE("Creating a hw pixel shader, prg=%d\n", retval);
3534     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, retval));
3535
3536     TRACE("Created hw pixel shader, prg=%d\n", retval);
3537     /* Create the program and check for errors */
3538     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3539                buffer->bsize, buffer->buffer));
3540     checkGLcall("glProgramStringARB()");
3541
3542     glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
3543     if (errPos != -1)
3544     {
3545         FIXME("HW PixelShader Error at position %d: %s\n",
3546               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
             /* NOTE(review): the program object generated above is not deleted on this
              * path and the immediate constants below are still uploaded - verify that
              * this is intended. */
3547         retval = 0;
3548     }
3549     else
3550     {
3551         GLint native;
3552
3553         GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
3554         checkGLcall("glGetProgramivARB()");
3555         if (!native) WARN("Program exceeds native resource limits.\n");
3556     }
3557
3558     /* Load immediate constants */
         /* Each entry of the shader's constantsF list is uploaded to the program.local
          * slot that lconst_map assigns to its index; the map is freed afterwards. */
3559     if(lconst_map) {
3560         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
3561             const float *value = (const float *)lconst->value;
3562             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, lconst_map[lconst->idx], value));
3563             checkGLcall("glProgramLocalParameter4fvARB");
3564         }
3565         HeapFree(GetProcessHeap(), 0, lconst_map);
3566     }
3567
3568     return retval;
3569 }
3570
3571 static int compare_sig(const struct wined3d_shader_signature_element *sig1, const struct wined3d_shader_signature_element *sig2)
3572 {
3573     unsigned int i;
3574     int ret;
3575
3576     for(i = 0; i < MAX_REG_INPUT; i++)
3577     {
3578         if(sig1[i].semantic_name == NULL || sig2[i].semantic_name == NULL)
3579         {
3580             /* Compare pointers, not contents. One string is NULL(element does not exist), the other one is not NULL */
3581             if(sig1[i].semantic_name != sig2[i].semantic_name) return sig1[i].semantic_name < sig2[i].semantic_name ? -1 : 1;
3582             continue;
3583         }
3584
3585         ret = strcmp(sig1[i].semantic_name, sig2[i].semantic_name);
3586         if(ret != 0) return ret;
3587         if(sig1[i].semantic_idx    != sig2[i].semantic_idx)    return sig1[i].semantic_idx    < sig2[i].semantic_idx    ? -1 : 1;
3588         if(sig1[i].sysval_semantic != sig2[i].sysval_semantic) return sig1[i].sysval_semantic < sig2[i].sysval_semantic ? -1 : 1;
3589         if(sig1[i].component_type  != sig2[i].component_type)  return sig1[i].sysval_semantic < sig2[i].component_type  ? -1 : 1;
3590         if(sig1[i].register_idx    != sig2[i].register_idx)    return sig1[i].register_idx    < sig2[i].register_idx    ? -1 : 1;
3591         if(sig1[i].mask            != sig2->mask)              return sig1[i].mask            < sig2[i].mask            ? -1 : 1;
3592     }
3593     return 0;
3594 }
3595
3596 static struct wined3d_shader_signature_element *clone_sig(const struct wined3d_shader_signature_element *sig)
3597 {
3598     struct wined3d_shader_signature_element *new;
3599     int i;
3600     char *name;
3601
3602     new = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*new) * MAX_REG_INPUT);
3603     for(i = 0; i < MAX_REG_INPUT; i++)
3604     {
3605         if(sig[i].semantic_name == NULL)
3606         {
3607             continue;
3608         }
3609
3610         new[i] = sig[i];
3611         /* Clone the semantic string */
3612         name = HeapAlloc(GetProcessHeap(), 0, strlen(sig[i].semantic_name) + 1);
3613         strcpy(name, sig[i].semantic_name);
3614         new[i].semantic_name = name;
3615     }
3616     return new;
3617 }
3618
3619 static DWORD find_input_signature(struct shader_arb_priv *priv, const struct wined3d_shader_signature_element *sig)
3620 {
3621     struct wine_rb_entry *entry = wine_rb_get(&priv->signature_tree, sig);
3622     struct ps_signature *found_sig;
3623
3624     if(entry != NULL)
3625     {
3626         found_sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
3627         TRACE("Found existing signature %u\n", found_sig->idx);
3628         return found_sig->idx;
3629     }
3630     found_sig = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*sig));
3631     found_sig->sig = clone_sig(sig);
3632     found_sig->idx = priv->ps_sig_number++;
3633     TRACE("New signature stored and assigned number %u\n", found_sig->idx);
3634     if(wine_rb_put(&priv->signature_tree, sig, &found_sig->entry) == -1)
3635     {
3636         ERR("Failed to insert program entry.\n");
3637     }
3638     return found_sig->idx;
3639 }
3640
3641 static void init_output_registers(IWineD3DVertexShaderImpl *shader, DWORD sig_num, struct shader_arb_ctx_priv *priv_ctx,
3642                                   struct arb_vs_compiled_shader *compiled)
3643 {
3644     unsigned int i, j;
3645     static const char *texcoords[8] =
3646     {
3647         "result.texcoord[0]", "result.texcoord[1]", "result.texcoord[2]", "result.texcoord[3]",
3648         "result.texcoord[4]", "result.texcoord[5]", "result.texcoord[6]", "result.texcoord[7]"
3649     };
3650     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) shader->baseShader.device;
3651     const struct wined3d_shader_signature_element *sig;
3652     const char *semantic_name;
3653     DWORD semantic_idx, reg_idx;
3654
3655     /* Write generic input varyings 0 to 7 to result.texcoord[], varying 8 to result.color.primary
3656      * and varying 9 to result.color.secondary
3657      */
3658     const char *decl_idx_to_string[MAX_REG_INPUT] =
3659     {
3660         texcoords[0], texcoords[1], texcoords[2], texcoords[3],
3661         texcoords[4], texcoords[5], texcoords[6], texcoords[7],
3662         "result.color.primary", "result.color.secondary"
3663     };
3664
3665     if(sig_num == ~0)
3666     {
3667         TRACE("Pixel shader uses builtin varyings\n");
3668         /* Map builtins to builtins */
3669         for(i = 0; i < 8; i++)
3670         {
3671             priv_ctx->texcrd_output[i] = texcoords[i];
3672         }
3673         priv_ctx->color_output[0] = "result.color.primary";
3674         priv_ctx->color_output[1] = "result.color.secondary";
3675         priv_ctx->fog_output = "result.fogcoord";
3676
3677         /* Map declared regs to builtins. Use "TA" to /dev/null unread output */
3678         for(i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); i++)
3679         {
3680             semantic_name = shader->output_signature[i].semantic_name;
3681             if(semantic_name == NULL) continue;
3682
3683             if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_POSITION))
3684             {
3685                 TRACE("o%u is TMP_OUT\n", i);
3686                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "TMP_OUT";
3687                 else priv_ctx->vs_output[i] = "TA";
3688             }
3689             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_PSIZE))
3690             {
3691                 TRACE("o%u is result.pointsize\n", i);
3692                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "result.pointsize";
3693                 else priv_ctx->vs_output[i] = "TA";
3694             }
3695             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3696             {
3697                 TRACE("o%u is result.color.?, idx %u\n", i, shader->output_signature[i].semantic_idx);
3698                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "result.color.primary";
3699                 else if(shader->output_signature[i].semantic_idx == 1) priv_ctx->vs_output[i] = "result.color.secondary";
3700                 else priv_ctx->vs_output[i] = "TA";
3701             }
3702             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3703             {
3704                 TRACE("o%u is %s\n", i, texcoords[shader->output_signature[i].semantic_idx]);
3705                 if(shader->output_signature[i].semantic_idx >= 8) priv_ctx->vs_output[i] = "TA";
3706                 else priv_ctx->vs_output[i] = texcoords[shader->output_signature[i].semantic_idx];
3707             }
3708             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3709             {
3710                 TRACE("o%u is result.fogcoord\n", i);
3711                 if(shader->output_signature[i].semantic_idx > 0) priv_ctx->vs_output[i] = "TA";
3712                 else priv_ctx->vs_output[i] = "result.fogcoord";
3713             }
3714             else
3715             {
3716                 priv_ctx->vs_output[i] = "TA";
3717             }
3718         }
3719         return;
3720     }
3721
3722     /* Instead of searching for the signature in the signature list, read the one from the current pixel shader.
3723      * Its maybe not the shader where the signature came from, but it is the same signature and faster to find
3724      */
3725     sig = ((IWineD3DPixelShaderImpl *)device->stateBlock->pixelShader)->input_signature;
3726     TRACE("Pixel shader uses declared varyings\n");
3727
3728     /* Map builtin to declared. /dev/null the results by default to the TA temp reg */
3729     for(i = 0; i < 8; i++)
3730     {
3731         priv_ctx->texcrd_output[i] = "TA";
3732     }
3733     priv_ctx->color_output[0] = "TA";
3734     priv_ctx->color_output[1] = "TA";
3735     priv_ctx->fog_output = "TA";
3736
3737     for(i = 0; i < MAX_REG_INPUT; i++)
3738     {
3739         semantic_name = sig[i].semantic_name;
3740         semantic_idx = sig[i].semantic_idx;
3741         reg_idx = sig[i].register_idx;
3742         if(semantic_name == NULL) continue;
3743
3744         /* If a declared input register is not written by builtin arguments, don't write to it.
3745          * GL_NV_vertex_program makes sure the input defaults to 0.0, which is correct with D3D
3746          *
3747          * Don't care about POSITION and PSIZE here - this is a builtin vertex shader, position goes
3748          * to TMP_OUT in any case
3749          */
3750         if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3751         {
3752             if(semantic_idx < 8) priv_ctx->texcrd_output[semantic_idx] = decl_idx_to_string[reg_idx];
3753         }
3754         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3755         {
3756             if(semantic_idx < 2) priv_ctx->color_output[semantic_idx] = decl_idx_to_string[reg_idx];
3757         }
3758         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3759         {
3760             if(semantic_idx == 0) priv_ctx->fog_output = decl_idx_to_string[reg_idx];
3761         }
3762         else
3763         {
3764             continue;
3765         }
3766
3767         if(strcmp(decl_idx_to_string[reg_idx], "result.color.primary") == 0 ||
3768            strcmp(decl_idx_to_string[reg_idx], "result.color.secondary") == 0)
3769         {
3770             compiled->need_color_unclamp = TRUE;
3771         }
3772     }
3773
3774     /* Map declared to declared */
3775     for(i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); i++)
3776     {
3777         /* Write unread output to TA to throw them away */
3778         priv_ctx->vs_output[i] = "TA";
3779         semantic_name = shader->output_signature[i].semantic_name;
3780         if(semantic_name == NULL)
3781         {
3782             continue;
3783         }
3784
3785         if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_POSITION) &&
3786            shader->output_signature[i].semantic_idx == 0)
3787         {
3788             priv_ctx->vs_output[i] = "TMP_OUT";
3789             continue;
3790         }
3791         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_PSIZE) &&
3792            shader->output_signature[i].semantic_idx == 0)
3793         {
3794             priv_ctx->vs_output[i] = "result.pointsize";
3795             continue;
3796         }
3797
3798         for(j = 0; j < MAX_REG_INPUT; j++)
3799         {
3800             if(sig[j].semantic_name == NULL)
3801             {
3802                 continue;
3803             }
3804
3805             if(strcmp(sig[j].semantic_name, semantic_name) == 0 &&
3806                sig[j].semantic_idx == shader->output_signature[i].semantic_idx)
3807             {
3808                 priv_ctx->vs_output[i] = decl_idx_to_string[sig[j].register_idx];
3809
3810                 if(strcmp(priv_ctx->vs_output[i], "result.color.primary") == 0 ||
3811                    strcmp(priv_ctx->vs_output[i], "result.color.secondary") == 0)
3812                 {
3813                     compiled->need_color_unclamp = TRUE;
3814                 }
3815             }
3816         }
3817     }
3818 }
3819
3820 /* GL locking is done by the caller */
3821 static GLuint shader_arb_generate_vshader(IWineD3DVertexShaderImpl *This, struct wined3d_shader_buffer *buffer,
3822         const struct arb_vs_compile_args *args, struct arb_vs_compiled_shader *compiled)
3823 {
3824     const shader_reg_maps *reg_maps = &This->baseShader.reg_maps;
3825     CONST DWORD *function = This->baseShader.function;
3826     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)This->baseShader.device;
3827     const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
3828     const local_constant *lconst;
3829     GLuint ret;
3830     DWORD next_local, *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This);
3831     struct shader_arb_ctx_priv priv_ctx;
3832     unsigned int i;
3833     GLint errPos;
3834
3835     memset(&priv_ctx, 0, sizeof(priv_ctx));
3836     priv_ctx.cur_vs_args = args;
3837     list_init(&priv_ctx.control_frames);
3838     init_output_registers(This, args->ps_signature, &priv_ctx, compiled);
3839
3840     /*  Create the hw ARB shader */
3841     shader_addline(buffer, "!!ARBvp1.0\n");
3842
3843     /* Always enable the NV extension if available. Unlike fragment shaders, there is no
3844      * mesurable performance penalty, and we can always make use of it for clipplanes.
3845      */
3846     if(GL_SUPPORT(NV_VERTEX_PROGRAM3)) {
3847         shader_addline(buffer, "OPTION NV_vertex_program3;\n");
3848         priv_ctx.target_version = NV3;
3849         shader_addline(buffer, "ADDRESS aL;\n");
3850     } else if(GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) {
3851         shader_addline(buffer, "OPTION NV_vertex_program2;\n");
3852         priv_ctx.target_version = NV2;
3853         shader_addline(buffer, "ADDRESS aL;\n");
3854     } else {
3855         priv_ctx.target_version = ARB;
3856     }
3857
3858     shader_addline(buffer, "TEMP TMP_OUT;\n");
3859     if(need_helper_const(gl_info)) {
3860         shader_addline(buffer, "PARAM helper_const = { 2.0, -1.0, %d.0, 0.0 };\n", This->rel_offset);
3861     }
3862     if(need_mova_const((IWineD3DBaseShader *) This, gl_info)) {
3863         shader_addline(buffer, "PARAM mova_const = { 0.5, 0.0, 2.0, 1.0 };\n");
3864         shader_addline(buffer, "TEMP A0_SHADOW;\n");
3865     }
3866
3867     shader_addline(buffer, "TEMP TA;\n");
3868
3869     /* Base Declarations */
3870     next_local = shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION,
3871             lconst_map, &priv_ctx.vs_clipplanes, &priv_ctx);
3872
3873     for(i = 0; i < MAX_CONST_I; i++)
3874     {
3875         compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
3876         if(reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
3877         {
3878             const DWORD *control_values = find_loop_control_values((IWineD3DBaseShaderImpl *) This, i);
3879
3880             if(control_values)
3881             {
3882                 shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
3883                                 control_values[0], control_values[1], control_values[2]);
3884             }
3885             else
3886             {
3887                 compiled->int_consts[i] = next_local;
3888                 compiled->num_int_consts++;
3889                 shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
3890             }
3891         }
3892     }
3893
3894     /* We need a constant to fixup the final position */
3895     shader_addline(buffer, "PARAM posFixup = program.local[%u];\n", next_local);
3896     compiled->pos_fixup = next_local++;
3897
3898     /* Initialize output parameters. GL_ARB_vertex_program does not require special initialization values
3899      * for output parameters. D3D in theory does not do that either, but some applications depend on a
3900      * proper initialization of the secondary color, and programs using the fixed function pipeline without
3901      * a replacement shader depend on the texcoord.w being set properly.
3902      *
3903      * GL_NV_vertex_program defines that all output values are initialized to {0.0, 0.0, 0.0, 1.0}. This
3904      * assertion is in effect even when using GL_ARB_vertex_program without any NV specific additions. So
3905      * skip this if NV_vertex_program is supported. Otherwise, initialize the secondary color. For the tex-
3906      * coords, we have a flag in the opengl caps. Many cards do not require the texcoord being set, and
3907      * this can eat a number of instructions, so skip it unless this cap is set as well
3908      */
3909     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)) {
3910         shader_addline(buffer, "MOV result.color.secondary, -helper_const.wwwy;\n");
3911
3912         if ((GLINFO_LOCATION).quirks & WINED3D_QUIRK_SET_TEXCOORD_W && !device->frag_pipe->ffp_proj_control)
3913         {
3914             int i;
3915             for(i = 0; i < min(8, MAX_REG_TEXCRD); i++) {
3916                 if(This->baseShader.reg_maps.texcoord_mask[i] != 0 &&
3917                 This->baseShader.reg_maps.texcoord_mask[i] != WINED3DSP_WRITEMASK_ALL) {
3918                     shader_addline(buffer, "MOV result.texcoord[%u].w, -helper_const.y;\n", i);
3919                 }
3920             }
3921         }
3922     }
3923
3924     /* The shader starts with the main function */
3925     priv_ctx.in_main_func = TRUE;
3926     /* Base Shader Body */
3927     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
3928
3929     if(!priv_ctx.footer_written) vshader_add_footer(This, buffer, args, &priv_ctx);
3930
3931     shader_addline(buffer, "END\n");
3932
3933     /* TODO: change to resource.glObjectHandle or something like that */
3934     GL_EXTCALL(glGenProgramsARB(1, &ret));
3935
3936     TRACE("Creating a hw vertex shader, prg=%d\n", ret);
3937     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, ret));
3938
3939     TRACE("Created hw vertex shader, prg=%d\n", ret);
3940     /* Create the program and check for errors */
3941     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3942                buffer->bsize, buffer->buffer));
3943     checkGLcall("glProgramStringARB()");
3944
3945     glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
3946     if (errPos != -1)
3947     {
3948         FIXME("HW VertexShader Error at position %d: %s\n",
3949               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3950         ret = -1;
3951     }
3952     else
3953     {
3954         GLint native;
3955
3956         GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
3957         checkGLcall("glGetProgramivARB()");
3958         if (!native) WARN("Program exceeds native resource limits.\n");
3959
3960         /* Load immediate constants */
3961         if(lconst_map) {
3962             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
3963                 const float *value = (const float *)lconst->value;
3964                 GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, lconst_map[lconst->idx], value));
3965             }
3966         }
3967     }
3968     HeapFree(GetProcessHeap(), 0, lconst_map);
3969
3970     return ret;
3971 }
3972
3973 /* GL locking is done by the caller */
3974 static struct arb_ps_compiled_shader *find_arb_pshader(IWineD3DPixelShaderImpl *shader, const struct arb_ps_compile_args *args)
3975 {
3976     UINT i;
3977     DWORD new_size;
3978     struct arb_ps_compiled_shader *new_array;
3979     struct wined3d_shader_buffer buffer;
3980     struct arb_pshader_private *shader_data;
3981     GLuint ret;
3982
3983     if (!shader->baseShader.backend_data)
3984     {
3985         IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) shader->baseShader.device;
3986         const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
3987         struct shader_arb_priv *priv = device->shader_priv;
3988
3989         shader->baseShader.backend_data = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
3990         shader_data = shader->baseShader.backend_data;
3991         shader_data->clamp_consts = shader->baseShader.reg_maps.shader_version.major == 1;
3992
3993         if(shader->baseShader.reg_maps.shader_version.major < 3) shader_data->input_signature_idx = ~0;
3994         else shader_data->input_signature_idx = find_input_signature(priv, shader->input_signature);
3995
3996         shader_data->has_signature_idx = TRUE;
3997         TRACE("Shader got assigned input signature index %u\n", shader_data->input_signature_idx);
3998
3999         if (!device->vs_clipping)
4000             shader_data->clipplane_emulation = shader_find_free_input_register(&shader->baseShader.reg_maps,
4001                     GL_LIMITS(texture_stages) - 1);
4002         else
4003             shader_data->clipplane_emulation = ~0U;
4004     }
4005     shader_data = shader->baseShader.backend_data;
4006
4007     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
4008      * so a linear search is more performant than a hashmap or a binary search
4009      * (cache coherency etc)
4010      */
4011     for(i = 0; i < shader_data->num_gl_shaders; i++) {
4012         if(memcmp(&shader_data->gl_shaders[i].args, args, sizeof(*args)) == 0) {
4013             return &shader_data->gl_shaders[i];
4014         }
4015     }
4016
4017     TRACE("No matching GL shader found, compiling a new shader\n");
4018     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
4019         if (shader_data->num_gl_shaders)
4020         {
4021             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
4022             new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
4023                                     new_size * sizeof(*shader_data->gl_shaders));
4024         } else {
4025             new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
4026             new_size = 1;
4027         }
4028
4029         if(!new_array) {
4030             ERR("Out of memory\n");
4031             return 0;
4032         }
4033         shader_data->gl_shaders = new_array;
4034         shader_data->shader_array_size = new_size;
4035     }
4036
4037     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
4038
4039     pixelshader_update_samplers(&shader->baseShader.reg_maps,
4040             ((IWineD3DDeviceImpl *)shader->baseShader.device)->stateBlock->textures);
4041
4042     if (!shader_buffer_init(&buffer))
4043     {
4044         ERR("Failed to initialize shader buffer.\n");
4045         return 0;
4046     }
4047
4048     ret = shader_arb_generate_pshader(shader, &buffer, args,
4049                                       &shader_data->gl_shaders[shader_data->num_gl_shaders]);
4050     shader_buffer_free(&buffer);
4051     shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
4052
4053     return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
4054 }
4055
4056 static inline BOOL vs_args_equal(const struct arb_vs_compile_args *stored, const struct arb_vs_compile_args *new,
4057                                  const DWORD use_map, BOOL skip_int) {
4058     if((stored->super.swizzle_map & use_map) != new->super.swizzle_map) return FALSE;
4059     if(stored->super.fog_src != new->super.fog_src) return FALSE;
4060     if(stored->boolclip_compare != new->boolclip_compare) return FALSE;
4061     if(stored->ps_signature != new->ps_signature) return FALSE;
4062     if(stored->vertex_samplers_compare != new->vertex_samplers_compare) return FALSE;
4063     if(skip_int) return TRUE;
4064
4065     return memcmp(stored->loop_ctrl, new->loop_ctrl, sizeof(stored->loop_ctrl)) == 0;
4066 }
4067
4068 static struct arb_vs_compiled_shader *find_arb_vshader(IWineD3DVertexShaderImpl *shader, const struct arb_vs_compile_args *args)
4069 {
4070     UINT i;
4071     DWORD new_size;
4072     struct arb_vs_compiled_shader *new_array;
4073     DWORD use_map = ((IWineD3DDeviceImpl *)shader->baseShader.device)->strided_streams.use_map;
4074     struct wined3d_shader_buffer buffer;
4075     struct arb_vshader_private *shader_data;
4076     GLuint ret;
4077     const struct wined3d_gl_info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
4078
4079     if (!shader->baseShader.backend_data)
4080     {
4081         shader->baseShader.backend_data = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
4082     }
4083     shader_data = shader->baseShader.backend_data;
4084
4085     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
4086      * so a linear search is more performant than a hashmap or a binary search
4087      * (cache coherency etc)
4088      */
4089     for(i = 0; i < shader_data->num_gl_shaders; i++) {
4090         if(vs_args_equal(&shader_data->gl_shaders[i].args, args, use_map, GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION))) {
4091             return &shader_data->gl_shaders[i];
4092         }
4093     }
4094
4095     TRACE("No matching GL shader found, compiling a new shader\n");
4096
4097     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
4098         if (shader_data->num_gl_shaders)
4099         {
4100             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
4101             new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
4102                                     new_size * sizeof(*shader_data->gl_shaders));
4103         } else {
4104             new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
4105             new_size = 1;
4106         }
4107
4108         if(!new_array) {
4109             ERR("Out of memory\n");
4110             return 0;
4111         }
4112         shader_data->gl_shaders = new_array;
4113         shader_data->shader_array_size = new_size;
4114     }
4115
4116     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
4117
4118     if (!shader_buffer_init(&buffer))
4119     {
4120         ERR("Failed to initialize shader buffer.\n");
4121         return 0;
4122     }
4123
4124     ret = shader_arb_generate_vshader(shader, &buffer, args,
4125             &shader_data->gl_shaders[shader_data->num_gl_shaders]);
4126     shader_buffer_free(&buffer);
4127     shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
4128
4129     return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
4130 }
4131
4132 static inline void find_arb_ps_compile_args(IWineD3DPixelShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
4133         struct arb_ps_compile_args *args)
4134 {
4135     int i;
4136     WORD int_skip;
4137     const struct wined3d_gl_info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
4138     find_ps_compile_args(shader, stateblock, &args->super);
4139
4140     /* This forces all local boolean constants to 1 to make them stateblock independent */
4141     args->bools = shader->baseShader.reg_maps.local_bool_consts;
4142
4143     for(i = 0; i < MAX_CONST_B; i++)
4144     {
4145         if(stateblock->pixelShaderConstantB[i]) args->bools |= ( 1 << i);
4146     }
4147
4148     /* Only enable the clip plane emulation KIL if at least one clipplane is enabled. The KIL instruction
4149      * is quite expensive because it forces the driver to disable early Z discards. It is cheaper to
4150      * duplicate the shader than have a no-op KIL instruction in every shader
4151      */
4152     if((!((IWineD3DDeviceImpl *) shader->baseShader.device)->vs_clipping) && use_vs(stateblock) &&
4153        stateblock->renderState[WINED3DRS_CLIPPING] && stateblock->renderState[WINED3DRS_CLIPPLANEENABLE])
4154     {
4155         args->clip = 1;
4156     }
4157     else
4158     {
4159         args->clip = 0;
4160     }
4161
4162     /* Skip if unused or local, or supported natively */
4163     int_skip = ~shader->baseShader.reg_maps.integer_constants | shader->baseShader.reg_maps.local_int_consts;
4164     if(int_skip == 0xffff || GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION))
4165     {
4166         memset(&args->loop_ctrl, 0, sizeof(args->loop_ctrl));
4167         return;
4168     }
4169
4170     for(i = 0; i < MAX_CONST_I; i++)
4171     {
4172         if(int_skip & (1 << i))
4173         {
4174             args->loop_ctrl[i][0] = 0;
4175             args->loop_ctrl[i][1] = 0;
4176             args->loop_ctrl[i][2] = 0;
4177         }
4178         else
4179         {
4180             args->loop_ctrl[i][0] = stateblock->pixelShaderConstantI[i * 4];
4181             args->loop_ctrl[i][1] = stateblock->pixelShaderConstantI[i * 4 + 1];
4182             args->loop_ctrl[i][2] = stateblock->pixelShaderConstantI[i * 4 + 2];
4183         }
4184     }
4185 }
4186
4187 static inline void find_arb_vs_compile_args(IWineD3DVertexShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
4188         struct arb_vs_compile_args *args)
4189 {
4190     int i;
4191     WORD int_skip;
4192     IWineD3DDeviceImpl *dev = (IWineD3DDeviceImpl *)shader->baseShader.device;
4193     const struct wined3d_gl_info *gl_info = &dev->adapter->gl_info;
4194     find_vs_compile_args(shader, stateblock, &args->super);
4195
4196     args->boolclip_compare = 0;
4197     if(use_ps(stateblock))
4198     {
4199         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) stateblock->pixelShader;
4200         struct arb_pshader_private *shader_priv = ps->baseShader.backend_data;
4201         args->ps_signature = shader_priv->input_signature_idx;
4202
4203         args->boolclip.clip_texcoord = shader_priv->clipplane_emulation + 1;
4204     }
4205     else
4206     {
4207         args->ps_signature = ~0;
4208         if(!dev->vs_clipping)
4209         {
4210             args->boolclip.clip_texcoord = ffp_clip_emul(stateblock) ? GL_LIMITS(texture_stages) : 0;
4211         }
4212         /* Otherwise: Setting boolclip_compare set clip_texcoord to 0 */
4213     }
4214
4215     if(args->boolclip.clip_texcoord)
4216     {
4217         if(stateblock->renderState[WINED3DRS_CLIPPING])
4218         {
4219             args->boolclip.clipplane_mask = stateblock->renderState[WINED3DRS_CLIPPLANEENABLE];
4220         }
4221         /* clipplane_mask was set to 0 by setting boolclip_compare to 0 */
4222     }
4223
4224     /* This forces all local boolean constants to 1 to make them stateblock independent */
4225     args->boolclip.bools = shader->baseShader.reg_maps.local_bool_consts;
4226     /* TODO: Figure out if it would be better to store bool constants as bitmasks in the stateblock */
4227     for(i = 0; i < MAX_CONST_B; i++)
4228     {
4229         if(stateblock->vertexShaderConstantB[i]) args->boolclip.bools |= ( 1 << i);
4230     }
4231
4232     args->vertex_samplers[0] = dev->texUnitMap[MAX_FRAGMENT_SAMPLERS + 0];
4233     args->vertex_samplers[1] = dev->texUnitMap[MAX_FRAGMENT_SAMPLERS + 1];
4234     args->vertex_samplers[2] = dev->texUnitMap[MAX_FRAGMENT_SAMPLERS + 2];
4235     args->vertex_samplers[3] = 0;
4236
4237     /* Skip if unused or local */
4238     int_skip = ~shader->baseShader.reg_maps.integer_constants | shader->baseShader.reg_maps.local_int_consts;
4239     if(int_skip == 0xffff || GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) /* This is about flow control, not clipping */
4240     {
4241         memset(&args->loop_ctrl, 0, sizeof(args->loop_ctrl));
4242         return;
4243     }
4244
4245     for(i = 0; i < MAX_CONST_I; i++)
4246     {
4247         if(int_skip & (1 << i))
4248         {
4249             args->loop_ctrl[i][0] = 0;
4250             args->loop_ctrl[i][1] = 0;
4251             args->loop_ctrl[i][2] = 0;
4252         }
4253         else
4254         {
4255             args->loop_ctrl[i][0] = stateblock->vertexShaderConstantI[i * 4];
4256             args->loop_ctrl[i][1] = stateblock->vertexShaderConstantI[i * 4 + 1];
4257             args->loop_ctrl[i][2] = stateblock->vertexShaderConstantI[i * 4 + 2];
4258         }
4259     }
4260 }
4261
4262 /* GL locking is done by the caller */
4263 static void shader_arb_select(const struct wined3d_context *context, BOOL usePS, BOOL useVS)
4264 {
4265     IWineD3DDeviceImpl *This = ((IWineD3DSurfaceImpl *)context->surface)->resource.wineD3DDevice;
4266     struct shader_arb_priv *priv = This->shader_priv;
4267     const struct wined3d_gl_info *gl_info = context->gl_info;
4268     int i;
4269
4270     /* Deal with pixel shaders first so the vertex shader arg function has the input signature ready */
4271     if (usePS) {
4272         struct arb_ps_compile_args compile_args;
4273         struct arb_ps_compiled_shader *compiled;
4274         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) This->stateBlock->pixelShader;
4275
4276         TRACE("Using pixel shader %p\n", This->stateBlock->pixelShader);
4277         find_arb_ps_compile_args(ps, This->stateBlock, &compile_args);
4278         compiled = find_arb_pshader(ps, &compile_args);
4279         priv->current_fprogram_id = compiled->prgId;
4280         priv->compiled_fprog = compiled;
4281
4282         /* Bind the fragment program */
4283         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
4284         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id);");
4285
4286         if(!priv->use_arbfp_fixed_func) {
4287             /* Enable OpenGL fragment programs */
4288             glEnable(GL_FRAGMENT_PROGRAM_ARB);
4289             checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
4290         }
4291         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
4292
4293         /* Pixel Shader 1.x constants are clamped to [-1;1], Pixel Shader 2.0 constants are not. If switching between
4294          * a 1.x and newer shader, reload the first 8 constants
4295          */
4296         if(priv->last_ps_const_clamped != ((struct arb_pshader_private *)ps->baseShader.backend_data)->clamp_consts)
4297         {
4298             priv->last_ps_const_clamped = ((struct arb_pshader_private *)ps->baseShader.backend_data)->clamp_consts;
4299             This->highest_dirty_ps_const = max(This->highest_dirty_ps_const, 8);
4300             for(i = 0; i < 8; i++)
4301             {
4302                 context->pshader_const_dirty[i] = 1;
4303             }
4304             /* Also takes care of loading local constants */
4305             shader_arb_load_constants(context, TRUE, FALSE);
4306         }
4307         else
4308         {
4309             shader_arb_ps_local_constants(This);
4310         }
4311
4312         /* Force constant reloading for the NP2 fixup (see comment in shader_glsl_select for more info) */
4313         if (compiled->np2fixup_info.super.active)
4314             shader_arb_load_np2fixup_constants((IWineD3DDevice *)This, usePS, useVS);
4315     } else if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM) && !priv->use_arbfp_fixed_func) {
4316         /* Disable only if we're not using arbfp fixed function fragment processing. If this is used,
4317         * keep GL_FRAGMENT_PROGRAM_ARB enabled, and the fixed function pipeline will bind the fixed function
4318         * replacement shader
4319         */
4320         glDisable(GL_FRAGMENT_PROGRAM_ARB);
4321         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
4322         priv->current_fprogram_id = 0;
4323     }
4324
4325     if (useVS) {
4326         struct arb_vs_compile_args compile_args;
4327         struct arb_vs_compiled_shader *compiled;
4328         IWineD3DVertexShaderImpl *vs = (IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader;
4329
4330         TRACE("Using vertex shader %p\n", This->stateBlock->vertexShader);
4331         find_arb_vs_compile_args(vs, This->stateBlock, &compile_args);
4332         compiled = find_arb_vshader(vs, &compile_args);
4333         priv->current_vprogram_id = compiled->prgId;
4334         priv->compiled_vprog = compiled;
4335
4336         /* Bind the vertex program */
4337         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
4338         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id);");
4339
4340         /* Enable OpenGL vertex programs */
4341         glEnable(GL_VERTEX_PROGRAM_ARB);
4342         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
4343         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
4344         shader_arb_vs_local_constants(This);
4345
4346         if(priv->last_vs_color_unclamp != compiled->need_color_unclamp) {
4347             priv->last_vs_color_unclamp = compiled->need_color_unclamp;
4348
4349             if (GL_SUPPORT(ARB_COLOR_BUFFER_FLOAT)) {
4350                 GL_EXTCALL(glClampColorARB(GL_CLAMP_VERTEX_COLOR_ARB, !compiled->need_color_unclamp));
4351                 checkGLcall("glClampColorARB");
4352             } else {
4353                 FIXME("vertex color clamp needs to be changed, but extension not supported.\n");
4354             }
4355         }
4356     } else if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
4357         priv->current_vprogram_id = 0;
4358         glDisable(GL_VERTEX_PROGRAM_ARB);
4359         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
4360     }
4361 }
4362
4363 /* GL locking is done by the caller */
4364 static void shader_arb_select_depth_blt(IWineD3DDevice *iface, enum tex_types tex_type) {
4365     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4366     struct shader_arb_priv *priv = This->shader_priv;
4367     GLuint *blt_fprogram = &priv->depth_blt_fprogram_id[tex_type];
4368     const struct wined3d_gl_info *gl_info = &This->adapter->gl_info;
4369
4370     if (!priv->depth_blt_vprogram_id) priv->depth_blt_vprogram_id = create_arb_blt_vertex_program(gl_info);
4371     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->depth_blt_vprogram_id));
4372     glEnable(GL_VERTEX_PROGRAM_ARB);
4373
4374     if (!*blt_fprogram) *blt_fprogram = create_arb_blt_fragment_program(gl_info, tex_type);
4375     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, *blt_fprogram));
4376     glEnable(GL_FRAGMENT_PROGRAM_ARB);
4377 }
4378
4379 /* GL locking is done by the caller */
4380 static void shader_arb_deselect_depth_blt(IWineD3DDevice *iface) {
4381     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4382     struct shader_arb_priv *priv = This->shader_priv;
4383     const struct wined3d_gl_info *gl_info = &This->adapter->gl_info;
4384
4385     if (priv->current_vprogram_id) {
4386         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
4387         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexShader->prgId);");
4388
4389         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
4390     } else {
4391         glDisable(GL_VERTEX_PROGRAM_ARB);
4392         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
4393     }
4394
4395     if (priv->current_fprogram_id) {
4396         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
4397         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, pixelShader->prgId);");
4398
4399         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
4400     } else if(!priv->use_arbfp_fixed_func) {
4401         glDisable(GL_FRAGMENT_PROGRAM_ARB);
4402         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
4403     }
4404 }
4405
4406 static void shader_arb_destroy(IWineD3DBaseShader *iface) {
4407     IWineD3DBaseShaderImpl *baseShader = (IWineD3DBaseShaderImpl *) iface;
4408     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)baseShader->baseShader.device;
4409     const struct wined3d_gl_info *gl_info = &device->adapter->gl_info;
4410
4411     if (shader_is_pshader_version(baseShader->baseShader.reg_maps.shader_version.type))
4412     {
4413         IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *) iface;
4414         struct arb_pshader_private *shader_data = This->baseShader.backend_data;
4415         UINT i;
4416
4417         if(!shader_data) return; /* This can happen if a shader was never compiled */
4418         ENTER_GL();
4419
4420         if(shader_data->num_gl_shaders) ActivateContext(device, NULL, CTXUSAGE_RESOURCELOAD);
4421
4422         for(i = 0; i < shader_data->num_gl_shaders; i++) {
4423             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
4424             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
4425         }
4426         LEAVE_GL();
4427         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
4428         HeapFree(GetProcessHeap(), 0, shader_data);
4429         This->baseShader.backend_data = NULL;
4430     } else {
4431         IWineD3DVertexShaderImpl *This = (IWineD3DVertexShaderImpl *) iface;
4432         struct arb_vshader_private *shader_data = This->baseShader.backend_data;
4433         UINT i;
4434
4435         if(!shader_data) return; /* This can happen if a shader was never compiled */
4436         ENTER_GL();
4437
4438         if(shader_data->num_gl_shaders) ActivateContext(device, NULL, CTXUSAGE_RESOURCELOAD);
4439
4440         for(i = 0; i < shader_data->num_gl_shaders; i++) {
4441             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
4442             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
4443         }
4444         LEAVE_GL();
4445         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
4446         HeapFree(GetProcessHeap(), 0, shader_data);
4447         This->baseShader.backend_data = NULL;
4448     }
4449 }
4450
4451 static int sig_tree_compare(const void *key, const struct wine_rb_entry *entry)
4452 {
4453     struct ps_signature *e = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
4454     return compare_sig(key, e->sig);
4455 }
4456
4457 static const struct wine_rb_functions sig_tree_functions =
4458 {
4459     wined3d_rb_alloc,
4460     wined3d_rb_realloc,
4461     wined3d_rb_free,
4462     sig_tree_compare
4463 };
4464
4465 static HRESULT shader_arb_alloc(IWineD3DDevice *iface) {
4466     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4467     struct shader_arb_priv *priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*priv));
4468     if(wine_rb_init(&priv->signature_tree, &sig_tree_functions) == -1)
4469     {
4470         ERR("RB tree init failed\n");
4471         HeapFree(GetProcessHeap(), 0, priv);
4472         return E_OUTOFMEMORY;
4473     }
4474     This->shader_priv = priv;
4475     return WINED3D_OK;
4476 }
4477
4478 static void release_signature(struct wine_rb_entry *entry, void *context)
4479 {
4480     struct ps_signature *sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
4481     int i;
4482     for(i = 0; i < MAX_REG_INPUT; i++)
4483     {
4484         HeapFree(GetProcessHeap(), 0, (char *) sig->sig[i].semantic_name);
4485     }
4486     HeapFree(GetProcessHeap(), 0, sig->sig);
4487     HeapFree(GetProcessHeap(), 0, sig);
4488 }
4489
4490 /* Context activation is done by the caller. */
4491 static void shader_arb_free(IWineD3DDevice *iface) {
4492     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4493     const struct wined3d_gl_info *gl_info = &This->adapter->gl_info;
4494     struct shader_arb_priv *priv = This->shader_priv;
4495     int i;
4496
4497     ENTER_GL();
4498     if(priv->depth_blt_vprogram_id) {
4499         GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_vprogram_id));
4500     }
4501     for (i = 0; i < tex_type_count; ++i) {
4502         if (priv->depth_blt_fprogram_id[i]) {
4503             GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_fprogram_id[i]));
4504         }
4505     }
4506     LEAVE_GL();
4507
4508     wine_rb_destroy(&priv->signature_tree, release_signature, NULL);
4509     HeapFree(GetProcessHeap(), 0, This->shader_priv);
4510 }
4511
4512 static BOOL shader_arb_dirty_const(IWineD3DDevice *iface) {
4513     return TRUE;
4514 }
4515
4516 static void shader_arb_get_caps(WINED3DDEVTYPE devtype, const struct wined3d_gl_info *gl_info,
4517         struct shader_caps *pCaps)
4518 {
4519     /* We don't have an ARB fixed function pipeline yet, so let the none backend set its caps,
4520      * then overwrite the shader specific ones
4521      */
4522     none_shader_backend.shader_get_caps(devtype, gl_info, pCaps);
4523
4524     if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
4525         if(GL_SUPPORT(NV_VERTEX_PROGRAM3))
4526         {
4527             pCaps->VertexShaderVersion = WINED3DVS_VERSION(3,0);
4528             TRACE_(d3d_caps)("Hardware vertex shader version 3.0 enabled (NV_VERTEX_PROGRAM3)\n");
4529         }
4530         else if(GL_LIMITS(vshader_constantsF) >= 256)
4531         {
4532             /* Shader Model 2.0 requires at least 256 vertex shader constants */
4533             pCaps->VertexShaderVersion = WINED3DVS_VERSION(2,0);
4534             TRACE_(d3d_caps)("Hardware vertex shader version 2.0 enabled (ARB_PROGRAM)\n");
4535         }
4536         else
4537         {
4538             pCaps->VertexShaderVersion = WINED3DVS_VERSION(1,1);
4539             TRACE_(d3d_caps)("Hardware vertex shader version 1.1 enabled (ARB_PROGRAM)\n");
4540         }
4541         pCaps->MaxVertexShaderConst = GL_LIMITS(vshader_constantsF);
4542     }
4543
4544     if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM)) {
4545         if(GL_SUPPORT(NV_FRAGMENT_PROGRAM2))
4546         {
4547             pCaps->PixelShaderVersion    = WINED3DPS_VERSION(3,0);
4548             TRACE_(d3d_caps)("Hardware pixel shader version 3.0 enabled (NV_FRAGMENT_PROGRAM2)\n");
4549         }
4550         else if(GL_LIMITS(pshader_constantsF) >= 32)
4551         {
4552             /* Shader Model 2.0 requires at least 32 pixel shader constants */
4553             pCaps->PixelShaderVersion    = WINED3DPS_VERSION(2,0);
4554             TRACE_(d3d_caps)("Hardware pixel shader version 2.0 enabled (ARB_PROGRAM)\n");
4555         }
4556         else
4557         {
4558             pCaps->PixelShaderVersion    = WINED3DPS_VERSION(1,4);
4559             TRACE_(d3d_caps)("Hardware pixel shader version 1.4 enabled (ARB_PROGRAM)\n");
4560         }
4561         pCaps->PixelShader1xMaxValue = 8.0f;
4562         pCaps->MaxPixelShaderConst = GL_LIMITS(pshader_constantsF);
4563     }
4564
4565     pCaps->VSClipping = use_nv_clip(gl_info);
4566 }
4567
4568 static BOOL shader_arb_color_fixup_supported(struct color_fixup_desc fixup)
4569 {
4570     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
4571     {
4572         TRACE("Checking support for color_fixup:\n");
4573         dump_color_fixup_desc(fixup);
4574     }
4575
4576     /* We support everything except YUV conversions. */
4577     if (!is_yuv_fixup(fixup))
4578     {
4579         TRACE("[OK]\n");
4580         return TRUE;
4581     }
4582
4583     TRACE("[FAILED]\n");
4584     return FALSE;
4585 }
4586
4587 static void shader_arb_add_instruction_modifiers(const struct wined3d_shader_instruction *ins) {
4588     DWORD shift;
4589     char write_mask[20], regstr[50];
4590     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
4591     BOOL is_color = FALSE;
4592     const struct wined3d_shader_dst_param *dst;
4593
4594     if (!ins->dst_count) return;
4595
4596     dst = &ins->dst[0];
4597     shift = dst->shift;
4598     if(shift == 0) return; /* Saturate alone is handled by the instructions */
4599
4600     shader_arb_get_write_mask(ins, dst, write_mask);
4601     shader_arb_get_register_name(ins, &dst->reg, regstr, &is_color);
4602
4603     /* Generate a line that does the output modifier computation
4604      * FIXME: _SAT vs shift? _SAT alone is already handled in the instructions, if this
4605      * maps problems in e.g. _d4_sat modify shader_arb_get_modifier
4606      */
4607     shader_addline(buffer, "MUL%s %s%s, %s, %s;\n", shader_arb_get_modifier(ins),
4608                    regstr, write_mask, regstr, shift_tab[shift]);
4609 }
4610
4611 static const SHADER_HANDLER shader_arb_instruction_handler_table[WINED3DSIH_TABLE_SIZE] =
4612 {
4613     /* WINED3DSIH_ABS           */ shader_hw_map2gl,
4614     /* WINED3DSIH_ADD           */ shader_hw_map2gl,
4615     /* WINED3DSIH_BEM           */ pshader_hw_bem,
4616     /* WINED3DSIH_BREAK         */ shader_hw_break,
4617     /* WINED3DSIH_BREAKC        */ shader_hw_breakc,
4618     /* WINED3DSIH_BREAKP        */ NULL,
4619     /* WINED3DSIH_CALL          */ shader_hw_call,
4620     /* WINED3DSIH_CALLNZ        */ NULL,
4621     /* WINED3DSIH_CMP           */ pshader_hw_cmp,
4622     /* WINED3DSIH_CND           */ pshader_hw_cnd,
4623     /* WINED3DSIH_CRS           */ shader_hw_map2gl,
4624     /* WINED3DSIH_DCL           */ NULL,
4625     /* WINED3DSIH_DEF           */ NULL,
4626     /* WINED3DSIH_DEFB          */ NULL,
4627     /* WINED3DSIH_DEFI          */ NULL,
4628     /* WINED3DSIH_DP2ADD        */ pshader_hw_dp2add,
4629     /* WINED3DSIH_DP3           */ shader_hw_map2gl,
4630     /* WINED3DSIH_DP4           */ shader_hw_map2gl,
4631     /* WINED3DSIH_DST           */ shader_hw_map2gl,
4632     /* WINED3DSIH_DSX           */ shader_hw_map2gl,
4633     /* WINED3DSIH_DSY           */ shader_hw_dsy,
4634     /* WINED3DSIH_ELSE          */ shader_hw_else,
4635     /* WINED3DSIH_ENDIF         */ shader_hw_endif,
4636     /* WINED3DSIH_ENDLOOP       */ shader_hw_endloop,
4637     /* WINED3DSIH_ENDREP        */ shader_hw_endrep,
4638     /* WINED3DSIH_EXP           */ shader_hw_scalar_op,
4639     /* WINED3DSIH_EXPP          */ shader_hw_scalar_op,
4640     /* WINED3DSIH_FRC           */ shader_hw_map2gl,
4641     /* WINED3DSIH_IF            */ NULL /* Hardcoded into the shader */,
4642     /* WINED3DSIH_IFC           */ shader_hw_ifc,
4643     /* WINED3DSIH_LABEL         */ shader_hw_label,
4644     /* WINED3DSIH_LIT           */ shader_hw_map2gl,
4645     /* WINED3DSIH_LOG           */ shader_hw_log_pow,
4646     /* WINED3DSIH_LOGP          */ shader_hw_log_pow,
4647     /* WINED3DSIH_LOOP          */ shader_hw_loop,
4648     /* WINED3DSIH_LRP           */ shader_hw_lrp,
4649     /* WINED3DSIH_M3x2          */ shader_hw_mnxn,
4650     /* WINED3DSIH_M3x3          */ shader_hw_mnxn,
4651     /* WINED3DSIH_M3x4          */ shader_hw_mnxn,
4652     /* WINED3DSIH_M4x3          */ shader_hw_mnxn,
4653     /* WINED3DSIH_M4x4          */ shader_hw_mnxn,
4654     /* WINED3DSIH_MAD           */ shader_hw_map2gl,
4655     /* WINED3DSIH_MAX           */ shader_hw_map2gl,
4656     /* WINED3DSIH_MIN           */ shader_hw_map2gl,
4657     /* WINED3DSIH_MOV           */ shader_hw_mov,
4658     /* WINED3DSIH_MOVA          */ shader_hw_mov,
4659     /* WINED3DSIH_MUL           */ shader_hw_map2gl,
4660     /* WINED3DSIH_NOP           */ shader_hw_nop,
4661     /* WINED3DSIH_NRM           */ shader_hw_nrm,
4662     /* WINED3DSIH_PHASE         */ NULL,
4663     /* WINED3DSIH_POW           */ shader_hw_log_pow,
4664     /* WINED3DSIH_RCP           */ shader_hw_scalar_op,
4665     /* WINED3DSIH_REP           */ shader_hw_rep,
4666     /* WINED3DSIH_RET           */ shader_hw_ret,
4667     /* WINED3DSIH_RSQ           */ shader_hw_scalar_op,
4668     /* WINED3DSIH_SETP          */ NULL,
4669     /* WINED3DSIH_SGE           */ shader_hw_map2gl,
4670     /* WINED3DSIH_SGN           */ shader_hw_sgn,
4671     /* WINED3DSIH_SINCOS        */ shader_hw_sincos,
4672     /* WINED3DSIH_SLT           */ shader_hw_map2gl,
4673     /* WINED3DSIH_SUB           */ shader_hw_map2gl,
4674     /* WINED3DSIH_TEX           */ pshader_hw_tex,
4675     /* WINED3DSIH_TEXBEM        */ pshader_hw_texbem,
4676     /* WINED3DSIH_TEXBEML       */ pshader_hw_texbem,
4677     /* WINED3DSIH_TEXCOORD      */ pshader_hw_texcoord,
4678     /* WINED3DSIH_TEXDEPTH      */ pshader_hw_texdepth,
4679     /* WINED3DSIH_TEXDP3        */ pshader_hw_texdp3,
4680     /* WINED3DSIH_TEXDP3TEX     */ pshader_hw_texdp3tex,
4681     /* WINED3DSIH_TEXKILL       */ pshader_hw_texkill,
4682     /* WINED3DSIH_TEXLDD        */ shader_hw_texldd,
4683     /* WINED3DSIH_TEXLDL        */ shader_hw_texldl,
4684     /* WINED3DSIH_TEXM3x2DEPTH  */ pshader_hw_texm3x2depth,
4685     /* WINED3DSIH_TEXM3x2PAD    */ pshader_hw_texm3x2pad,
4686     /* WINED3DSIH_TEXM3x2TEX    */ pshader_hw_texm3x2tex,
4687     /* WINED3DSIH_TEXM3x3       */ pshader_hw_texm3x3,
4688     /* WINED3DSIH_TEXM3x3DIFF   */ NULL,
4689     /* WINED3DSIH_TEXM3x3PAD    */ pshader_hw_texm3x3pad,
4690     /* WINED3DSIH_TEXM3x3SPEC   */ pshader_hw_texm3x3spec,
4691     /* WINED3DSIH_TEXM3x3TEX    */ pshader_hw_texm3x3tex,
4692     /* WINED3DSIH_TEXM3x3VSPEC  */ pshader_hw_texm3x3vspec,
4693     /* WINED3DSIH_TEXREG2AR     */ pshader_hw_texreg2ar,
4694     /* WINED3DSIH_TEXREG2GB     */ pshader_hw_texreg2gb,
4695     /* WINED3DSIH_TEXREG2RGB    */ pshader_hw_texreg2rgb,
4696 };
4697
4698 static inline BOOL get_bool_const(const struct wined3d_shader_instruction *ins, IWineD3DBaseShaderImpl *This, DWORD idx)
4699 {
4700     BOOL vshader = shader_is_vshader_version(This->baseShader.reg_maps.shader_version.type);
4701     WORD bools = 0;
4702     WORD flag = (1 << idx);
4703     const local_constant *constant;
4704     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4705
4706     if(This->baseShader.reg_maps.local_bool_consts & flag)
4707     {
4708         /* What good is a if(bool) with a hardcoded local constant? I don't know, but handle it */
4709         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsB, local_constant, entry)
4710         {
4711             if (constant->idx == idx)
4712             {
4713                 return constant->value[0];
4714             }
4715         }
4716         ERR("Local constant not found\n");
4717         return FALSE;
4718     }
4719     else
4720     {
4721         if(vshader) bools = priv->cur_vs_args->boolclip.bools;
4722         else bools = priv->cur_ps_args->bools;
4723         return bools & flag;
4724     }
4725 }
4726
4727 static void get_loop_control_const(const struct wined3d_shader_instruction *ins,
4728         IWineD3DBaseShaderImpl *This, UINT idx, struct wined3d_shader_loop_control *loop_control)
4729 {
4730     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4731
4732     /* Integer constants can either be a local constant, or they can be stored in the shader
4733      * type specific compile args. */
4734     if (This->baseShader.reg_maps.local_int_consts & (1 << idx))
4735     {
4736         const local_constant *constant;
4737
4738         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsI, local_constant, entry)
4739         {
4740             if (constant->idx == idx)
4741             {
4742                 loop_control->count = constant->value[0];
4743                 loop_control->start = constant->value[1];
4744                 /* Step is signed. */
4745                 loop_control->step = (int)constant->value[2];
4746                 return;
4747             }
4748         }
4749         /* If this happens the flag was set incorrectly */
4750         ERR("Local constant not found\n");
4751         loop_control->count = 0;
4752         loop_control->start = 0;
4753         loop_control->step = 0;
4754         return;
4755     }
4756
4757     switch (This->baseShader.reg_maps.shader_version.type)
4758     {
4759         case WINED3D_SHADER_TYPE_VERTEX:
4760             /* Count and aL start value are unsigned */
4761             loop_control->count = priv->cur_vs_args->loop_ctrl[idx][0];
4762             loop_control->start = priv->cur_vs_args->loop_ctrl[idx][1];
4763             /* Step is signed. */
4764             loop_control->step = ((char)priv->cur_vs_args->loop_ctrl[idx][2]);
4765             break;
4766
4767         case WINED3D_SHADER_TYPE_PIXEL:
4768             loop_control->count = priv->cur_ps_args->loop_ctrl[idx][0];
4769             loop_control->start = priv->cur_ps_args->loop_ctrl[idx][1];
4770             loop_control->step = ((char)priv->cur_ps_args->loop_ctrl[idx][2]);
4771             break;
4772
4773         default:
4774             FIXME("Unhandled shader type %#x.\n", This->baseShader.reg_maps.shader_version.type);
4775             break;
4776     }
4777 }
4778
4779 static void record_instruction(struct list *list, const struct wined3d_shader_instruction *ins)
4780 {
4781     unsigned int i;
4782     struct wined3d_shader_dst_param *dst_param = NULL;
4783     struct wined3d_shader_src_param *src_param = NULL, *rel_addr = NULL;
4784     struct recorded_instruction *rec = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*rec));
4785     if(!rec)
4786     {
4787         ERR("Out of memory\n");
4788         return;
4789     }
4790
4791     rec->ins = *ins;
4792     dst_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param));
4793     if(!dst_param) goto free;
4794     *dst_param = *ins->dst;
4795     if(ins->dst->reg.rel_addr)
4796     {
4797         rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param->reg.rel_addr));
4798         if(!rel_addr) goto free;
4799         *rel_addr = *ins->dst->reg.rel_addr;
4800         dst_param->reg.rel_addr = rel_addr;
4801     }
4802     rec->ins.dst = dst_param;
4803
4804     src_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*src_param) * ins->src_count);
4805     if(!src_param) goto free;
4806     for(i = 0; i < ins->src_count; i++)
4807     {
4808         src_param[i] = ins->src[i];
4809         if(ins->src[i].reg.rel_addr)
4810         {
4811             rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*rel_addr));
4812             if(!rel_addr) goto free;
4813             *rel_addr = *ins->src[i].reg.rel_addr;
4814             src_param[i].reg.rel_addr = rel_addr;
4815         }
4816     }
4817     rec->ins.src = src_param;
4818     list_add_tail(list, &rec->entry);
4819     return;
4820
4821 free:
4822     ERR("Out of memory\n");
4823     if(dst_param)
4824     {
4825         HeapFree(GetProcessHeap(), 0, (void *) dst_param->reg.rel_addr);
4826         HeapFree(GetProcessHeap(), 0, dst_param);
4827     }
4828     if(src_param)
4829     {
4830         for(i = 0; i < ins->src_count; i++)
4831         {
4832             HeapFree(GetProcessHeap(), 0, (void *) src_param[i].reg.rel_addr);
4833         }
4834         HeapFree(GetProcessHeap(), 0, src_param);
4835     }
4836     HeapFree(GetProcessHeap(), 0, rec);
4837 }
4838
4839 static void free_recorded_instruction(struct list *list)
4840 {
4841     struct recorded_instruction *rec_ins, *entry2;
4842     unsigned int i;
4843
4844     LIST_FOR_EACH_ENTRY_SAFE(rec_ins, entry2, list, struct recorded_instruction, entry)
4845     {
4846         list_remove(&rec_ins->entry);
4847         if(rec_ins->ins.dst)
4848         {
4849             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.dst->reg.rel_addr);
4850             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.dst);
4851         }
4852         if(rec_ins->ins.src)
4853         {
4854             for(i = 0; i < rec_ins->ins.src_count; i++)
4855             {
4856                 HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.src[i].reg.rel_addr);
4857             }
4858             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.src);
4859         }
4860         HeapFree(GetProcessHeap(), 0, rec_ins);
4861     }
4862 }
4863
4864 static void shader_arb_handle_instruction(const struct wined3d_shader_instruction *ins) {
4865     SHADER_HANDLER hw_fct;
4866     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4867     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
4868     struct control_frame *control_frame;
4869     struct wined3d_shader_buffer *buffer = ins->ctx->buffer;
4870     BOOL bool_const;
4871
4872     if(ins->handler_idx == WINED3DSIH_LOOP || ins->handler_idx == WINED3DSIH_REP)
4873     {
4874         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4875         list_add_head(&priv->control_frames, &control_frame->entry);
4876
4877         if(ins->handler_idx == WINED3DSIH_LOOP) control_frame->type = LOOP;
4878         if(ins->handler_idx == WINED3DSIH_REP) control_frame->type = REP;
4879
4880         if(priv->target_version >= NV2)
4881         {
4882             control_frame->loop_no = priv->num_loops++;
4883             priv->loop_depth++;
4884         }
4885         else
4886         {
4887             /* Don't bother recording when we're in a not used if branch */
4888             if(priv->muted)
4889             {
4890                 return;
4891             }
4892
4893             if(!priv->recording)
4894             {
4895                 list_init(&priv->record);
4896                 priv->recording = TRUE;
4897                 control_frame->outer_loop = TRUE;
4898                 get_loop_control_const(ins, This, ins->src[0].reg.idx, &control_frame->loop_control);
4899                 return; /* Instruction is handled */
4900             }
4901             /* Record this loop in the outer loop's recording */
4902         }
4903     }
4904     else if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
4905     {
4906         if(priv->target_version >= NV2)
4907         {
4908             /* Nothing to do. The control frame is popped after the HW instr handler */
4909         }
4910         else
4911         {
4912             struct list *e = list_head(&priv->control_frames);
4913             control_frame = LIST_ENTRY(e, struct control_frame, entry);
4914             list_remove(&control_frame->entry);
4915
4916             if(control_frame->outer_loop)
4917             {
4918                 int iteration, aL = 0;
4919                 struct list copy;
4920
4921                 /* Turn off recording before playback */
4922                 priv->recording = FALSE;
4923
4924                 /* Move the recorded instructions to a separate list and get them out of the private data
4925                  * structure. If there are nested loops, the shader_arb_handle_instruction below will
4926                  * be recorded again, thus priv->record might be overwritten
4927                  */
4928                 list_init(&copy);
4929                 list_move_tail(&copy, &priv->record);
4930                 list_init(&priv->record);
4931
4932                 if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4933                 {
4934                     shader_addline(buffer, "#unrolling loop: %u iterations, aL=%u, inc %d\n",
4935                                    control_frame->loop_control.count, control_frame->loop_control.start,
4936                                    control_frame->loop_control.step);
4937                     aL = control_frame->loop_control.start;
4938                 }
4939                 else
4940                 {
4941                     shader_addline(buffer, "#unrolling rep: %u iterations\n", control_frame->loop_control.count);
4942                 }
4943
4944                 for (iteration = 0; iteration < control_frame->loop_control.count; ++iteration)
4945                 {
4946                     struct recorded_instruction *rec_ins;
4947                     if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4948                     {
4949                         priv->aL = aL;
4950                         shader_addline(buffer, "#Iteration %d, aL=%d\n", iteration, aL);
4951                     }
4952                     else
4953                     {
4954                         shader_addline(buffer, "#Iteration %d\n", iteration);
4955                     }
4956
4957                     LIST_FOR_EACH_ENTRY(rec_ins, &copy, struct recorded_instruction, entry)
4958                     {
4959                         shader_arb_handle_instruction(&rec_ins->ins);
4960                     }
4961
4962                     if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4963                     {
4964                         aL += control_frame->loop_control.step;
4965                     }
4966                 }
4967                 shader_addline(buffer, "#end loop/rep\n");
4968
4969                 free_recorded_instruction(&copy);
4970                 HeapFree(GetProcessHeap(), 0, control_frame);
4971                 return; /* Instruction is handled */
4972             }
4973             else
4974             {
4975                 /* This is a nested loop. Proceed to the normal recording function */
4976                 HeapFree(GetProcessHeap(), 0, control_frame);
4977             }
4978         }
4979     }
4980
4981     if(priv->recording)
4982     {
4983         record_instruction(&priv->record, ins);
4984         return;
4985     }
4986
4987     /* boolean if */
4988     if(ins->handler_idx == WINED3DSIH_IF)
4989     {
4990         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4991         list_add_head(&priv->control_frames, &control_frame->entry);
4992         control_frame->type = IF;
4993
4994         bool_const = get_bool_const(ins, This, ins->src[0].reg.idx);
4995         if(ins->src[0].modifiers == WINED3DSPSM_NOT) bool_const = !bool_const;
4996         if(!priv->muted && bool_const == FALSE)
4997         {
4998             shader_addline(buffer, "#if(FALSE){\n");
4999             priv->muted = TRUE;
5000             control_frame->muting = TRUE;
5001         }
5002         else shader_addline(buffer, "#if(TRUE) {\n");
5003
5004         return; /* Instruction is handled */
5005     }
5006     else if(ins->handler_idx == WINED3DSIH_IFC)
5007     {
5008         /* IF(bool) and if_cond(a, b) use the same ELSE and ENDIF tokens */
5009         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
5010         control_frame->type = IFC;
5011         control_frame->ifc_no = priv->num_ifcs++;
5012         list_add_head(&priv->control_frames, &control_frame->entry);
5013     }
5014     else if(ins->handler_idx == WINED3DSIH_ELSE)
5015     {
5016         struct list *e = list_head(&priv->control_frames);
5017         control_frame = LIST_ENTRY(e, struct control_frame, entry);
5018
5019         if(control_frame->type == IF)
5020         {
5021             shader_addline(buffer, "#} else {\n");
5022             if(!priv->muted && !control_frame->muting)
5023             {
5024                 priv->muted = TRUE;
5025                 control_frame->muting = TRUE;
5026             }
5027             else if(control_frame->muting) priv->muted = FALSE;
5028             return; /* Instruction is handled. */
5029         }
5030         /* In case of an ifc, generate a HW shader instruction */
5031     }
5032     else if(ins->handler_idx == WINED3DSIH_ENDIF)
5033     {
5034         struct list *e = list_head(&priv->control_frames);
5035         control_frame = LIST_ENTRY(e, struct control_frame, entry);
5036
5037         if(control_frame->type == IF)
5038         {
5039             shader_addline(buffer, "#} endif\n");
5040             if(control_frame->muting) priv->muted = FALSE;
5041             list_remove(&control_frame->entry);
5042             HeapFree(GetProcessHeap(), 0, control_frame);
5043             return; /* Instruction is handled */
5044         }
5045     }
5046
5047     if(priv->muted) return;
5048
5049     /* Select handler */
5050     hw_fct = shader_arb_instruction_handler_table[ins->handler_idx];
5051
5052     /* Unhandled opcode */
5053     if (!hw_fct)
5054     {
5055         FIXME("Backend can't handle opcode %#x\n", ins->handler_idx);
5056         return;
5057     }
5058     hw_fct(ins);
5059
5060     if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
5061     {
5062         struct list *e = list_head(&priv->control_frames);
5063         control_frame = LIST_ENTRY(e, struct control_frame, entry);
5064         list_remove(&control_frame->entry);
5065         HeapFree(GetProcessHeap(), 0, control_frame);
5066         priv->loop_depth--;
5067     }
5068     else if(ins->handler_idx == WINED3DSIH_ENDIF)
5069     {
5070         /* Non-ifc ENDIFs don't reach that place because of the return in the if block above */
5071         struct list *e = list_head(&priv->control_frames);
5072         control_frame = LIST_ENTRY(e, struct control_frame, entry);
5073         list_remove(&control_frame->entry);
5074         HeapFree(GetProcessHeap(), 0, control_frame);
5075     }
5076
5077
5078     shader_arb_add_instruction_modifiers(ins);
5079 }
5080
/* Shader backend function table of the ARB vertex/fragment program implementation.
 * The initializer is positional: every entry has to stay in the order in which
 * the members are declared in shader_backend_t. */
const shader_backend_t arb_program_shader_backend = {
    shader_arb_handle_instruction,
    shader_arb_select,
    shader_arb_select_depth_blt,
    shader_arb_deselect_depth_blt,
    shader_arb_update_float_vertex_constants,
    shader_arb_update_float_pixel_constants,
    shader_arb_load_constants,
    shader_arb_load_np2fixup_constants,
    shader_arb_destroy,
    shader_arb_alloc,
    shader_arb_free,
    shader_arb_dirty_const,
    shader_arb_get_caps,
    shader_arb_color_fixup_supported,
};
5097
/* ARB_fragment_program fixed function pipeline replacement definitions.
 *
 * These are the program.env[] slots used by the fixed function replacement:
 *   0        texture factor
 *   1        specular enable
 *   2 .. 9   per-stage constants, stages 0-7
 *   10 .. 17 bump environment matrices, stages 0-7
 *   18 .. 25 bump luminance scale/offset, stages 0-7
 *
 * The stage argument is parenthesized so that any expression can be passed
 * without operator precedence changing the computed slot. */
#define ARB_FFP_CONST_TFACTOR           0
#define ARB_FFP_CONST_SPECULAR_ENABLE   ((ARB_FFP_CONST_TFACTOR) + 1)
#define ARB_FFP_CONST_CONSTANT(i)       ((ARB_FFP_CONST_SPECULAR_ENABLE) + 1 + (i))
#define ARB_FFP_CONST_BUMPMAT(i)        ((ARB_FFP_CONST_CONSTANT(7)) + 1 + (i))
#define ARB_FFP_CONST_LUMINANCE(i)      ((ARB_FFP_CONST_BUMPMAT(7)) + 1 + (i))
5104
/* One entry of the fragment_shaders rb-tree kept in struct shader_arb_priv:
 * a generated fixed function fragment program. */
struct arbfp_ffp_desc
{
    /* Generic descriptor; its rb-tree entry (parent.entry) is what links this
     * structure into the tree, see arbfp_free_ffpshader(). */
    struct ffp_frag_desc parent;
    /* GL program name; released with glDeleteProgramsARB(). */
    GLuint shader;
    /* Not referenced in the part of the file reviewed here; presumably the
     * number of texture stages the program samples - TODO confirm. */
    unsigned int num_textures_used;
};
5111
5112 /* Context activation is done by the caller. */
5113 static void arbfp_enable(IWineD3DDevice *iface, BOOL enable) {
5114     ENTER_GL();
5115     if(enable) {
5116         glEnable(GL_FRAGMENT_PROGRAM_ARB);
5117         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
5118     } else {
5119         glDisable(GL_FRAGMENT_PROGRAM_ARB);
5120         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
5121     }
5122     LEAVE_GL();
5123 }
5124
5125 static HRESULT arbfp_alloc(IWineD3DDevice *iface) {
5126     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
5127     struct shader_arb_priv *priv;
5128     /* Share private data between the shader backend and the pipeline replacement, if both
5129      * are the arb implementation. This is needed to figure out whether ARBfp should be disabled
5130      * if no pixel shader is bound or not
5131      */
5132     if(This->shader_backend == &arb_program_shader_backend) {
5133         This->fragment_priv = This->shader_priv;
5134     } else {
5135         This->fragment_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct shader_arb_priv));
5136         if(!This->fragment_priv) return E_OUTOFMEMORY;
5137     }
5138     priv = This->fragment_priv;
5139     if (wine_rb_init(&priv->fragment_shaders, &wined3d_ffp_frag_program_rb_functions) == -1)
5140     {
5141         ERR("Failed to initialize rbtree.\n");
5142         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
5143         return E_OUTOFMEMORY;
5144     }
5145     priv->use_arbfp_fixed_func = TRUE;
5146     return WINED3D_OK;
5147 }
5148
5149 /* Context activation is done by the caller. */
5150 static void arbfp_free_ffpshader(struct wine_rb_entry *entry, void *context)
5151 {
5152     const struct wined3d_gl_info *gl_info = context;
5153     struct arbfp_ffp_desc *entry_arb = WINE_RB_ENTRY_VALUE(entry, struct arbfp_ffp_desc, parent.entry);
5154
5155     ENTER_GL();
5156     GL_EXTCALL(glDeleteProgramsARB(1, &entry_arb->shader));
5157     checkGLcall("glDeleteProgramsARB(1, &entry_arb->shader)");
5158     HeapFree(GetProcessHeap(), 0, entry_arb);
5159     LEAVE_GL();
5160 }
5161
5162 /* Context activation is done by the caller. */
5163 static void arbfp_free(IWineD3DDevice *iface) {
5164     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
5165     struct shader_arb_priv *priv = This->fragment_priv;
5166
5167     wine_rb_destroy(&priv->fragment_shaders, arbfp_free_ffpshader, &This->adapter->gl_info);
5168     priv->use_arbfp_fixed_func = FALSE;
5169
5170     if(This->shader_backend != &arb_program_shader_backend) {
5171         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
5172     }
5173 }
5174
5175 static void arbfp_get_caps(WINED3DDEVTYPE devtype, const struct wined3d_gl_info *gl_info, struct fragment_caps *caps)
5176 {
5177     caps->TextureOpCaps =  WINED3DTEXOPCAPS_DISABLE                     |
5178                            WINED3DTEXOPCAPS_SELECTARG1                  |
5179                            WINED3DTEXOPCAPS_SELECTARG2                  |
5180                            WINED3DTEXOPCAPS_MODULATE4X                  |
5181                            WINED3DTEXOPCAPS_MODULATE2X                  |
5182                            WINED3DTEXOPCAPS_MODULATE                    |
5183                            WINED3DTEXOPCAPS_ADDSIGNED2X                 |
5184                            WINED3DTEXOPCAPS_ADDSIGNED                   |
5185                            WINED3DTEXOPCAPS_ADD                         |
5186                            WINED3DTEXOPCAPS_SUBTRACT                    |
5187                            WINED3DTEXOPCAPS_ADDSMOOTH                   |
5188                            WINED3DTEXOPCAPS_BLENDCURRENTALPHA           |
5189                            WINED3DTEXOPCAPS_BLENDFACTORALPHA            |
5190                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHA           |
5191                            WINED3DTEXOPCAPS_BLENDDIFFUSEALPHA           |
5192                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHAPM         |
5193                            WINED3DTEXOPCAPS_MODULATEALPHA_ADDCOLOR      |
5194                            WINED3DTEXOPCAPS_MODULATECOLOR_ADDALPHA      |
5195                            WINED3DTEXOPCAPS_MODULATEINVCOLOR_ADDALPHA   |
5196                            WINED3DTEXOPCAPS_MODULATEINVALPHA_ADDCOLOR   |
5197                            WINED3DTEXOPCAPS_DOTPRODUCT3                 |
5198                            WINED3DTEXOPCAPS_MULTIPLYADD                 |
5199                            WINED3DTEXOPCAPS_LERP                        |
5200                            WINED3DTEXOPCAPS_BUMPENVMAP                  |
5201                            WINED3DTEXOPCAPS_BUMPENVMAPLUMINANCE;
5202
5203     /* TODO: Implement WINED3DTEXOPCAPS_PREMODULATE */
5204
5205     caps->MaxTextureBlendStages   = 8;
5206     caps->MaxSimultaneousTextures = min(GL_LIMITS(fragment_samplers), 8);
5207
5208     caps->PrimitiveMiscCaps |= WINED3DPMISCCAPS_TSSARGTEMP;
5209 }
#undef GLINFO_LOCATION

/* The functions below take a stateblock instead of a gl_info pointer, so from
 * here on GLINFO_LOCATION is resolved through the stateblock's device. Any
 * function relying on it must have a variable named "stateblock" in scope. */
#define GLINFO_LOCATION stateblock->wineD3DDevice->adapter->gl_info
5213 static void state_texfactor_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5214 {
5215     float col[4];
5216     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5217
5218     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
5219      * application provided constants
5220      */
5221     if(device->shader_backend == &arb_program_shader_backend) {
5222         if (use_ps(stateblock)) return;
5223
5224         device = stateblock->wineD3DDevice;
5225         context->pshader_const_dirty[ARB_FFP_CONST_TFACTOR] = 1;
5226         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_TFACTOR + 1);
5227     }
5228
5229     D3DCOLORTOGLFLOAT4(stateblock->renderState[WINED3DRS_TEXTUREFACTOR], col);
5230     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col));
5231     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col)");
5232
5233 }
5234
5235 static void state_arb_specularenable(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5236 {
5237     float col[4];
5238     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5239
5240     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
5241      * application provided constants
5242      */
5243     if(device->shader_backend == &arb_program_shader_backend) {
5244         if (use_ps(stateblock)) return;
5245
5246         device = stateblock->wineD3DDevice;
5247         context->pshader_const_dirty[ARB_FFP_CONST_SPECULAR_ENABLE] = 1;
5248         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_SPECULAR_ENABLE + 1);
5249     }
5250
5251     if(stateblock->renderState[WINED3DRS_SPECULARENABLE]) {
5252         /* The specular color has no alpha */
5253         col[0] = 1.0f; col[1] = 1.0f;
5254         col[2] = 1.0f; col[3] = 0.0f;
5255     } else {
5256         col[0] = 0.0f; col[1] = 0.0f;
5257         col[2] = 0.0f; col[3] = 0.0f;
5258     }
5259     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col));
5260     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col)");
5261 }
5262
5263 static void set_bumpmat_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5264 {
5265     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
5266     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5267     float mat[2][2];
5268
5269     if (use_ps(stateblock))
5270     {
5271         if (stage != 0
5272                 && (((IWineD3DPixelShaderImpl *)stateblock->pixelShader)->baseShader.reg_maps.bumpmat & (1 << stage)))
5273         {
5274             /* The pixel shader has to know the bump env matrix. Do a constants update if it isn't scheduled
5275              * anyway
5276              */
5277             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
5278                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
5279             }
5280         }
5281
5282         if(device->shader_backend == &arb_program_shader_backend) {
5283             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
5284             return;
5285         }
5286     } else if(device->shader_backend == &arb_program_shader_backend) {
5287         context->pshader_const_dirty[ARB_FFP_CONST_BUMPMAT(stage)] = 1;
5288         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_BUMPMAT(stage) + 1);
5289     }
5290
5291     mat[0][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT00]);
5292     mat[0][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT01]);
5293     mat[1][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT10]);
5294     mat[1][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT11]);
5295
5296     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0]));
5297     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0])");
5298 }
5299
5300 static void tex_bumpenvlum_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5301 {
5302     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
5303     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5304     float param[4];
5305
5306     if (use_ps(stateblock))
5307     {
5308         if (stage != 0
5309                 && (((IWineD3DPixelShaderImpl *)stateblock->pixelShader)->baseShader.reg_maps.luminanceparams & (1 << stage)))
5310         {
5311             /* The pixel shader has to know the luminance offset. Do a constants update if it
5312              * isn't scheduled anyway
5313              */
5314             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
5315                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
5316             }
5317         }
5318
5319         if(device->shader_backend == &arb_program_shader_backend) {
5320             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
5321             return;
5322         }
5323     } else if(device->shader_backend == &arb_program_shader_backend) {
5324         context->pshader_const_dirty[ARB_FFP_CONST_LUMINANCE(stage)] = 1;
5325         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_LUMINANCE(stage) + 1);
5326     }
5327
5328     param[0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLSCALE]);
5329     param[1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLOFFSET]);
5330     param[2] = 0.0f;
5331     param[3] = 0.0f;
5332
5333     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param));
5334     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param)");
5335 }
5336
5337 static const char *get_argreg(struct wined3d_shader_buffer *buffer, DWORD argnum, unsigned int stage, DWORD arg)
5338 {
5339     const char *ret;
5340
5341     if(arg == ARG_UNUSED) return "unused"; /* This is the marker for unused registers */
5342
5343     switch(arg & WINED3DTA_SELECTMASK) {
5344         case WINED3DTA_DIFFUSE:
5345             ret = "fragment.color.primary"; break;
5346
5347         case WINED3DTA_CURRENT:
5348             if(stage == 0) ret = "fragment.color.primary";
5349             else ret = "ret";
5350             break;
5351
5352         case WINED3DTA_TEXTURE:
5353             switch(stage) {
5354                 case 0: ret = "tex0"; break;
5355                 case 1: ret = "tex1"; break;
5356                 case 2: ret = "tex2"; break;
5357                 case 3: ret = "tex3"; break;
5358                 case 4: ret = "tex4"; break;
5359                 case 5: ret = "tex5"; break;
5360                 case 6: ret = "tex6"; break;
5361                 case 7: ret = "tex7"; break;
5362                 default: ret = "unknown texture";
5363             }
5364             break;
5365
5366         case WINED3DTA_TFACTOR:
5367             ret = "tfactor"; break;
5368
5369         case WINED3DTA_SPECULAR:
5370             ret = "fragment.color.secondary"; break;
5371
5372         case WINED3DTA_TEMP:
5373             ret = "tempreg"; break;
5374
5375         case WINED3DTA_CONSTANT:
5376             FIXME("Implement perstage constants\n");
5377             switch(stage) {
5378                 case 0: ret = "const0"; break;
5379                 case 1: ret = "const1"; break;
5380                 case 2: ret = "const2"; break;
5381                 case 3: ret = "const3"; break;
5382                 case 4: ret = "const4"; break;
5383                 case 5: ret = "const5"; break;
5384                 case 6: ret = "const6"; break;
5385                 case 7: ret = "const7"; break;
5386                 default: ret = "unknown constant";
5387             }
5388             break;
5389
5390         default:
5391             return "unknown";
5392     }
5393
5394     if(arg & WINED3DTA_COMPLEMENT) {
5395         shader_addline(buffer, "SUB arg%u, const.x, %s;\n", argnum, ret);
5396         if(argnum == 0) ret = "arg0";
5397         if(argnum == 1) ret = "arg1";
5398         if(argnum == 2) ret = "arg2";
5399     }
5400     if(arg & WINED3DTA_ALPHAREPLICATE) {
5401         shader_addline(buffer, "MOV arg%u, %s.w;\n", argnum, ret);
5402         if(argnum == 0) ret = "arg0";
5403         if(argnum == 1) ret = "arg1";
5404         if(argnum == 2) ret = "arg2";
5405     }
5406     return ret;
5407 }
5408
5409 static void gen_ffp_instr(struct wined3d_shader_buffer *buffer, unsigned int stage, BOOL color,
5410         BOOL alpha, DWORD dst, DWORD op, DWORD dw_arg0, DWORD dw_arg1, DWORD dw_arg2)
5411 {
5412     const char *dstmask, *dstreg, *arg0, *arg1, *arg2;
5413     unsigned int mul = 1;
5414     BOOL mul_final_dest = FALSE;
5415
5416     if(color && alpha) dstmask = "";
5417     else if(color) dstmask = ".xyz";
5418     else dstmask = ".w";
5419
5420     if(dst == tempreg) dstreg = "tempreg";
5421     else dstreg = "ret";
5422
5423     arg0 = get_argreg(buffer, 0, stage, dw_arg0);
5424     arg1 = get_argreg(buffer, 1, stage, dw_arg1);
5425     arg2 = get_argreg(buffer, 2, stage, dw_arg2);
5426
5427     switch(op) {
5428         case WINED3DTOP_DISABLE:
5429             if(stage == 0) shader_addline(buffer, "MOV %s%s, fragment.color.primary;\n", dstreg, dstmask);
5430             break;
5431
5432         case WINED3DTOP_SELECTARG2:
5433             arg1 = arg2;
5434         case WINED3DTOP_SELECTARG1:
5435             shader_addline(buffer, "MOV %s%s, %s;\n", dstreg, dstmask, arg1);
5436             break;
5437
5438         case WINED3DTOP_MODULATE4X:
5439             mul = 2;
5440         case WINED3DTOP_MODULATE2X:
5441             mul *= 2;
5442             if(strcmp(dstreg, "result.color") == 0) {
5443                 dstreg = "ret";
5444                 mul_final_dest = TRUE;
5445             }
5446         case WINED3DTOP_MODULATE:
5447             shader_addline(buffer, "MUL %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
5448             break;
5449
5450         case WINED3DTOP_ADDSIGNED2X:
5451             mul = 2;
5452             if(strcmp(dstreg, "result.color") == 0) {
5453                 dstreg = "ret";
5454                 mul_final_dest = TRUE;
5455             }
5456         case WINED3DTOP_ADDSIGNED:
5457             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
5458             arg2 = "arg2";
5459         case WINED3DTOP_ADD:
5460             shader_addline(buffer, "ADD_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
5461             break;
5462
5463         case WINED3DTOP_SUBTRACT:
5464             shader_addline(buffer, "SUB_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
5465             break;
5466
5467         case WINED3DTOP_ADDSMOOTH:
5468             shader_addline(buffer, "SUB arg1, const.x, %s;\n", arg1);
5469             shader_addline(buffer, "MAD_SAT %s%s, arg1, %s, %s;\n", dstreg, dstmask, arg2, arg1);
5470             break;
5471
5472         case WINED3DTOP_BLENDCURRENTALPHA:
5473             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_CURRENT);
5474             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5475             break;
5476         case WINED3DTOP_BLENDFACTORALPHA:
5477             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TFACTOR);
5478             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5479             break;
5480         case WINED3DTOP_BLENDTEXTUREALPHA:
5481             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
5482             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5483             break;
5484         case WINED3DTOP_BLENDDIFFUSEALPHA:
5485             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_DIFFUSE);
5486             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5487             break;
5488
5489         case WINED3DTOP_BLENDTEXTUREALPHAPM:
5490             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
5491             shader_addline(buffer, "SUB arg0.w, const.x, %s.w;\n", arg0);
5492             shader_addline(buffer, "MAD_SAT %s%s, %s, arg0.w, %s;\n", dstreg, dstmask, arg2, arg1);
5493             break;
5494
5495         /* D3DTOP_PREMODULATE ???? */
5496
5497         case WINED3DTOP_MODULATEINVALPHA_ADDCOLOR:
5498             shader_addline(buffer, "SUB arg0.w, const.x, %s;\n", arg1);
5499             shader_addline(buffer, "MAD_SAT %s%s, arg0.w, %s, %s;\n", dstreg, dstmask, arg2, arg1);
5500             break;
5501         case WINED3DTOP_MODULATEALPHA_ADDCOLOR:
5502             shader_addline(buffer, "MAD_SAT %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg1);
5503             break;
5504         case WINED3DTOP_MODULATEINVCOLOR_ADDALPHA:
5505             shader_addline(buffer, "SUB arg0, const.x, %s;\n", arg1);
5506             shader_addline(buffer, "MAD_SAT %s%s, arg0, %s, %s.w;\n", dstreg, dstmask, arg2, arg1);
5507             break;
5508         case WINED3DTOP_MODULATECOLOR_ADDALPHA:
5509             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s.w;\n", dstreg, dstmask, arg1, arg2, arg1);
5510             break;
5511
5512         case WINED3DTOP_DOTPRODUCT3:
5513             mul = 4;
5514             if(strcmp(dstreg, "result.color") == 0) {
5515                 dstreg = "ret";
5516                 mul_final_dest = TRUE;
5517             }
5518             shader_addline(buffer, "SUB arg1, %s, const.w;\n", arg1);
5519             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
5520             shader_addline(buffer, "DP3_SAT %s%s, arg1, arg2;\n", dstreg, dstmask);
5521             break;
5522
5523         case WINED3DTOP_MULTIPLYADD:
5524             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg0);
5525             break;
5526
5527         case WINED3DTOP_LERP:
5528             /* The msdn is not quite right here */
5529             shader_addline(buffer, "LRP %s%s, %s, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5530             break;
5531
5532         case WINED3DTOP_BUMPENVMAP:
5533         case WINED3DTOP_BUMPENVMAPLUMINANCE:
5534             /* Those are handled in the first pass of the shader(generation pass 1 and 2) already */
5535             break;
5536
5537         default:
5538             FIXME("Unhandled texture op %08x\n", op);
5539     }
5540
5541     if(mul == 2) {
5542         shader_addline(buffer, "MUL_SAT %s%s, %s, const.y;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
5543     } else if(mul == 4) {
5544         shader_addline(buffer, "MUL_SAT %s%s, %s, const.z;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
5545     }
5546 }
5547
5548 /* The stateblock is passed for GLINFO_LOCATION */
5549 static GLuint gen_arbfp_ffp_shader(const struct ffp_frag_settings *settings, IWineD3DStateBlockImpl *stateblock)
5550 {
5551     unsigned int stage;
5552     struct wined3d_shader_buffer buffer;
5553     BOOL tex_read[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
5554     BOOL bump_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
5555     BOOL luminance_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
5556     const char *textype;
5557     const char *instr, *sat;
5558     char colorcor_dst[8];
5559     GLuint ret;
5560     DWORD arg0, arg1, arg2;
5561     BOOL tempreg_used = FALSE, tfactor_used = FALSE;
5562     BOOL op_equal;
5563     const char *final_combiner_src = "ret";
5564     GLint pos;
5565
5566     /* Find out which textures are read */
5567     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5568         if(settings->op[stage].cop == WINED3DTOP_DISABLE) break;
5569         arg0 = settings->op[stage].carg0 & WINED3DTA_SELECTMASK;
5570         arg1 = settings->op[stage].carg1 & WINED3DTA_SELECTMASK;
5571         arg2 = settings->op[stage].carg2 & WINED3DTA_SELECTMASK;
5572         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5573         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5574         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5575
5576         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHA) tex_read[stage] = TRUE;
5577         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHAPM) tex_read[stage] = TRUE;
5578         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP) {
5579             bump_used[stage] = TRUE;
5580             tex_read[stage] = TRUE;
5581         }
5582         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5583             bump_used[stage] = TRUE;
5584             tex_read[stage] = TRUE;
5585             luminance_used[stage] = TRUE;
5586         } else if(settings->op[stage].cop == WINED3DTOP_BLENDFACTORALPHA) {
5587             tfactor_used = TRUE;
5588         }
5589
5590         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
5591             tfactor_used = TRUE;
5592         }
5593
5594         if(settings->op[stage].dst == tempreg) tempreg_used = TRUE;
5595         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
5596             tempreg_used = TRUE;
5597         }
5598
5599         if(settings->op[stage].aop == WINED3DTOP_DISABLE) continue;
5600         arg0 = settings->op[stage].aarg0 & WINED3DTA_SELECTMASK;
5601         arg1 = settings->op[stage].aarg1 & WINED3DTA_SELECTMASK;
5602         arg2 = settings->op[stage].aarg2 & WINED3DTA_SELECTMASK;
5603         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5604         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5605         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5606
5607         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
5608             tempreg_used = TRUE;
5609         }
5610         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
5611             tfactor_used = TRUE;
5612         }
5613     }
5614
5615     /* Shader header */
5616     if (!shader_buffer_init(&buffer))
5617     {
5618         ERR("Failed to initialize shader buffer.\n");
5619         return 0;
5620     }
5621
5622     shader_addline(&buffer, "!!ARBfp1.0\n");
5623
5624     switch(settings->fog) {
5625         case FOG_OFF:                                                         break;
5626         case FOG_LINEAR: shader_addline(&buffer, "OPTION ARB_fog_linear;\n"); break;
5627         case FOG_EXP:    shader_addline(&buffer, "OPTION ARB_fog_exp;\n");    break;
5628         case FOG_EXP2:   shader_addline(&buffer, "OPTION ARB_fog_exp2;\n");   break;
5629         default: FIXME("Unexpected fog setting %d\n", settings->fog);
5630     }
5631
5632     shader_addline(&buffer, "PARAM const = {1, 2, 4, 0.5};\n");
5633     shader_addline(&buffer, "TEMP TMP;\n");
5634     shader_addline(&buffer, "TEMP ret;\n");
5635     if(tempreg_used || settings->sRGB_write) shader_addline(&buffer, "TEMP tempreg;\n");
5636     shader_addline(&buffer, "TEMP arg0;\n");
5637     shader_addline(&buffer, "TEMP arg1;\n");
5638     shader_addline(&buffer, "TEMP arg2;\n");
5639     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5640         if(!tex_read[stage]) continue;
5641         shader_addline(&buffer, "TEMP tex%u;\n", stage);
5642         if(!bump_used[stage]) continue;
5643         shader_addline(&buffer, "PARAM bumpmat%u = program.env[%u];\n", stage, ARB_FFP_CONST_BUMPMAT(stage));
5644         if(!luminance_used[stage]) continue;
5645         shader_addline(&buffer, "PARAM luminance%u = program.env[%u];\n", stage, ARB_FFP_CONST_LUMINANCE(stage));
5646     }
5647     if(tfactor_used) {
5648         shader_addline(&buffer, "PARAM tfactor = program.env[%u];\n", ARB_FFP_CONST_TFACTOR);
5649     }
5650         shader_addline(&buffer, "PARAM specular_enable = program.env[%u];\n", ARB_FFP_CONST_SPECULAR_ENABLE);
5651
5652     if(settings->sRGB_write) {
5653         shader_addline(&buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
5654                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
5655         shader_addline(&buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
5656                        srgb_sub_high, 0.0, 0.0, 0.0);
5657     }
5658
5659     if(ffp_clip_emul(stateblock) && settings->emul_clipplanes) shader_addline(&buffer, "KIL fragment.texcoord[7];\n");
5660
5661     /* Generate texture sampling instructions) */
5662     for(stage = 0; stage < MAX_TEXTURES && settings->op[stage].cop != WINED3DTOP_DISABLE; stage++) {
5663         if(!tex_read[stage]) continue;
5664
5665         switch(settings->op[stage].tex_type) {
5666             case tex_1d:                    textype = "1D";     break;
5667             case tex_2d:                    textype = "2D";     break;
5668             case tex_3d:                    textype = "3D";     break;
5669             case tex_cube:                  textype = "CUBE";   break;
5670             case tex_rect:                  textype = "RECT";   break;
5671             default: textype = "unexpected_textype";   break;
5672         }
5673
5674         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP ||
5675            settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5676             sat = "";
5677         } else {
5678             sat = "_SAT";
5679         }
5680
5681         if(settings->op[stage].projected == proj_none) {
5682             instr = "TEX";
5683         } else if(settings->op[stage].projected == proj_count4 ||
5684                   settings->op[stage].projected == proj_count3) {
5685             instr = "TXP";
5686         } else {
5687             FIXME("Unexpected projection mode %d\n", settings->op[stage].projected);
5688             instr = "TXP";
5689         }
5690
5691         if(stage > 0 &&
5692            (settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAP ||
5693             settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE)) {
5694             shader_addline(&buffer, "SWZ arg1, bumpmat%u, x, z, 0, 0;\n", stage - 1);
5695             shader_addline(&buffer, "DP3 ret.x, arg1, tex%u;\n", stage - 1);
5696             shader_addline(&buffer, "SWZ arg1, bumpmat%u, y, w, 0, 0;\n", stage - 1);
5697             shader_addline(&buffer, "DP3 ret.y, arg1, tex%u;\n", stage - 1);
5698
5699             /* with projective textures, texbem only divides the static texture coord, not the displacement,
5700              * so multiply the displacement with the dividing parameter before passing it to TXP
5701              */
5702             if (settings->op[stage].projected != proj_none) {
5703                 if(settings->op[stage].projected == proj_count4) {
5704                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].w;\n", stage);
5705                     shader_addline(&buffer, "MUL ret.xyz, ret, fragment.texcoord[%u].w, fragment.texcoord[%u];\n", stage, stage);
5706                 } else {
5707                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].z;\n", stage);
5708                     shader_addline(&buffer, "MAD ret.xyz, ret, fragment.texcoord[%u].z, fragment.texcoord[%u];\n", stage, stage);
5709                 }
5710             } else {
5711                 shader_addline(&buffer, "ADD ret, ret, fragment.texcoord[%u];\n", stage);
5712             }
5713
5714             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
5715                            instr, sat, stage, stage, textype);
5716             if(settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5717                 shader_addline(&buffer, "MAD_SAT ret.x, tex%u.z, luminance%u.x, luminance%u.y;\n",
5718                                stage - 1, stage - 1, stage - 1);
5719                 shader_addline(&buffer, "MUL tex%u, tex%u, ret.x;\n", stage, stage);
5720             }
5721         } else if(settings->op[stage].projected == proj_count3) {
5722             shader_addline(&buffer, "MOV ret, fragment.texcoord[%u];\n", stage);
5723             shader_addline(&buffer, "MOV ret.w, ret.z;\n");
5724             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
5725                             instr, sat, stage, stage, textype);
5726         } else {
5727             shader_addline(&buffer, "%s%s tex%u, fragment.texcoord[%u], texture[%u], %s;\n",
5728                             instr, sat, stage, stage, stage, textype);
5729         }
5730
5731         sprintf(colorcor_dst, "tex%u", stage);
5732         gen_color_correction(&buffer, colorcor_dst, WINED3DSP_WRITEMASK_ALL, "const.x", "const.y",
5733                 settings->op[stage].color_fixup);
5734     }
5735
5736     /* Generate the main shader */
5737     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5738         if(settings->op[stage].cop == WINED3DTOP_DISABLE) {
5739             if(stage == 0) {
5740                 final_combiner_src = "fragment.color.primary";
5741             }
5742             break;
5743         }
5744
5745         if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
5746            settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
5747             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg1;
5748         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
5749                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
5750             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg2;
5751         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
5752                   settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
5753             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg1;
5754         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
5755                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
5756             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg2;
5757         } else {
5758             op_equal = settings->op[stage].aop   == settings->op[stage].cop &&
5759                        settings->op[stage].carg0 == settings->op[stage].aarg0 &&
5760                        settings->op[stage].carg1 == settings->op[stage].aarg1 &&
5761                        settings->op[stage].carg2 == settings->op[stage].aarg2;
5762         }
5763
5764         if(settings->op[stage].aop == WINED3DTOP_DISABLE) {
5765             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
5766                           settings->op[stage].cop, settings->op[stage].carg0,
5767                           settings->op[stage].carg1, settings->op[stage].carg2);
5768             if(stage == 0) {
5769                 shader_addline(&buffer, "MOV ret.w, fragment.color.primary.w;\n");
5770             }
5771         } else if(op_equal) {
5772             gen_ffp_instr(&buffer, stage, TRUE, TRUE, settings->op[stage].dst,
5773                           settings->op[stage].cop, settings->op[stage].carg0,
5774                           settings->op[stage].carg1, settings->op[stage].carg2);
5775         } else {
5776             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
5777                           settings->op[stage].cop, settings->op[stage].carg0,
5778                           settings->op[stage].carg1, settings->op[stage].carg2);
5779             gen_ffp_instr(&buffer, stage, FALSE, TRUE, settings->op[stage].dst,
5780                           settings->op[stage].aop, settings->op[stage].aarg0,
5781                           settings->op[stage].aarg1, settings->op[stage].aarg2);
5782         }
5783     }
5784
5785     if(settings->sRGB_write) {
5786         shader_addline(&buffer, "MAD ret, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
5787         arbfp_add_sRGB_correction(&buffer, "ret", "arg0", "arg1", "arg2", "tempreg", FALSE);
5788         shader_addline(&buffer, "MOV result.color, ret;\n");
5789     } else {
5790         shader_addline(&buffer, "MAD result.color, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
5791     }
5792
5793     /* Footer */
5794     shader_addline(&buffer, "END\n");
5795
5796     /* Generate the shader */
5797     GL_EXTCALL(glGenProgramsARB(1, &ret));
5798     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, ret));
5799     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
5800             strlen(buffer.buffer), buffer.buffer));
5801     checkGLcall("glProgramStringARB()");
5802
5803     glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
5804     if (pos != -1)
5805     {
5806         FIXME("Fragment program error at position %d: %s\n", pos,
5807               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
5808     }
5809     else
5810     {
5811         GLint native;
5812
5813         GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
5814         checkGLcall("glGetProgramivARB()");
5815         if (!native) WARN("Program exceeds native resource limits.\n");
5816     }
5817
5818     shader_buffer_free(&buffer);
5819     return ret;
5820 }
5821
5822 static void fragment_prog_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5823 {
5824     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5825     struct shader_arb_priv *priv = device->fragment_priv;
5826     BOOL use_pshader = use_ps(stateblock);
5827     BOOL use_vshader = use_vs(stateblock);
5828     struct ffp_frag_settings settings;
5829     const struct arbfp_ffp_desc *desc;
5830     unsigned int i;
5831
5832     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
5833
5834     if(isStateDirty(context, STATE_RENDER(WINED3DRS_FOGENABLE))) {
5835         if(!use_pshader && device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
5836             /* Reload fixed function constants since they collide with the pixel shader constants */
5837             for(i = 0; i < MAX_TEXTURES; i++) {
5838                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
5839             }
5840             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
5841             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
5842         } else if(use_pshader && !isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
5843             device->shader_backend->shader_select(context, use_pshader, use_vshader);
5844         }
5845         return;
5846     }
5847
5848     if(!use_pshader) {
5849         /* Find or create a shader implementing the fixed function pipeline settings, then activate it */
5850         gen_ffp_frag_op(stateblock, &settings, FALSE);
5851         desc = (const struct arbfp_ffp_desc *)find_ffp_frag_shader(&priv->fragment_shaders, &settings);
5852         if(!desc) {
5853             struct arbfp_ffp_desc *new_desc = HeapAlloc(GetProcessHeap(), 0, sizeof(*new_desc));
5854             if (!new_desc)
5855             {
5856                 ERR("Out of memory\n");
5857                 return;
5858             }
5859             new_desc->num_textures_used = 0;
5860             for(i = 0; i < GL_LIMITS(texture_stages); i++) {
5861                 if(settings.op[i].cop == WINED3DTOP_DISABLE) break;
5862                 new_desc->num_textures_used = i;
5863             }
5864
5865             memcpy(&new_desc->parent.settings, &settings, sizeof(settings));
5866             new_desc->shader = gen_arbfp_ffp_shader(&settings, stateblock);
5867             add_ffp_frag_shader(&priv->fragment_shaders, &new_desc->parent);
5868             TRACE("Allocated fixed function replacement shader descriptor %p\n", new_desc);
5869             desc = new_desc;
5870         }
5871
5872         /* Now activate the replacement program. GL_FRAGMENT_PROGRAM_ARB is already active(however, note the
5873          * comment above the shader_select call below). If e.g. GLSL is active, the shader_select call will
5874          * deactivate it.
5875          */
5876         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader));
5877         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader)");
5878         priv->current_fprogram_id = desc->shader;
5879
5880         if(device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
5881             /* Reload fixed function constants since they collide with the pixel shader constants */
5882             for(i = 0; i < MAX_TEXTURES; i++) {
5883                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
5884             }
5885             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
5886             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
5887         }
5888         context->last_was_pshader = FALSE;
5889     } else {
5890         context->last_was_pshader = TRUE;
5891     }
5892
5893     /* Finally, select the shader. If a pixel shader is used, it will be set and enabled by the shader backend.
5894      * If this shader backend is arbfp(most likely), then it will simply overwrite the last fixed function replace-
5895      * ment shader. If the shader backend is not ARB, it currently is important that the opengl implementation
5896      * type overwrites GL_ARB_fragment_program. This is currently the case with GLSL. If we really want to use
5897      * atifs or nvrc pixel shaders with arb fragment programs we'd have to disable GL_FRAGMENT_PROGRAM_ARB here
5898      *
5899      * Don't call shader_select if the vertex shader is dirty, because it will be called later on by the vertex
5900      * shader handler
5901      */
5902     if(!isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
5903         device->shader_backend->shader_select(context, use_pshader, use_vshader);
5904
5905         if (!isStateDirty(context, STATE_VERTEXSHADERCONSTANT) && (use_vshader || use_pshader)) {
5906             device->StateTable[STATE_VERTEXSHADERCONSTANT].apply(STATE_VERTEXSHADERCONSTANT, stateblock, context);
5907         }
5908     }
5909     if(use_pshader) {
5910         device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
5911     }
5912 }
5913
5914 /* We can't link the fog states to the fragment state directly since the vertex pipeline links them
5915  * to FOGENABLE. A different linking in different pipeline parts can't be expressed in the combined
5916  * state table, so we need to handle that with a forwarding function. The other invisible side effect
5917  * is that changing the fog start and fog end(which links to FOGENABLE in vertex) results in the
5918  * fragment_prog_arbfp function being called because FOGENABLE is dirty, which calls this function here
5919  */
5920 static void state_arbfp_fog(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5921 {
5922     enum fogsource new_source;
5923
5924     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
5925
5926     if(!isStateDirty(context, STATE_PIXELSHADER)) {
5927         fragment_prog_arbfp(state, stateblock, context);
5928     }
5929
5930     if(!stateblock->renderState[WINED3DRS_FOGENABLE]) return;
5931
5932     if(stateblock->renderState[WINED3DRS_FOGTABLEMODE] == WINED3DFOG_NONE) {
5933         if(use_vs(stateblock)) {
5934             new_source = FOGSOURCE_VS;
5935         } else {
5936             if(stateblock->renderState[WINED3DRS_FOGVERTEXMODE] == WINED3DFOG_NONE || context->last_was_rhw) {
5937                 new_source = FOGSOURCE_COORD;
5938             } else {
5939                 new_source = FOGSOURCE_FFP;
5940             }
5941         }
5942     } else {
5943         new_source = FOGSOURCE_FFP;
5944     }
5945     if(new_source != context->fog_source) {
5946         context->fog_source = new_source;
5947         state_fogstartend(STATE_RENDER(WINED3DRS_FOGSTART), stateblock, context);
5948     }
5949 }
5950
5951 static void textransform(DWORD state, IWineD3DStateBlockImpl *stateblock, struct wined3d_context *context)
5952 {
5953     if(!isStateDirty(context, STATE_PIXELSHADER)) {
5954         fragment_prog_arbfp(state, stateblock, context);
5955     }
5956 }
5957
5958 #undef GLINFO_LOCATION
5959
5960 static const struct StateEntryTemplate arbfp_fragmentstate_template[] = {
5961     {STATE_RENDER(WINED3DRS_TEXTUREFACTOR),               { STATE_RENDER(WINED3DRS_TEXTUREFACTOR),              state_texfactor_arbfp   }, WINED3D_GL_EXT_NONE             },
5962     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5963     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5964     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5965     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5966     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5967     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5968     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5969     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5970     {STATE_TEXTURESTAGE(0, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5971     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5972     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5973     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5974     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5975     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5976     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5977     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5978     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5979     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5980     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5981     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5982     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5983     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5984     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5985     {STATE_TEXTURESTAGE(1, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5986     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5987     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5988     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5989     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5990     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5991     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5992     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5993     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5994     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5995     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5996     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5997     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5998     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5999     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6000     {STATE_TEXTURESTAGE(2, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6001     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6002     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6003     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6004     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6005     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6006     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6007     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6008     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6009     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6010     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6011     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6012     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6013     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6014     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6015     {STATE_TEXTURESTAGE(3, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6016     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6017     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6018     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6019     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6020     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6021     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6022     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6023     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6024     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6025     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6026     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6027     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6028     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6029     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6030     {STATE_TEXTURESTAGE(4, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6031     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6032     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6033     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6034     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6035     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6036     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6037     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6038     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6039     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6040     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6041     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6042     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6043     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6044     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6045     {STATE_TEXTURESTAGE(5, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6046     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6047     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6048     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6049     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6050     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6051     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6052     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6053     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6054     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6055     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6056     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6057     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6058     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6059     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6060     {STATE_TEXTURESTAGE(6, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6061     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6062     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6063     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6064     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6065     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6066     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6067     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6068     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6069     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6070     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6071     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6072     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6073     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6074     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6075     {STATE_TEXTURESTAGE(7, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6076     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6077     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6078     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6079     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
6080     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6081     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
6082     {STATE_SAMPLER(0),                                    { STATE_SAMPLER(0),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6083     {STATE_SAMPLER(1),                                    { STATE_SAMPLER(1),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6084     {STATE_SAMPLER(2),                                    { STATE_SAMPLER(2),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6085     {STATE_SAMPLER(3),                                    { STATE_SAMPLER(3),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6086     {STATE_SAMPLER(4),                                    { STATE_SAMPLER(4),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6087     {STATE_SAMPLER(5),                                    { STATE_SAMPLER(5),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6088     {STATE_SAMPLER(6),                                    { STATE_SAMPLER(6),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6089     {STATE_SAMPLER(7),                                    { STATE_SAMPLER(7),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
6090     {STATE_PIXELSHADER,                                   { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6091     {STATE_RENDER(WINED3DRS_FOGENABLE),                   { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
6092     {STATE_RENDER(WINED3DRS_FOGTABLEMODE),                { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
6093     {STATE_RENDER(WINED3DRS_FOGVERTEXMODE),               { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
6094     {STATE_RENDER(WINED3DRS_FOGSTART),                    { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
6095     {STATE_RENDER(WINED3DRS_FOGEND),                      { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
6096     {STATE_RENDER(WINED3DRS_SRGBWRITEENABLE),             { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
6097     {STATE_RENDER(WINED3DRS_FOGCOLOR),                    { STATE_RENDER(WINED3DRS_FOGCOLOR),                   state_fogcolor          }, WINED3D_GL_EXT_NONE             },
6098     {STATE_RENDER(WINED3DRS_FOGDENSITY),                  { STATE_RENDER(WINED3DRS_FOGDENSITY),                 state_fogdensity        }, WINED3D_GL_EXT_NONE             },
6099     {STATE_TEXTURESTAGE(0,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(0, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6100     {STATE_TEXTURESTAGE(1,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(1, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6101     {STATE_TEXTURESTAGE(2,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(2, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6102     {STATE_TEXTURESTAGE(3,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(3, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6103     {STATE_TEXTURESTAGE(4,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(4, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6104     {STATE_TEXTURESTAGE(5,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(5, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6105     {STATE_TEXTURESTAGE(6,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(6, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6106     {STATE_TEXTURESTAGE(7,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(7, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
6107     {STATE_RENDER(WINED3DRS_SPECULARENABLE),              { STATE_RENDER(WINED3DRS_SPECULARENABLE),             state_arb_specularenable}, WINED3D_GL_EXT_NONE             },
6108     {0 /* Terminate */,                                   { 0,                                                  0                       }, WINED3D_GL_EXT_NONE             },
6109 };
6110
/* Fragment pipeline description for the "arbfp" implementation in this file.
 * The initializer is positional: the entries follow the member order of
 * struct fragment_pipeline, which is declared outside this file.
 * NOTE(review): the per-entry roles below are inferred from the callback
 * names - confirm against the struct declaration. */
const struct fragment_pipeline arbfp_fragment_pipeline = {
    arbfp_enable,
    arbfp_get_caps,
    arbfp_alloc,
    arbfp_free,
    shader_arb_color_fixup_supported,
    /* State handler table; its terminating entry is the all-zero row. */
    arbfp_fragmentstate_template,
    TRUE /* We can disable projected textures */
};
6120
6121 #define GLINFO_LOCATION device->adapter->gl_info
6122
6123 struct arbfp_blit_priv {
6124     GLenum yuy2_rect_shader, yuy2_2d_shader;
6125     GLenum uyvy_rect_shader, uyvy_2d_shader;
6126     GLenum yv12_rect_shader, yv12_2d_shader;
6127 };
6128
6129 static HRESULT arbfp_blit_alloc(IWineD3DDevice *iface) {
6130     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
6131     device->blit_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct arbfp_blit_priv));
6132     if(!device->blit_priv) {
6133         ERR("Out of memory\n");
6134         return E_OUTOFMEMORY;
6135     }
6136     return WINED3D_OK;
6137 }
6138
6139 /* Context activation is done by the caller. */
6140 static void arbfp_blit_free(IWineD3DDevice *iface) {
6141     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
6142     struct arbfp_blit_priv *priv = device->blit_priv;
6143
6144     ENTER_GL();
6145     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_rect_shader));
6146     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_2d_shader));
6147     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_rect_shader));
6148     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_2d_shader));
6149     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_rect_shader));
6150     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_2d_shader));
6151     checkGLcall("Delete yuv programs");
6152     LEAVE_GL();
6153 }
6154
6155 static BOOL gen_planar_yuv_read(struct wined3d_shader_buffer *buffer, enum yuv_fixup yuv_fixup,
6156         GLenum textype, char *luminance)
6157 {
6158     char chroma;
6159     const char *tex, *texinstr;
6160
6161     if (yuv_fixup == YUV_FIXUP_UYVY) {
6162         chroma = 'x';
6163         *luminance = 'w';
6164     } else {
6165         chroma = 'w';
6166         *luminance = 'x';
6167     }
6168     switch(textype) {
6169         case GL_TEXTURE_2D:             tex = "2D";     texinstr = "TXP"; break;
6170         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   texinstr = "TEX"; break;
6171         default:
6172             /* This is more tricky than just replacing the texture type - we have to navigate
6173              * properly in the texture to find the correct chroma values
6174              */
6175             FIXME("Implement yuv correction for non-2d, non-rect textures\n");
6176             return FALSE;
6177     }
6178
6179     /* First we have to read the chroma values. This means we need at least two pixels(no filtering),
6180      * or 4 pixels(with filtering). To get the unmodified chromas, we have to rid ourselves of the
6181      * filtering when we sample the texture.
6182      *
6183      * These are the rules for reading the chroma:
6184      *
6185      * Even pixel: Cr
6186      * Even pixel: U
6187      * Odd pixel: V
6188      *
6189      * So we have to get the sampling x position in non-normalized coordinates in integers
6190      */
6191     if(textype != GL_TEXTURE_RECTANGLE_ARB) {
6192         shader_addline(buffer, "MUL texcrd.xy, fragment.texcoord[0], size.x;\n");
6193         shader_addline(buffer, "MOV texcrd.w, size.x;\n");
6194     } else {
6195         shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
6196     }
6197     /* We must not allow filtering between pixel x and x+1, this would mix U and V
6198      * Vertical filtering is ok. However, bear in mind that the pixel center is at
6199      * 0.5, so add 0.5.
6200      */
6201     shader_addline(buffer, "FLR texcrd.x, texcrd.x;\n");
6202     shader_addline(buffer, "ADD texcrd.x, texcrd.x, coef.y;\n");
6203
6204     /* Divide the x coordinate by 0.5 and get the fraction. This gives 0.25 and 0.75 for the
6205      * even and odd pixels respectively
6206      */
6207     shader_addline(buffer, "MUL texcrd2, texcrd, coef.y;\n");
6208     shader_addline(buffer, "FRC texcrd2, texcrd2;\n");
6209
6210     /* Sample Pixel 1 */
6211     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
6212
6213     /* Put the value into either of the chroma values */
6214     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
6215     shader_addline(buffer, "MUL chroma.x, luminance.%c, temp.x;\n", chroma);
6216     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
6217     shader_addline(buffer, "MUL chroma.y, luminance.%c, temp.x;\n", chroma);
6218
6219     /* Sample pixel 2. If we read an even pixel(SLT above returned 1), sample
6220      * the pixel right to the current one. Otherwise, sample the left pixel.
6221      * Bias and scale the SLT result to -1;1 and add it to the texcrd.x.
6222      */
6223     shader_addline(buffer, "MAD temp.x, temp.x, coef.z, -coef.x;\n");
6224     shader_addline(buffer, "ADD texcrd.x, texcrd, temp.x;\n");
6225     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
6226
6227     /* Put the value into the other chroma */
6228     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
6229     shader_addline(buffer, "MAD chroma.y, luminance.%c, temp.x, chroma.y;\n", chroma);
6230     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
6231     shader_addline(buffer, "MAD chroma.x, luminance.%c, temp.x, chroma.x;\n", chroma);
6232
6233     /* TODO: If filtering is enabled, sample a 2nd pair of pixels left or right of
6234      * the current one and lerp the two U and V values
6235      */
6236
6237     /* This gives the correctly filtered luminance value */
6238     shader_addline(buffer, "TEX luminance, fragment.texcoord[0], texture[0], %s;\n", tex);
6239
6240     return TRUE;
6241 }
6242
6243 static BOOL gen_yv12_read(struct wined3d_shader_buffer *buffer, GLenum textype, char *luminance)
6244 {
6245     const char *tex;
6246
6247     switch(textype) {
6248         case GL_TEXTURE_2D:             tex = "2D";     break;
6249         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   break;
6250         default:
6251             FIXME("Implement yv12 correction for non-2d, non-rect textures\n");
6252             return FALSE;
6253     }
6254
6255     /* YV12 surfaces contain a WxH sized luminance plane, followed by a (W/2)x(H/2)
6256      * V and a (W/2)x(H/2) U plane, each with 8 bit per pixel. So the effective
6257      * bitdepth is 12 bits per pixel. Since the U and V planes have only half the
6258      * pitch of the luminance plane, the packing into the gl texture is a bit
6259      * unfortunate. If the whole texture is interpreted as luminance data it looks
6260      * approximately like this:
6261      *
6262      *        +----------------------------------+----
6263      *        |                                  |
6264      *        |                                  |
6265      *        |                                  |
6266      *        |                                  |
6267      *        |                                  |   2
6268      *        |            LUMINANCE             |   -
6269      *        |                                  |   3
6270      *        |                                  |
6271      *        |                                  |
6272      *        |                                  |
6273      *        |                                  |
6274      *        +----------------+-----------------+----
6275      *        |                |                 |
6276      *        |  U even rows   |  U odd rows     |
6277      *        |                |                 |   1
6278      *        +----------------+------------------   -
6279      *        |                |                 |   3
6280      *        |  V even rows   |  V odd rows     |
6281      *        |                |                 |
6282      *        +----------------+-----------------+----
6283      *        |                |                 |
6284      *        |     0.5        |       0.5       |
6285      *
6286      * So it appears as if there are 4 chroma images, but in fact the odd rows
6287      * in the chroma images are in the same row as the even ones. So its is
6288      * kinda tricky to read
6289      *
6290      * When reading from rectangle textures, keep in mind that the input y coordinates
6291      * go from 0 to d3d_height, whereas the opengl texture height is 1.5 * d3d_height
6292      */
6293     shader_addline(buffer, "PARAM yv12_coef = {%f, %f, %f, %f};\n",
6294             2.0f / 3.0f, 1.0f / 6.0f, (2.0f / 3.0f) + (1.0f / 6.0f), 1.0f / 3.0f);
6295
6296     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
6297     /* the chroma planes have only half the width */
6298     shader_addline(buffer, "MUL texcrd.x, texcrd.x, coef.y;\n");
6299
6300     /* The first value is between 2/3 and 5/6th of the texture's height, so scale+bias
6301      * the coordinate. Also read the right side of the image when reading odd lines
6302      *
6303      * Don't forget to clamp the y values in into the range, otherwise we'll get filtering
6304      * bleeding
6305      */
6306     if(textype == GL_TEXTURE_2D) {
6307
6308         shader_addline(buffer, "RCP chroma.w, size.y;\n");
6309
6310         shader_addline(buffer, "MUL texcrd2.y, texcrd.y, size.y;\n");
6311
6312         shader_addline(buffer, "FLR texcrd2.y, texcrd2.y;\n");
6313         shader_addline(buffer, "MAD texcrd.y, texcrd.y, yv12_coef.y, yv12_coef.x;\n");
6314
6315         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
6316         shader_addline(buffer, "ADD texcrd2.x, texcrd2.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
6317         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
6318         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
6319         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
6320
6321         /* clamp, keep the half pixel origin in mind */
6322         shader_addline(buffer, "MAD temp.y, coef.y, chroma.w, yv12_coef.x;\n");
6323         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
6324         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.z;\n");
6325         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
6326     } else {
6327         /* Read from [size - size+size/4] */
6328         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
6329         shader_addline(buffer, "MAD texcrd.y, texcrd.y, coef.w, size.y;\n");
6330
6331         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
6332         shader_addline(buffer, "ADD texcrd2.x, texcrd.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
6333         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
6334         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
6335         shader_addline(buffer, "MUL texcrd2.x, texcrd2.x, size.x;\n");
6336         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
6337
6338         /* Make sure to read exactly from the pixel center */
6339         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
6340         shader_addline(buffer, "ADD texcrd.y, texcrd.y, coef.y;\n");
6341
6342         /* Clamp */
6343         shader_addline(buffer, "MAD temp.y, size.y, coef.w, size.y;\n");
6344         shader_addline(buffer, "ADD temp.y, temp.y, -coef.y;\n");
6345         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
6346         shader_addline(buffer, "ADD temp.y, size.y, -coef.y;\n");
6347         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
6348     }
6349     /* Read the texture, put the result into the output register */
6350     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
6351     shader_addline(buffer, "MOV chroma.x, temp.w;\n");
6352
6353     /* The other chroma value is 1/6th of the texture lower, from 5/6th to 6/6th
6354      * No need to clamp because we're just reusing the already clamped value from above
6355      */
6356     if(textype == GL_TEXTURE_2D) {
6357         shader_addline(buffer, "ADD texcrd.y, texcrd.y, yv12_coef.y;\n");
6358     } else {
6359         shader_addline(buffer, "MAD texcrd.y, size.y, coef.w, texcrd.y;\n");
6360     }
6361     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
6362     shader_addline(buffer, "MOV chroma.y, temp.w;\n");
6363
6364     /* Sample the luminance value. It is in the top 2/3rd of the texture, so scale the y coordinate.
6365      * Clamp the y coordinate to prevent the chroma values from bleeding into the sampled luminance
6366      * values due to filtering
6367      */
6368     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
6369     if(textype == GL_TEXTURE_2D) {
6370         /* Multiply the y coordinate by 2/3 and clamp it */
6371         shader_addline(buffer, "MUL texcrd.y, texcrd.y, yv12_coef.x;\n");
6372         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.x;\n");
6373         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
6374         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
6375     } else {
6376         /* Reading from texture_rectangles is pretty straightforward, just use the unmodified
6377          * texture coordinate. It is still a good idea to clamp it though, since the opengl texture
6378          * is bigger
6379          */
6380         shader_addline(buffer, "ADD temp.x, size.y, -coef.y;\n");
6381         shader_addline(buffer, "MIN texcrd.y, texcrd.y, size.x;\n");
6382         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
6383     }
6384     *luminance = 'a';
6385
6386     return TRUE;
6387 }
6388
6389 /* Context activation is done by the caller. */
6390 static GLuint gen_yuv_shader(IWineD3DDeviceImpl *device, enum yuv_fixup yuv_fixup, GLenum textype)
6391 {
6392     GLenum shader;
6393     struct wined3d_shader_buffer buffer;
6394     char luminance_component;
6395     struct arbfp_blit_priv *priv = device->blit_priv;
6396     GLint pos;
6397
6398     /* Shader header */
6399     if (!shader_buffer_init(&buffer))
6400     {
6401         ERR("Failed to initialize shader buffer.\n");
6402         return 0;
6403     }
6404
6405     ENTER_GL();
6406     GL_EXTCALL(glGenProgramsARB(1, &shader));
6407     checkGLcall("GL_EXTCALL(glGenProgramsARB(1, &shader))");
6408     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
6409     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
6410     LEAVE_GL();
6411     if(!shader) {
6412         shader_buffer_free(&buffer);
6413         return 0;
6414     }
6415
6416     /* The YUY2 and UYVY formats contain two pixels packed into a 32 bit macropixel,
6417      * giving effectively 16 bit per pixel. The color consists of a luminance(Y) and
6418      * two chroma(U and V) values. Each macropixel has two luminance values, one for
6419      * each single pixel it contains, and one U and one V value shared between both
6420      * pixels.
6421      *
6422      * The data is loaded into an A8L8 texture. With YUY2, the luminance component
6423      * contains the luminance and alpha the chroma. With UYVY it is vice versa. Thus
6424      * take the format into account when generating the read swizzles
6425      *
6426      * Reading the Y value is straightforward - just sample the texture. The hardware
6427      * takes care of filtering in the horizontal and vertical direction.
6428      *
6429      * Reading the U and V values is harder. We have to avoid filtering horizontally,
6430      * because that would mix the U and V values of one pixel or two adjacent pixels.
6431      * Thus floor the texture coordinate and add 0.5 to get an unfiltered read,
6432      * regardless of the filtering setting. Vertical filtering works automatically
6433      * though - the U and V values of two rows are mixed nicely.
6434      *
6435      * Appart of avoiding filtering issues, the code has to know which value it just
6436      * read, and where it can find the other one. To determine this, it checks if
6437      * it sampled an even or odd pixel, and shifts the 2nd read accordingly.
6438      *
6439      * Handling horizontal filtering of U and V values requires reading a 2nd pair
6440      * of pixels, extracting U and V and mixing them. This is not implemented yet.
6441      *
6442      * An alternative implementation idea is to load the texture as A8R8G8B8 texture,
6443      * with width / 2. This way one read gives all 3 values, finding U and V is easy
6444      * in an unfiltered situation. Finding the luminance on the other hand requires
6445      * finding out if it is an odd or even pixel. The real drawback of this approach
6446      * is filtering. This would have to be emulated completely in the shader, reading
6447      * up two 2 packed pixels in up to 2 rows and interpolating both horizontally and
6448      * vertically. Beyond that it would require adjustments to the texture handling
6449      * code to deal with the width scaling
6450      */
6451     shader_addline(&buffer, "!!ARBfp1.0\n");
6452     shader_addline(&buffer, "TEMP luminance;\n");
6453     shader_addline(&buffer, "TEMP temp;\n");
6454     shader_addline(&buffer, "TEMP chroma;\n");
6455     shader_addline(&buffer, "TEMP texcrd;\n");
6456     shader_addline(&buffer, "TEMP texcrd2;\n");
6457     shader_addline(&buffer, "PARAM coef = {1.0, 0.5, 2.0, 0.25};\n");
6458     shader_addline(&buffer, "PARAM yuv_coef = {1.403, 0.344, 0.714, 1.770};\n");
6459     shader_addline(&buffer, "PARAM size = program.local[0];\n");
6460
6461     switch (yuv_fixup)
6462     {
6463         case YUV_FIXUP_UYVY:
6464         case YUV_FIXUP_YUY2:
6465             if (!gen_planar_yuv_read(&buffer, yuv_fixup, textype, &luminance_component))
6466             {
6467                 shader_buffer_free(&buffer);
6468                 return 0;
6469             }
6470             break;
6471
6472         case YUV_FIXUP_YV12:
6473             if (!gen_yv12_read(&buffer, textype, &luminance_component))
6474             {
6475                 shader_buffer_free(&buffer);
6476                 return 0;
6477             }
6478             break;
6479
6480         default:
6481             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
6482             shader_buffer_free(&buffer);
6483             return 0;
6484     }
6485
6486     /* Calculate the final result. Formula is taken from
6487      * http://www.fourcc.org/fccyvrgb.php. Note that the chroma
6488      * ranges from -0.5 to 0.5
6489      */
6490     shader_addline(&buffer, "SUB chroma.xy, chroma, coef.y;\n");
6491
6492     shader_addline(&buffer, "MAD result.color.x, chroma.x, yuv_coef.x, luminance.%c;\n", luminance_component);
6493     shader_addline(&buffer, "MAD temp.x, -chroma.y, yuv_coef.y, luminance.%c;\n", luminance_component);
6494     shader_addline(&buffer, "MAD result.color.y, -chroma.x, yuv_coef.z, temp.x;\n");
6495     shader_addline(&buffer, "MAD result.color.z, chroma.y, yuv_coef.w, luminance.%c;\n", luminance_component);
6496     shader_addline(&buffer, "END\n");
6497
6498     ENTER_GL();
6499     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
6500             strlen(buffer.buffer), buffer.buffer));
6501     checkGLcall("glProgramStringARB()");
6502
6503     glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
6504     if (pos != -1)
6505     {
6506         FIXME("Fragment program error at position %d: %s\n", pos,
6507               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
6508     }
6509     else
6510     {
6511         GLint native;
6512
6513         GL_EXTCALL(glGetProgramivARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_UNDER_NATIVE_LIMITS_ARB, &native));
6514         checkGLcall("glGetProgramivARB()");
6515         if (!native) WARN("Program exceeds native resource limits.\n");
6516     }
6517
6518     shader_buffer_free(&buffer);
6519     LEAVE_GL();
6520
6521     switch (yuv_fixup)
6522     {
6523         case YUV_FIXUP_YUY2:
6524             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yuy2_rect_shader = shader;
6525             else priv->yuy2_2d_shader = shader;
6526             break;
6527
6528         case YUV_FIXUP_UYVY:
6529             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->uyvy_rect_shader = shader;
6530             else priv->uyvy_2d_shader = shader;
6531             break;
6532
6533         case YUV_FIXUP_YV12:
6534             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yv12_rect_shader = shader;
6535             else priv->yv12_2d_shader = shader;
6536             break;
6537     }
6538
6539     return shader;
6540 }
6541
6542 /* Context activation is done by the caller. */
6543 static HRESULT arbfp_blit_set(IWineD3DDevice *iface, const struct GlPixelFormatDesc *format_desc,
6544         GLenum textype, UINT width, UINT height)
6545 {
6546     GLenum shader;
6547     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
6548     float size[4] = {width, height, 1, 1};
6549     struct arbfp_blit_priv *priv = device->blit_priv;
6550     enum yuv_fixup yuv_fixup;
6551
6552     if (!is_yuv_fixup(format_desc->color_fixup))
6553     {
6554         TRACE("Fixup:\n");
6555         dump_color_fixup_desc(format_desc->color_fixup);
6556         /* Don't bother setting up a shader for unconverted formats */
6557         ENTER_GL();
6558         glEnable(textype);
6559         checkGLcall("glEnable(textype)");
6560         LEAVE_GL();
6561         return WINED3D_OK;
6562     }
6563
6564     yuv_fixup = get_yuv_fixup(format_desc->color_fixup);
6565
6566     switch(yuv_fixup)
6567     {
6568         case YUV_FIXUP_YUY2:
6569             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yuy2_rect_shader : priv->yuy2_2d_shader;
6570             break;
6571
6572         case YUV_FIXUP_UYVY:
6573             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->uyvy_rect_shader : priv->uyvy_2d_shader;
6574             break;
6575
6576         case YUV_FIXUP_YV12:
6577             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yv12_rect_shader : priv->yv12_2d_shader;
6578             break;
6579
6580         default:
6581             FIXME("Unsupported YUV fixup %#x, not setting a shader\n", yuv_fixup);
6582             ENTER_GL();
6583             glEnable(textype);
6584             checkGLcall("glEnable(textype)");
6585             LEAVE_GL();
6586             return E_NOTIMPL;
6587     }
6588
6589     if (!shader) shader = gen_yuv_shader(device, yuv_fixup, textype);
6590
6591     ENTER_GL();
6592     glEnable(GL_FRAGMENT_PROGRAM_ARB);
6593     checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
6594     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
6595     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
6596     GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, size));
6597     checkGLcall("glProgramLocalParameter4fvARB");
6598     LEAVE_GL();
6599
6600     return WINED3D_OK;
6601 }
6602
6603 /* Context activation is done by the caller. */
6604 static void arbfp_blit_unset(IWineD3DDevice *iface) {
6605     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
6606
6607     ENTER_GL();
6608     glDisable(GL_FRAGMENT_PROGRAM_ARB);
6609     checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
6610     glDisable(GL_TEXTURE_2D);
6611     checkGLcall("glDisable(GL_TEXTURE_2D)");
6612     if(GL_SUPPORT(ARB_TEXTURE_CUBE_MAP)) {
6613         glDisable(GL_TEXTURE_CUBE_MAP_ARB);
6614         checkGLcall("glDisable(GL_TEXTURE_CUBE_MAP_ARB)");
6615     }
6616     if(GL_SUPPORT(ARB_TEXTURE_RECTANGLE)) {
6617         glDisable(GL_TEXTURE_RECTANGLE_ARB);
6618         checkGLcall("glDisable(GL_TEXTURE_RECTANGLE_ARB)");
6619     }
6620     LEAVE_GL();
6621 }
6622
6623 static BOOL arbfp_blit_color_fixup_supported(struct color_fixup_desc fixup)
6624 {
6625     enum yuv_fixup yuv_fixup;
6626
6627     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
6628     {
6629         TRACE("Checking support for fixup:\n");
6630         dump_color_fixup_desc(fixup);
6631     }
6632
6633     if (is_identity_fixup(fixup))
6634     {
6635         TRACE("[OK]\n");
6636         return TRUE;
6637     }
6638
6639     /* We only support YUV conversions. */
6640     if (!is_yuv_fixup(fixup))
6641     {
6642         TRACE("[FAILED]\n");
6643         return FALSE;
6644     }
6645
6646     yuv_fixup = get_yuv_fixup(fixup);
6647     switch(yuv_fixup)
6648     {
6649         case YUV_FIXUP_YUY2:
6650         case YUV_FIXUP_UYVY:
6651         case YUV_FIXUP_YV12:
6652             TRACE("[OK]\n");
6653             return TRUE;
6654
6655         default:
6656             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
6657             TRACE("[FAILED]\n");
6658             return FALSE;
6659     }
6660 }
6661
6662 const struct blit_shader arbfp_blit = {
6663     arbfp_blit_alloc,
6664     arbfp_blit_free,
6665     arbfp_blit_set,
6666     arbfp_blit_unset,
6667     arbfp_blit_color_fixup_supported,
6668 };
6669
6670 #undef GLINFO_LOCATION