wined3d: Eliminate a redundant local variable in get_loop_control_const().
[wine] / dlls / wined3d / arb_program_shader.c
1 /*
2  * Pixel and vertex shaders implementation using ARB_vertex_program
3  * and ARB_fragment_program GL extensions.
4  *
5  * Copyright 2002-2003 Jason Edmeades
6  * Copyright 2002-2003 Raphael Junqueira
7  * Copyright 2004 Christian Costa
8  * Copyright 2005 Oliver Stieber
9  * Copyright 2006 Ivan Gyurdiev
10  * Copyright 2006 Jason Green
11  * Copyright 2006 Henri Verbeet
12  * Copyright 2007-2008 Stefan Dösinger for CodeWeavers
13  * Copyright 2009 Henri Verbeet for CodeWeavers
14  *
15  * This library is free software; you can redistribute it and/or
16  * modify it under the terms of the GNU Lesser General Public
17  * License as published by the Free Software Foundation; either
18  * version 2.1 of the License, or (at your option) any later version.
19  *
20  * This library is distributed in the hope that it will be useful,
21  * but WITHOUT ANY WARRANTY; without even the implied warranty of
22  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
23  * Lesser General Public License for more details.
24  *
25  * You should have received a copy of the GNU Lesser General Public
26  * License along with this library; if not, write to the Free Software
27  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA
28  */
29
30 #include "config.h"
31
32 #include <math.h>
33 #include <stdio.h>
34
35 #include "wined3d_private.h"
36
/* Debug channels used in this file: d3d_shader is the default channel, the
 * others are declared so they can be named explicitly, e.g. TRACE_(d3d_constants). */
WINE_DEFAULT_DEBUG_CHANNEL(d3d_shader);
WINE_DECLARE_DEBUG_CHANNEL(d3d_constants);
WINE_DECLARE_DEBUG_CHANNEL(d3d_caps);
WINE_DECLARE_DEBUG_CHANNEL(d3d);

/* NOTE(review): presumably consumed by the GL_SUPPORT / GL_LIMITS / GL_EXTCALL
 * macros; every function in this file that uses them has a local or parameter
 * named gl_info in scope - confirm against wined3d_private.h. */
#define GLINFO_LOCATION      (*gl_info)
43
44 /* GL locking for state handlers is done by the caller. */
45 static BOOL need_mova_const(IWineD3DBaseShader *shader, const WineD3D_GL_Info *gl_info) {
46     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *) shader;
47     if(!This->baseShader.reg_maps.usesmova) return FALSE;
48     return !GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
49 }
50
51 static BOOL need_helper_const(const WineD3D_GL_Info *gl_info) {
52     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)   || /* Need to init colors */
53        gl_info->arb_vs_offset_limit     || /* Have to init texcoords */
54        gl_info->set_texcoord_w) {          /* Load the immval offset */
55         return TRUE;
56     }
57     return FALSE;
58 }
59
60 static unsigned int reserved_vs_const(IWineD3DBaseShader *shader, const WineD3D_GL_Info *gl_info) {
61     unsigned int ret = 1;
62     /* We use one PARAM for the pos fixup, and in some cases one to load
63      * some immediate values into the shader
64      */
65     if(need_helper_const(gl_info)) ret++;
66     if(need_mova_const(shader, gl_info)) ret++;
67     return ret;
68 }
69
70 static inline BOOL ffp_clip_emul(IWineD3DStateBlockImpl *stateblock)
71 {
72     return stateblock->lowest_disabled_stage < 7;
73 }
74
75 /* Returns TRUE if result.clip from GL_NV_vertex_program2 should be used and FALSE otherwise */
76 static inline BOOL use_nv_clip(const WineD3D_GL_Info *gl_info)
77 {
78     return GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION);
79 }
80
/* Internally used shader constants. Applications can use constants 0 to GL_LIMITS(vshader_constantsF) - 1,
 * so upload them above that.
 *
 * NOTE(review): the base below is GL_LIMITS(vshader_constantsF) - 1, i.e. the last slot of the
 * advertised range; presumably the advertised limit leaves room for it - confirm.
 *
 * Both expansions are fully parenthesized so the macros stay a single operand when they are used
 * inside a larger expression.
 */
#define ARB_SHADER_PRIVCONST_BASE (GL_LIMITS(vshader_constantsF) - 1)
#define ARB_SHADER_PRIVCONST_POS (ARB_SHADER_PRIVCONST_BASE + 0)
86
87 /* ARB_program_shader private data */
88
/* Loop control triple: count, start and step (step is the only signed member). */
struct loop_control
{
    unsigned int count, start;
    int step;
};
95
96 struct control_frame
97 {
98     struct                          list entry;
99     enum
100     {
101         IF,
102         IFC,
103         LOOP,
104         REP
105     } type;
106     BOOL                            muting;
107     BOOL                            outer_loop;
108     union
109     {
110         unsigned int                loop_no;
111         unsigned int                ifc_no;
112     };
113     struct loop_control             loop_control;
114     BOOL                            had_else;
115 };
116
117 struct arb_ps_compile_args
118 {
119     struct ps_compile_args          super;
120     DWORD                           bools; /* WORD is enough, use DWORD for alignment */
121     unsigned char                   loop_ctrl[MAX_CONST_I][3];
122 };
123
124 struct stb_const_desc
125 {
126     unsigned char           texunit;
127     UINT                    const_num;
128 };
129
130 struct arb_ps_compiled_shader
131 {
132     struct arb_ps_compile_args      args;
133     GLuint                          prgId;
134     struct stb_const_desc           bumpenvmatconst[MAX_TEXTURES];
135     unsigned char                   numbumpenvmatconsts;
136     struct stb_const_desc           luminanceconst[MAX_TEXTURES];
137     UINT                            int_consts[MAX_CONST_I];
138     char                            num_int_consts;
139     UINT                            ycorrection;
140 };
141
142 struct arb_vs_compile_args
143 {
144     struct vs_compile_args          super;
145     union
146     {
147         struct
148         {
149             WORD                    bools;
150             char                    clip_control[2];
151         }                           boolclip;
152         DWORD                       boolclip_compare;
153     };
154     DWORD                           ps_signature;
155     union
156     {
157         unsigned char               vertex_samplers[4];
158         DWORD                       vertex_samplers_compare;
159     };
160     unsigned char                   loop_ctrl[MAX_CONST_I][3];
161 };
162
163 struct arb_vs_compiled_shader
164 {
165     struct arb_vs_compile_args      args;
166     GLuint                          prgId;
167     UINT                            int_consts[MAX_CONST_I];
168     char                            num_int_consts;
169     UINT                            pos_fixup;
170 };
171
172 struct recorded_instruction
173 {
174     struct wined3d_shader_instruction ins;
175     struct list entry;
176 };
177
178 struct shader_arb_ctx_priv
179 {
180     char addr_reg[20];
181     enum
182     {
183         /* plain GL_ARB_vertex_program or GL_ARB_fragment_program */
184         ARB,
185         /* GL_NV_vertex_progam2_option or GL_NV_fragment_program_option */
186         NV2,
187         /* GL_NV_vertex_program3 or GL_NV_fragment_program2 */
188         NV3
189     } target_version;
190
191     const struct arb_vs_compile_args    *cur_vs_args;
192     const struct arb_ps_compile_args    *cur_ps_args;
193     const struct arb_ps_compiled_shader *compiled_fprog;
194     const struct arb_vs_compiled_shader *compiled_vprog;
195     struct list                         control_frames;
196     struct list                         record;
197     BOOL                                recording;
198     BOOL                                muted;
199     unsigned int                        num_loops, loop_depth, num_ifcs;
200     int                                 aL;
201
202     /* For 3.0 vertex shaders */
203     const char                          *vs_output[MAX_REG_OUTPUT];
204     /* For 2.x and earlier vertex shaders */
205     const char                          *texcrd_output[8], *color_output[2], *fog_output;
206
207     /* 3.0 pshader input for compatibility with fixed function */
208     const char                          *ps_input[MAX_REG_INPUT];
209 };
210
211 struct ps_signature
212 {
213     struct wined3d_shader_signature_element *sig;
214     DWORD                               idx;
215     struct wine_rb_entry                entry;
216 };
217
218 struct arb_pshader_private {
219     struct arb_ps_compiled_shader   *gl_shaders;
220     UINT                            num_gl_shaders, shader_array_size;
221     BOOL                            has_signature_idx;
222     DWORD                           input_signature_idx;
223     DWORD                           clipplane_emulation;
224     BOOL                            clamp_consts;
225 };
226
227 struct arb_vshader_private {
228     struct arb_vs_compiled_shader   *gl_shaders;
229     UINT                            num_gl_shaders, shader_array_size;
230 };
231
232 struct shader_arb_priv
233 {
234     GLuint                  current_vprogram_id;
235     GLuint                  current_fprogram_id;
236     const struct arb_ps_compiled_shader *compiled_fprog;
237     const struct arb_vs_compiled_shader *compiled_vprog;
238     GLuint                  depth_blt_vprogram_id;
239     GLuint                  depth_blt_fprogram_id[tex_type_count];
240     BOOL                    use_arbfp_fixed_func;
241     struct wine_rb_tree     fragment_shaders;
242     BOOL                    last_ps_const_clamped;
243
244     struct wine_rb_tree     signature_tree;
245     DWORD ps_sig_number;
246 };
247
248 /********************************************************
249  * ARB_[vertex/fragment]_program helper functions follow
250  ********************************************************/
251
252 /** 
253  * Loads floating point constants into the currently set ARB_vertex/fragment_program.
254  * When constant_list == NULL, it will load all the constants.
255  *  
256  * @target_type should be either GL_VERTEX_PROGRAM_ARB (for vertex shaders)
257  *  or GL_FRAGMENT_PROGRAM_ARB (for pixel shaders)
258  */
259 /* GL locking is done by the caller */
260 static unsigned int shader_arb_load_constantsF(IWineD3DBaseShaderImpl* This, const WineD3D_GL_Info *gl_info,
261         GLuint target_type, unsigned int max_constants, const float *constants, char *dirty_consts)
262 {
263     local_constant* lconst;
264     DWORD i, j;
265     unsigned int ret;
266
267     if (TRACE_ON(d3d_shader)) {
268         for(i = 0; i < max_constants; i++) {
269             if(!dirty_consts[i]) continue;
270             TRACE_(d3d_constants)("Loading constants %i: %f, %f, %f, %f\n", i,
271                         constants[i * 4 + 0], constants[i * 4 + 1],
272                         constants[i * 4 + 2], constants[i * 4 + 3]);
273         }
274     }
275     /* In 1.X pixel shaders constants are implicitly clamped in the range [-1;1] */
276     if (target_type == GL_FRAGMENT_PROGRAM_ARB && This->baseShader.reg_maps.shader_version.major == 1)
277     {
278         float lcl_const[4];
279         for(i = 0; i < max_constants; i++) {
280             if(!dirty_consts[i]) continue;
281             dirty_consts[i] = 0;
282
283             j = 4 * i;
284             if(constants[j + 0] > 1.0) lcl_const[0] = 1.0;
285             else if(constants[j + 0] < -1.0) lcl_const[0] = -1.0;
286             else lcl_const[0] = constants[j + 0];
287
288             if(constants[j + 1] > 1.0) lcl_const[1] = 1.0;
289             else if(constants[j + 1] < -1.0) lcl_const[1] = -1.0;
290             else lcl_const[1] = constants[j + 1];
291
292             if(constants[j + 2] > 1.0) lcl_const[2] = 1.0;
293             else if(constants[j + 2] < -1.0) lcl_const[2] = -1.0;
294             else lcl_const[2] = constants[j + 2];
295
296             if(constants[j + 3] > 1.0) lcl_const[3] = 1.0;
297             else if(constants[j + 3] < -1.0) lcl_const[3] = -1.0;
298             else lcl_const[3] = constants[j + 3];
299
300             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, lcl_const));
301         }
302     } else {
303         if(GL_SUPPORT(EXT_GPU_PROGRAM_PARAMETERS)) {
304             /* TODO: Benchmark if we're better of with finding the dirty constants ourselves,
305              * or just reloading *all* constants at once
306              *
307             GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, 0, max_constants, constants));
308              */
309             for(i = 0; i < max_constants; i++) {
310                 if(!dirty_consts[i]) continue;
311
312                 /* Find the next block of dirty constants */
313                 dirty_consts[i] = 0;
314                 j = i;
315                 for(i++; (i < max_constants) && dirty_consts[i]; i++) {
316                     dirty_consts[i] = 0;
317                 }
318
319                 GL_EXTCALL(glProgramEnvParameters4fvEXT(target_type, j, i - j, constants + (j * 4)));
320             }
321         } else {
322             for(i = 0; i < max_constants; i++) {
323                 if(dirty_consts[i]) {
324                     dirty_consts[i] = 0;
325                     GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, i, constants + (i * 4)));
326                 }
327             }
328         }
329     }
330     checkGLcall("glProgramEnvParameter4fvARB()");
331
332     /* Load immediate constants */
333     if(This->baseShader.load_local_constsF) {
334         if (TRACE_ON(d3d_shader)) {
335             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
336                 GLfloat* values = (GLfloat*)lconst->value;
337                 TRACE_(d3d_constants)("Loading local constants %i: %f, %f, %f, %f\n", lconst->idx,
338                         values[0], values[1], values[2], values[3]);
339             }
340         }
341         /* Immediate constants are clamped for 1.X shaders at loading times */
342         ret = 0;
343         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
344             dirty_consts[lconst->idx] = 1; /* Dirtify so the non-immediate constant overwrites it next time */
345             ret = max(ret, lconst->idx + 1);
346             GL_EXTCALL(glProgramEnvParameter4fvARB(target_type, lconst->idx, (GLfloat*)lconst->value));
347         }
348         checkGLcall("glProgramEnvParameter4fvARB()");
349         return ret; /* The loaded immediate constants need reloading for the next shader */
350     } else {
351         return 0; /* No constants are dirty now */
352     }
353 }
354
355 /**
356  * Loads the texture dimensions for NP2 fixup into the currently set ARB_[vertex/fragment]_programs.
357  */
358 static void shader_arb_load_np2fixup_constants(
359     IWineD3DDevice* device,
360     char usePixelShader,
361     char useVertexShader) {
362     /* not implemented */
363 }
364
365 /* GL locking is done by the caller. */
366 static inline void shader_arb_ps_local_constants(IWineD3DDeviceImpl* deviceImpl)
367 {
368     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
369     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
370     unsigned char i;
371     struct shader_arb_priv *priv = deviceImpl->shader_priv;
372     const struct arb_ps_compiled_shader *gl_shader = priv->compiled_fprog;
373
374     for(i = 0; i < gl_shader->numbumpenvmatconsts; i++)
375     {
376         int texunit = gl_shader->bumpenvmatconst[i].texunit;
377
378         /* The state manager takes care that this function is always called if the bump env matrix changes */
379         const float *data = (const float *)&stateBlock->textureState[texunit][WINED3DTSS_BUMPENVMAT00];
380         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->bumpenvmatconst[i].const_num, data));
381
382         if (gl_shader->luminanceconst[i].const_num != WINED3D_CONST_NUM_UNUSED)
383         {
384             /* WINED3DTSS_BUMPENVLSCALE and WINED3DTSS_BUMPENVLOFFSET are next to each other.
385              * point gl to the scale, and load 4 floats. x = scale, y = offset, z and w are junk, we
386              * don't care about them. The pointers are valid for sure because the stateblock is bigger.
387              * (they're WINED3DTSS_TEXTURETRANSFORMFLAGS and WINED3DTSS_ADDRESSW, so most likely 0 or NaN
388             */
389             const float *scale = (const float *)&stateBlock->textureState[texunit][WINED3DTSS_BUMPENVLSCALE];
390             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->luminanceconst[i].const_num, scale));
391         }
392     }
393     checkGLcall("Load bumpmap consts\n");
394
395     if(gl_shader->ycorrection != WINED3D_CONST_NUM_UNUSED)
396     {
397         /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
398         * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
399         * ycorrection.z: 1.0
400         * ycorrection.w: 0.0
401         */
402         float val[4];
403         val[0] = deviceImpl->render_offscreen ? 0.0 : ((IWineD3DSurfaceImpl *) deviceImpl->render_targets[0])->currentDesc.Height;
404         val[1] = deviceImpl->render_offscreen ? 1.0 : -1.0;
405         val[2] = 1.0;
406         val[3] = 0.0;
407         GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->ycorrection, val));
408         checkGLcall("y correction loading\n");
409     }
410
411     if(gl_shader->num_int_consts == 0) return;
412
413     for(i = 0; i < MAX_CONST_I; i++)
414     {
415         if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
416         {
417             float val[4];
418             val[0] = stateBlock->pixelShaderConstantI[4 * i];
419             val[1] = stateBlock->pixelShaderConstantI[4 * i + 1];
420             val[2] = stateBlock->pixelShaderConstantI[4 * i + 2];
421             val[3] = -1.0;
422
423             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, gl_shader->int_consts[i], val));
424         }
425     }
426     checkGLcall("Load ps int consts\n");
427 }
428
429 /* GL locking is done by the caller. */
430 static inline void shader_arb_vs_local_constants(IWineD3DDeviceImpl* deviceImpl)
431 {
432     IWineD3DStateBlockImpl* stateBlock;
433     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
434     unsigned char i;
435     struct shader_arb_priv *priv = deviceImpl->shader_priv;
436     const struct arb_vs_compiled_shader *gl_shader = priv->compiled_vprog;
437
438     /* Upload the position fixup */
439     GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->pos_fixup, deviceImpl->posFixup));
440
441     if(gl_shader->num_int_consts == 0) return;
442
443     stateBlock = deviceImpl->stateBlock;
444
445     for(i = 0; i < MAX_CONST_I; i++)
446     {
447         if(gl_shader->int_consts[i] != WINED3D_CONST_NUM_UNUSED)
448         {
449             float val[4];
450             val[0] = stateBlock->vertexShaderConstantI[4 * i];
451             val[1] = stateBlock->vertexShaderConstantI[4 * i + 1];
452             val[2] = stateBlock->vertexShaderConstantI[4 * i + 2];
453             val[3] = -1.0;
454
455             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, gl_shader->int_consts[i], val));
456         }
457     }
458     checkGLcall("Load vs int consts\n");
459 }
460
461 /**
462  * Loads the app-supplied constants into the currently set ARB_[vertex/fragment]_programs.
463  * 
464  * We only support float constants in ARB at the moment, so don't 
465  * worry about the Integers or Booleans
466  */
467 /* GL locking is done by the caller (state handler) */
468 static void shader_arb_load_constants(
469     IWineD3DDevice* device,
470     char usePixelShader,
471     char useVertexShader) {
472    
473     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) device; 
474     IWineD3DStateBlockImpl* stateBlock = deviceImpl->stateBlock;
475     const WineD3D_GL_Info *gl_info = &deviceImpl->adapter->gl_info;
476
477     if (useVertexShader) {
478         IWineD3DBaseShaderImpl* vshader = (IWineD3DBaseShaderImpl*) stateBlock->vertexShader;
479
480         /* Load DirectX 9 float constants for vertex shader */
481         deviceImpl->highest_dirty_vs_const = shader_arb_load_constantsF(
482                 vshader, gl_info, GL_VERTEX_PROGRAM_ARB,
483                 deviceImpl->highest_dirty_vs_const,
484                 stateBlock->vertexShaderConstantF,
485                 deviceImpl->activeContext->vshader_const_dirty);
486
487         shader_arb_vs_local_constants(deviceImpl);
488     }
489
490     if (usePixelShader) {
491         IWineD3DBaseShaderImpl* pshader = (IWineD3DBaseShaderImpl*) stateBlock->pixelShader;
492
493         /* Load DirectX 9 float constants for pixel shader */
494         deviceImpl->highest_dirty_ps_const = shader_arb_load_constantsF(
495                 pshader, gl_info, GL_FRAGMENT_PROGRAM_ARB,
496                 deviceImpl->highest_dirty_ps_const,
497                 stateBlock->pixelShaderConstantF,
498                 deviceImpl->activeContext->pshader_const_dirty);
499         shader_arb_ps_local_constants(deviceImpl);
500     }
501 }
502
503 static void shader_arb_update_float_vertex_constants(IWineD3DDevice *iface, UINT start, UINT count)
504 {
505     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
506
507     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
508      * context. On a context switch the old context will be fully dirtified */
509     memset(This->activeContext->vshader_const_dirty + start, 1,
510             sizeof(*This->activeContext->vshader_const_dirty) * count);
511     This->highest_dirty_vs_const = max(This->highest_dirty_vs_const, start + count + 1);
512 }
513
514 static void shader_arb_update_float_pixel_constants(IWineD3DDevice *iface, UINT start, UINT count)
515 {
516     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
517
518     /* We don't want shader constant dirtification to be an O(contexts), so just dirtify the active
519      * context. On a context switch the old context will be fully dirtified */
520     memset(This->activeContext->pshader_const_dirty + start, 1,
521             sizeof(*This->activeContext->pshader_const_dirty) * count);
522     This->highest_dirty_ps_const = max(This->highest_dirty_ps_const, start + count + 1);
523 }
524
525 static DWORD *local_const_mapping(IWineD3DBaseShaderImpl *This)
526 {
527     DWORD *ret;
528     DWORD idx = 0;
529     const local_constant *lconst;
530
531     if(This->baseShader.load_local_constsF || list_empty(&This->baseShader.constantsF)) return NULL;
532
533     ret = HeapAlloc(GetProcessHeap(), 0, sizeof(DWORD) * This->baseShader.limits.constant_float);
534     if(!ret) {
535         ERR("Out of memory\n");
536         return NULL;
537     }
538
539     LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
540         ret[lconst->idx] = idx++;
541     }
542     return ret;
543 }
544
545 /* Generate the variable & register declarations for the ARB_vertex_program output target */
546 static DWORD shader_generate_arb_declarations(IWineD3DBaseShader *iface, const shader_reg_maps *reg_maps,
547         SHADER_BUFFER *buffer, const WineD3D_GL_Info *gl_info, DWORD *lconst_map, DWORD *num_clipplanes,
548         struct shader_arb_ctx_priv *ctx)
549 {
550     IWineD3DBaseShaderImpl* This = (IWineD3DBaseShaderImpl*) iface;
551     DWORD i, next_local = 0;
552     char pshader = shader_is_pshader_version(reg_maps->shader_version.type);
553     unsigned max_constantsF;
554     const local_constant *lconst;
555
556     /* In pixel shaders, all private constants are program local, we don't need anything
557      * from program.env. Thus we can advertise the full set of constants in pixel shaders.
558      * If we need a private constant the GL implementation will squeeze it in somewhere
559      *
560      * With vertex shaders we need the posFixup and on some GL implementations 4 helper
561      * immediate values. The posFixup is loaded using program.env for now, so always
562      * subtract one from the number of constants. If the shader uses indirect addressing,
563      * account for the helper const too because we have to declare all availabke d3d constants
564      * and don't know which are actually used.
565      */
566     if(pshader) {
567         max_constantsF = GL_LIMITS(pshader_constantsF);
568     } else {
569         if(This->baseShader.reg_maps.usesrelconstF) {
570             DWORD highest_constf = 0, clip_limit;
571             max_constantsF = GL_LIMITS(vshader_constantsF) - reserved_vs_const(iface, gl_info);
572             max_constantsF -= count_bits(This->baseShader.reg_maps.integer_constants);
573
574             for(i = 0; i < This->baseShader.limits.constant_float; i++)
575             {
576                 DWORD idx = i >> 5;
577                 DWORD shift = i & 0x1f;
578                 if(reg_maps->constf[idx] & (1 << shift)) highest_constf = i;
579             }
580
581             clip_limit = GL_LIMITS(clipplanes);
582             if(ctx->target_version == ARB) clip_limit = min(clip_limit, 4);
583             *num_clipplanes = min(clip_limit, max_constantsF - highest_constf - 1);
584             max_constantsF -= *num_clipplanes;
585             if(*num_clipplanes < clip_limit)
586             {
587                 WARN("Only %u clipplanes out of %u enabled\n", *num_clipplanes, GL_LIMITS(clipplanes));
588             }
589         }
590         else
591         {
592             if(ctx->target_version >= NV2) *num_clipplanes = GL_LIMITS(clipplanes);
593             else *num_clipplanes = min(GL_LIMITS(clipplanes), 4);
594             max_constantsF = GL_LIMITS(vshader_constantsF);
595         }
596     }
597
598     for(i = 0; i < This->baseShader.limits.temporary; i++) {
599         if (reg_maps->temporary[i])
600             shader_addline(buffer, "TEMP R%u;\n", i);
601     }
602
603     for (i = 0; i < This->baseShader.limits.address; i++) {
604         if (reg_maps->address[i])
605             shader_addline(buffer, "ADDRESS A%d;\n", i);
606     }
607
608     if(pshader && reg_maps->shader_version.major == 1 && reg_maps->shader_version.minor <= 3) {
609         for(i = 0; i < This->baseShader.limits.texcoord; i++) {
610             if (reg_maps->texcoord[i] && pshader)
611                 shader_addline(buffer,"TEMP T%u;\n", i);
612         }
613     }
614
615     /* Load local constants using the program-local space,
616      * this avoids reloading them each time the shader is used
617      */
618     if(lconst_map) {
619         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
620             shader_addline(buffer, "PARAM C%u = program.local[%u];\n", lconst->idx,
621                            lconst_map[lconst->idx]);
622             next_local = max(next_local, lconst_map[lconst->idx] + 1);
623         }
624     }
625
626     /* we use the array-based constants array if the local constants are marked for loading,
627      * because then we use indirect addressing, or when the local constant list is empty,
628      * because then we don't know if we're using indirect addressing or not. If we're hardcoding
629      * local constants do not declare the loaded constants as an array because ARB compilers usually
630      * do not optimize unused constants away
631      */
632     if(This->baseShader.reg_maps.usesrelconstF) {
633         /* Need to PARAM the environment parameters (constants) so we can use relative addressing */
634         shader_addline(buffer, "PARAM C[%d] = { program.env[0..%d] };\n",
635                     max_constantsF, max_constantsF - 1);
636     } else {
637         for(i = 0; i < max_constantsF; i++) {
638             DWORD idx, mask;
639             idx = i >> 5;
640             mask = 1 << (i & 0x1f);
641             if(!shader_constant_is_local(This, i) && (This->baseShader.reg_maps.constf[idx] & mask)) {
642                 shader_addline(buffer, "PARAM C%d = program.env[%d];\n",i, i);
643             }
644         }
645     }
646
647     return next_local;
648 }
649
/* Name of the coefficient operand for each of the 16 shift values. Only the
 * x2..x16 multipliers (1-4) and the d16..d2 dividers (12-15) have a real entry;
 * every other shift maps to "dummy". */
static const char * const shift_tab[] =
{
    /*  0: none */
    "dummy",
    /*  1 -  4: x2, x4, x8, x16 */
    "coefmul.x", "coefmul.y", "coefmul.z", "coefmul.w",
    /*  5 -  7: x32, x64, x128 */
    "dummy", "dummy", "dummy",
    /*  8 - 11: d256, d128, d64, d32 */
    "dummy", "dummy", "dummy", "dummy",
    /* 12 - 15: d16, d8, d4, d2 */
    "coefdiv.w", "coefdiv.z", "coefdiv.y", "coefdiv.x"
};
668
669 static void shader_arb_get_write_mask(const struct wined3d_shader_instruction *ins,
670         const struct wined3d_shader_dst_param *dst, char *write_mask)
671 {
672     char *ptr = write_mask;
673
674     if (dst->write_mask != WINED3DSP_WRITEMASK_ALL)
675     {
676         *ptr++ = '.';
677         if (dst->write_mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
678         if (dst->write_mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
679         if (dst->write_mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
680         if (dst->write_mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
681     }
682
683     *ptr = '\0';
684 }
685
686 static void shader_arb_get_swizzle(const struct wined3d_shader_src_param *param, BOOL fixup, char *swizzle_str)
687 {
688     /* For registers of type WINED3DDECLTYPE_D3DCOLOR, data is stored as "bgra",
689      * but addressed as "rgba". To fix this we need to swap the register's x
690      * and z components. */
691     const char *swizzle_chars = fixup ? "zyxw" : "xyzw";
692     char *ptr = swizzle_str;
693
694     /* swizzle bits fields: wwzzyyxx */
695     DWORD swizzle = param->swizzle;
696     DWORD swizzle_x = swizzle & 0x03;
697     DWORD swizzle_y = (swizzle >> 2) & 0x03;
698     DWORD swizzle_z = (swizzle >> 4) & 0x03;
699     DWORD swizzle_w = (swizzle >> 6) & 0x03;
700
701     /* If the swizzle is the default swizzle (ie, "xyzw"), we don't need to
702      * generate a swizzle string. Unless we need to our own swizzling. */
703     if (swizzle != WINED3DSP_NOSWIZZLE || fixup)
704     {
705         *ptr++ = '.';
706         if (swizzle_x == swizzle_y && swizzle_x == swizzle_z && swizzle_x == swizzle_w) {
707             *ptr++ = swizzle_chars[swizzle_x];
708         } else {
709             *ptr++ = swizzle_chars[swizzle_x];
710             *ptr++ = swizzle_chars[swizzle_y];
711             *ptr++ = swizzle_chars[swizzle_z];
712             *ptr++ = swizzle_chars[swizzle_w];
713         }
714     }
715
716     *ptr = '\0';
717 }
718
719 static void shader_arb_request_a0(const struct wined3d_shader_instruction *ins, const char *src)
720 {
721     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
722     SHADER_BUFFER *buffer = ins->ctx->buffer;
723
724     if(strcmp(priv->addr_reg, src) == 0) return;
725
726     strcpy(priv->addr_reg, src);
727     shader_addline(buffer, "ARL A0.x, %s;\n", src);
728 }
729
/* Forward declaration: shader_arb_get_register_name() below calls this to resolve
 * the relative address of a register before the definition is seen. */
static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
        const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr);
732
733 static void shader_arb_get_register_name(const struct wined3d_shader_instruction *ins,
734         const struct wined3d_shader_register *reg, char *register_name, BOOL *is_color)
735 {
736     /* oPos, oFog and oPts in D3D */
737     static const char * const rastout_reg_names[] = {"TMP_OUT", "result.fogcoord", "result.pointsize"};
738     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
739     BOOL pshader = shader_is_pshader_version(This->baseShader.reg_maps.shader_version.type);
740     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
741
742     *is_color = FALSE;
743
744     switch (reg->type)
745     {
746         case WINED3DSPR_TEMP:
747             sprintf(register_name, "R%u", reg->idx);
748             break;
749
750         case WINED3DSPR_INPUT:
751             if (pshader)
752             {
753                 if(This->baseShader.reg_maps.shader_version.major < 3)
754                 {
755                     if (reg->idx == 0) strcpy(register_name, "fragment.color.primary");
756                     else strcpy(register_name, "fragment.color.secondary");
757                 }
758                 else
759                 {
760                     if(reg->rel_addr)
761                     {
762                         char rel_reg[50];
763                         shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
764
765                         if(strcmp(rel_reg, "**aL_emul**") == 0)
766                         {
767                             DWORD idx = ctx->aL + reg->idx;
768                             if(idx < MAX_REG_INPUT)
769                             {
770                                 strcpy(register_name, ctx->ps_input[idx]);
771                             }
772                             else
773                             {
774                                 ERR("Pixel shader input register out of bounds: %u\n", idx);
775                                 sprintf(register_name, "out_of_bounds_%u", idx);
776                             }
777                         }
778                         else if(This->baseShader.reg_maps.input_registers & 0x0300)
779                         {
780                             /* There are two ways basically:
781                              *
782                              * 1) Use the unrolling code that is used for loop emulation and unroll the loop.
783                              *    That means trouble if the loop also contains a breakc or if the control values
784                              *    aren't local constants.
785                              * 2) Generate an if block that checks if aL.y < 8, == 8 or == 9 and selects the
786                              *    source dynamically. The trouble is that we cannot simply read aL.y because it
787                              *    is an ADDRESS register. We could however push it, load .zw with a value and use
788                              *    ADAC to load the condition code register and pop it again afterwards
789                              */
790                             FIXME("Relative input register addressing with more than 8 registers\n");
791
792                             /* This is better than nothing for now */
793                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
794                         }
795                         else if(ctx->cur_ps_args->super.vp_mode != vertexshader)
796                         {
797                             /* This is problematic because we'd have to consult the ctx->ps_input strings
798                              * for where to find the varying. Some may be "0.0", others can be texcoords or
799                              * colors. This needs either a pipeline replacement to make the vertex shader feed
800                              * proper varyings, or loop unrolling
801                              *
802                              * For now use the texcoords and hope for the best
803                              */
804                             FIXME("Non-vertex shader varying input with indirect addressing\n");
805                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
806                         }
807                         else
808                         {
809                             /* D3D supports indirect addressing only with aL in loop registers. The loop instruction
810                              * pulls GL_NV_fragment_program2 in
811                              */
812                             sprintf(register_name, "fragment.texcoord[%s + %u]", rel_reg, reg->idx);
813                         }
814                     }
815                     else
816                     {
817                         if(reg->idx < MAX_REG_INPUT)
818                         {
819                             strcpy(register_name, ctx->ps_input[reg->idx]);
820                         }
821                         else
822                         {
823                             ERR("Pixel shader input register out of bounds: %u\n", reg->idx);
824                             sprintf(register_name, "out_of_bounds_%u", reg->idx);
825                         }
826                     }
827                 }
828             }
829             else
830             {
831                 if (ctx->cur_vs_args->super.swizzle_map & (1 << reg->idx)) *is_color = TRUE;
832                 sprintf(register_name, "vertex.attrib[%u]", reg->idx);
833             }
834             break;
835
836         case WINED3DSPR_CONST:
837             if (!pshader && reg->rel_addr)
838             {
839                 BOOL aL = FALSE;
840                 char rel_reg[50];
841                 UINT rel_offset = ((IWineD3DVertexShaderImpl *)This)->rel_offset;
842                 if(This->baseShader.reg_maps.shader_version.major < 2) {
843                     sprintf(rel_reg, "A0.x");
844                 } else {
845                     shader_arb_get_src_param(ins, reg->rel_addr, 0, rel_reg);
846                     if(ctx->target_version == ARB) {
847                         if(strcmp(rel_reg, "**aL_emul**") == 0) {
848                             aL = TRUE;
849                         } else {
850                             shader_arb_request_a0(ins, rel_reg);
851                             sprintf(rel_reg, "A0.x");
852                         }
853                     }
854                 }
855                 if(aL)
856                     sprintf(register_name, "C[%u]", ctx->aL + reg->idx);
857                 else if (reg->idx >= rel_offset)
858                     sprintf(register_name, "C[%s + %u]", rel_reg, reg->idx - rel_offset);
859                 else
860                     sprintf(register_name, "C[%s - %u]", rel_reg, -reg->idx + rel_offset);
861             }
862             else
863             {
864                 if (This->baseShader.reg_maps.usesrelconstF)
865                     sprintf(register_name, "C[%u]", reg->idx);
866                 else
867                     sprintf(register_name, "C%u", reg->idx);
868             }
869             break;
870
871         case WINED3DSPR_TEXTURE: /* case WINED3DSPR_ADDR: */
872             if (pshader) {
873                 if(This->baseShader.reg_maps.shader_version.major == 1 &&
874                    This->baseShader.reg_maps.shader_version.minor <= 3) {
875                     /* In ps <= 1.3, Tx is a temporary register as destination to all instructions,
876                      * and as source to most instructions. For some instructions it is the texcoord
877                      * input. Those instructions know about the special use
878                      */
879                     sprintf(register_name, "T%u", reg->idx);
880                 } else {
881                     /* in ps 1.4 and 2.x Tx is always a (read-only) varying */
882                     sprintf(register_name, "fragment.texcoord[%u]", reg->idx);
883                 }
884             }
885             else
886             {
887                 if(This->baseShader.reg_maps.shader_version.major == 1 || ctx->target_version >= NV2)
888                 {
889                     sprintf(register_name, "A%u", reg->idx);
890                 }
891                 else
892                 {
893                     sprintf(register_name, "A%u_SHADOW", reg->idx);
894                 }
895             }
896             break;
897
898         case WINED3DSPR_COLOROUT:
899             if(ctx->cur_ps_args->super.srgb_correction && reg->idx == 0)
900             {
901                 strcpy(register_name, "TMP_COLOR");
902             }
903             else
904             {
905                 if(ctx->cur_ps_args->super.srgb_correction) FIXME("sRGB correction on higher render targets\n");
906                 if(This->baseShader.reg_maps.highest_render_target > 0)
907                 {
908                     sprintf(register_name, "result.color[%u]", reg->idx);
909                 }
910                 else
911                 {
912                     strcpy(register_name, "result.color");
913                 }
914             }
915             break;
916
917         case WINED3DSPR_RASTOUT:
918             if(reg->idx == 1) sprintf(register_name, "%s", ctx->fog_output);
919             else sprintf(register_name, "%s", rastout_reg_names[reg->idx]);
920             break;
921
922         case WINED3DSPR_DEPTHOUT:
923             strcpy(register_name, "result.depth");
924             break;
925
926         case WINED3DSPR_ATTROUT:
927         /* case WINED3DSPR_OUTPUT: */
928             if (pshader) sprintf(register_name, "oD[%u]", reg->idx);
929             else strcpy(register_name, ctx->color_output[reg->idx]);
930             break;
931
932         case WINED3DSPR_TEXCRDOUT:
933             if (pshader)
934             {
935                 sprintf(register_name, "oT[%u]", reg->idx);
936             }
937             else
938             {
939                 if(This->baseShader.reg_maps.shader_version.major < 3)
940                 {
941                     strcpy(register_name, ctx->texcrd_output[reg->idx]);
942                 }
943                 else
944                 {
945                     strcpy(register_name, ctx->vs_output[reg->idx]);
946                 }
947             }
948             break;
949
950         case WINED3DSPR_LOOP:
951             if(ctx->target_version >= NV2)
952             {
953                 /* Pshader has an implicitly declared loop index counter A0.x that cannot be renamed */
954                 if(pshader) sprintf(register_name, "A0.x");
955                 else sprintf(register_name, "aL.y");
956             }
957             else
958             {
959                 /* Unfortunately this code cannot return the value of ctx->aL here. An immediate value
960                  * would be valid, but if aL is used for indexing(its only use), there's likely an offset,
961                  * thus the result would be something like C[15 + 30], which is not valid in the ARB program
962                  * grammar. So return a marker for the emulated aL and intercept it in constant and varying
963                  * indexing
964                  */
965                 sprintf(register_name, "**aL_emul**");
966             }
967
968             break;
969
970         case WINED3DSPR_CONSTINT:
971             sprintf(register_name, "I%u", reg->idx);
972             break;
973
974         case WINED3DSPR_MISCTYPE:
975             if(reg->idx == 0)
976             {
977                 sprintf(register_name, "vpos");
978             }
979             else if(reg->idx == 1)
980             {
981                 sprintf(register_name, "fragment.facing.x");
982             }
983             else
984             {
985                 FIXME("Unknown MISCTYPE register index %u\n", reg->idx);
986             }
987             break;
988
989         default:
990             FIXME("Unhandled register type %#x[%u]\n", reg->type, reg->idx);
991             sprintf(register_name, "unrecognized_register[%u]", reg->idx);
992             break;
993     }
994 }
995
996 static void shader_arb_get_dst_param(const struct wined3d_shader_instruction *ins,
997         const struct wined3d_shader_dst_param *wined3d_dst, char *str)
998 {
999     char register_name[255];
1000     char write_mask[6];
1001     BOOL is_color;
1002
1003     shader_arb_get_register_name(ins, &wined3d_dst->reg, register_name, &is_color);
1004     strcpy(str, register_name);
1005
1006     shader_arb_get_write_mask(ins, wined3d_dst, write_mask);
1007     strcat(str, write_mask);
1008 }
1009
1010 static const char *shader_arb_get_fixup_swizzle(enum fixup_channel_source channel_source)
1011 {
1012     switch(channel_source)
1013     {
1014         case CHANNEL_SOURCE_ZERO: return "0";
1015         case CHANNEL_SOURCE_ONE: return "1";
1016         case CHANNEL_SOURCE_X: return "x";
1017         case CHANNEL_SOURCE_Y: return "y";
1018         case CHANNEL_SOURCE_Z: return "z";
1019         case CHANNEL_SOURCE_W: return "w";
1020         default:
1021             FIXME("Unhandled channel source %#x\n", channel_source);
1022             return "undefined";
1023     }
1024 }
1025
1026 static void gen_color_correction(SHADER_BUFFER *buffer, const char *reg, DWORD dst_mask,
1027                                  const char *one, const char *two, struct color_fixup_desc fixup)
1028 {
1029     DWORD mask;
1030
1031     if (is_yuv_fixup(fixup))
1032     {
1033         enum yuv_fixup yuv_fixup = get_yuv_fixup(fixup);
1034         FIXME("YUV fixup (%#x) not supported\n", yuv_fixup);
1035         return;
1036     }
1037
1038     mask = 0;
1039     if (fixup.x_source != CHANNEL_SOURCE_X) mask |= WINED3DSP_WRITEMASK_0;
1040     if (fixup.y_source != CHANNEL_SOURCE_Y) mask |= WINED3DSP_WRITEMASK_1;
1041     if (fixup.z_source != CHANNEL_SOURCE_Z) mask |= WINED3DSP_WRITEMASK_2;
1042     if (fixup.w_source != CHANNEL_SOURCE_W) mask |= WINED3DSP_WRITEMASK_3;
1043     mask &= dst_mask;
1044
1045     if (mask)
1046     {
1047         shader_addline(buffer, "SWZ %s, %s, %s, %s, %s, %s;\n", reg, reg,
1048                 shader_arb_get_fixup_swizzle(fixup.x_source), shader_arb_get_fixup_swizzle(fixup.y_source),
1049                 shader_arb_get_fixup_swizzle(fixup.z_source), shader_arb_get_fixup_swizzle(fixup.w_source));
1050     }
1051
1052     mask = 0;
1053     if (fixup.x_sign_fixup) mask |= WINED3DSP_WRITEMASK_0;
1054     if (fixup.y_sign_fixup) mask |= WINED3DSP_WRITEMASK_1;
1055     if (fixup.z_sign_fixup) mask |= WINED3DSP_WRITEMASK_2;
1056     if (fixup.w_sign_fixup) mask |= WINED3DSP_WRITEMASK_3;
1057     mask &= dst_mask;
1058
1059     if (mask)
1060     {
1061         char reg_mask[6];
1062         char *ptr = reg_mask;
1063
1064         if (mask != WINED3DSP_WRITEMASK_ALL)
1065         {
1066             *ptr++ = '.';
1067             if (mask & WINED3DSP_WRITEMASK_0) *ptr++ = 'x';
1068             if (mask & WINED3DSP_WRITEMASK_1) *ptr++ = 'y';
1069             if (mask & WINED3DSP_WRITEMASK_2) *ptr++ = 'z';
1070             if (mask & WINED3DSP_WRITEMASK_3) *ptr++ = 'w';
1071         }
1072         *ptr = '\0';
1073
1074         shader_addline(buffer, "MAD %s%s, %s, %s, -%s;\n", reg, reg_mask, reg, two, one);
1075     }
1076 }
1077
1078 static const char *shader_arb_get_modifier(const struct wined3d_shader_instruction *ins)
1079 {
1080     DWORD mod;
1081     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1082     if (!ins->dst_count) return "";
1083
1084     mod = ins->dst[0].modifiers;
1085
1086     /* Silently ignore PARTIALPRECISION if its not supported */
1087     if(priv->target_version == ARB) mod &= ~WINED3DSPDM_PARTIALPRECISION;
1088
1089     if(mod & WINED3DSPDM_MSAMPCENTROID)
1090     {
1091         FIXME("Unhandled modifier WINED3DSPDM_MSAMPCENTROID\n");
1092         mod &= ~WINED3DSPDM_MSAMPCENTROID;
1093     }
1094
1095     switch(mod)
1096     {
1097         case WINED3DSPDM_SATURATE | WINED3DSPDM_PARTIALPRECISION:
1098             return "H_SAT";
1099
1100         case WINED3DSPDM_SATURATE:
1101             return "_SAT";
1102
1103         case WINED3DSPDM_PARTIALPRECISION:
1104             return "H";
1105
1106         case 0:
1107             return "";
1108
1109         default:
1110             FIXME("Unknown modifiers 0x%08x\n", mod);
1111             return "";
1112     }
1113 }
1114
1115 #define TEX_PROJ        0x1
1116 #define TEX_BIAS        0x2
1117 #define TEX_LOD         0x4
1118 #define TEX_DERIV       0x10
1119
1120 static void shader_hw_sample(const struct wined3d_shader_instruction *ins, DWORD sampler_idx,
1121         const char *dst_str, const char *coord_reg, WORD flags, const char *dsx, const char *dsy)
1122 {
1123     SHADER_BUFFER *buffer = ins->ctx->buffer;
1124     DWORD sampler_type = ins->ctx->reg_maps->sampler_type[sampler_idx];
1125     const char *tex_type;
1126     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1127     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) This->baseShader.device;
1128     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
1129     const char *mod;
1130     BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
1131
1132     /* D3D vertex shader sampler IDs are vertex samplers(0-3), not global d3d samplers */
1133     if(!pshader) sampler_idx += MAX_FRAGMENT_SAMPLERS;
1134
1135     switch(sampler_type) {
1136         case WINED3DSTT_1D:
1137             tex_type = "1D";
1138             break;
1139
1140         case WINED3DSTT_2D:
1141             if(device->stateBlock->textures[sampler_idx] &&
1142                IWineD3DBaseTexture_GetTextureDimensions(device->stateBlock->textures[sampler_idx]) == GL_TEXTURE_RECTANGLE_ARB) {
1143                 tex_type = "RECT";
1144             } else {
1145                 tex_type = "2D";
1146             }
1147             if (shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type))
1148             {
1149                 if(priv->cur_ps_args->super.np2_fixup & (1 << sampler_idx))
1150                 {
1151                     FIXME("NP2 texcoord fixup is currently not implemented in ARB mode (use GLSL instead).\n");
1152                 }
1153             }
1154             break;
1155
1156         case WINED3DSTT_VOLUME:
1157             tex_type = "3D";
1158             break;
1159
1160         case WINED3DSTT_CUBE:
1161             tex_type = "CUBE";
1162             break;
1163
1164         default:
1165             ERR("Unexpected texture type %d\n", sampler_type);
1166             tex_type = "";
1167     }
1168
1169     /* TEX, TXL, TXD and TXP do not support the "H" modifier,
1170      * so don't use shader_arb_get_modifier
1171      */
1172     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) mod = "_SAT";
1173     else mod = "";
1174
1175     /* Fragment samplers always have indentity mapping */
1176     if(sampler_idx >= MAX_FRAGMENT_SAMPLERS)
1177     {
1178         sampler_idx = priv->cur_vs_args->vertex_samplers[sampler_idx - MAX_FRAGMENT_SAMPLERS];
1179     }
1180
1181     if (flags & TEX_DERIV)
1182     {
1183         if(flags & TEX_PROJ) FIXME("Projected texture sampling with custom derivates\n");
1184         if(flags & TEX_BIAS) FIXME("Biased texture sampling with custom derivates\n");
1185         shader_addline(buffer, "TXD%s %s, %s, %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg,
1186                        dsx, dsy,sampler_idx, tex_type);
1187     }
1188     else if(flags & TEX_LOD)
1189     {
1190         if(flags & TEX_PROJ) FIXME("Projected texture sampling with explicit lod\n");
1191         if(flags & TEX_BIAS) FIXME("Biased texture sampling with explicit lod\n");
1192         shader_addline(buffer, "TXL%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg,
1193                        sampler_idx, tex_type);
1194     }
1195     else if (flags & TEX_BIAS)
1196     {
1197         /* Shouldn't be possible, but let's check for it */
1198         if(flags & TEX_PROJ) FIXME("Biased and Projected texture sampling\n");
1199         /* TXB takes the 4th component of the source vector automatically, as d3d. Nothing more to do */
1200         shader_addline(buffer, "TXB%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1201     }
1202     else if (flags & TEX_PROJ)
1203     {
1204         shader_addline(buffer, "TXP%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1205     }
1206     else
1207     {
1208         shader_addline(buffer, "TEX%s %s, %s, texture[%u], %s;\n", mod, dst_str, coord_reg, sampler_idx, tex_type);
1209     }
1210
1211     if (pshader)
1212     {
1213         gen_color_correction(buffer, dst_str, ins->dst[0].write_mask,
1214                 "one", "coefmul.x", priv->cur_ps_args->super.color_fixup[sampler_idx]);
1215     }
1216 }
1217
1218 static void shader_arb_get_src_param(const struct wined3d_shader_instruction *ins,
1219         const struct wined3d_shader_src_param *src, unsigned int tmpreg, char *outregstr)
1220 {
1221     /* Generate a line that does the input modifier computation and return the input register to use */
1222     BOOL is_color = FALSE;
1223     char regstr[256];
1224     char swzstr[20];
1225     int insert_line;
1226     SHADER_BUFFER *buffer = ins->ctx->buffer;
1227     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1228
1229     /* Assume a new line will be added */
1230     insert_line = 1;
1231
1232     /* Get register name */
1233     shader_arb_get_register_name(ins, &src->reg, regstr, &is_color);
1234     shader_arb_get_swizzle(src, is_color, swzstr);
1235
1236     switch (src->modifiers)
1237     {
1238     case WINED3DSPSM_NONE:
1239         sprintf(outregstr, "%s%s", regstr, swzstr);
1240         insert_line = 0;
1241         break;
1242     case WINED3DSPSM_NEG:
1243         sprintf(outregstr, "-%s%s", regstr, swzstr);
1244         insert_line = 0;
1245         break;
1246     case WINED3DSPSM_BIAS:
1247         shader_addline(buffer, "ADD T%c, %s, -coefdiv.x;\n", 'A' + tmpreg, regstr);
1248         break;
1249     case WINED3DSPSM_BIASNEG:
1250         shader_addline(buffer, "ADD T%c, -%s, coefdiv.x;\n", 'A' + tmpreg, regstr);
1251         break;
1252     case WINED3DSPSM_SIGN:
1253         shader_addline(buffer, "MAD T%c, %s, coefmul.x, -one.x;\n", 'A' + tmpreg, regstr);
1254         break;
1255     case WINED3DSPSM_SIGNNEG:
1256         shader_addline(buffer, "MAD T%c, %s, -coefmul.x, one.x;\n", 'A' + tmpreg, regstr);
1257         break;
1258     case WINED3DSPSM_COMP:
1259         shader_addline(buffer, "SUB T%c, one.x, %s;\n", 'A' + tmpreg, regstr);
1260         break;
1261     case WINED3DSPSM_X2:
1262         shader_addline(buffer, "ADD T%c, %s, %s;\n", 'A' + tmpreg, regstr, regstr);
1263         break;
1264     case WINED3DSPSM_X2NEG:
1265         shader_addline(buffer, "ADD T%c, -%s, -%s;\n", 'A' + tmpreg, regstr, regstr);
1266         break;
1267     case WINED3DSPSM_DZ:
1268         shader_addline(buffer, "RCP T%c, %s.z;\n", 'A' + tmpreg, regstr);
1269         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1270         break;
1271     case WINED3DSPSM_DW:
1272         shader_addline(buffer, "RCP T%c, %s.w;\n", 'A' + tmpreg, regstr);
1273         shader_addline(buffer, "MUL T%c, %s, T%c;\n", 'A' + tmpreg, regstr, 'A' + tmpreg);
1274         break;
1275     case WINED3DSPSM_ABS:
1276         if(ctx->target_version >= NV2) {
1277             sprintf(outregstr, "|%s%s|", regstr, swzstr);
1278             insert_line = 0;
1279         } else {
1280             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
1281         }
1282         break;
1283     case WINED3DSPSM_ABSNEG:
1284         if(ctx->target_version >= NV2) {
1285             sprintf(outregstr, "-|%s%s|", regstr, swzstr);
1286         } else {
1287             shader_addline(buffer, "ABS T%c, %s;\n", 'A' + tmpreg, regstr);
1288             sprintf(outregstr, "-T%c%s", 'A' + tmpreg, swzstr);
1289         }
1290         insert_line = 0;
1291         break;
1292     default:
1293         sprintf(outregstr, "%s%s", regstr, swzstr);
1294         insert_line = 0;
1295     }
1296
1297     /* Return modified or original register, with swizzle */
1298     if (insert_line)
1299         sprintf(outregstr, "T%c%s", 'A' + tmpreg, swzstr);
1300 }
1301
1302 static void pshader_hw_bem(const struct wined3d_shader_instruction *ins)
1303 {
1304     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1305     SHADER_BUFFER *buffer = ins->ctx->buffer;
1306     char dst_name[50];
1307     char src_name[2][50];
1308     DWORD sampler_code = dst->reg.idx;
1309
1310     shader_arb_get_dst_param(ins, dst, dst_name);
1311
1312     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1313      *
1314      * Keep in mind that src_name[1] can be "TB" and src_name[0] can be "TA" because modifiers like _x2 are valid
1315      * with bem. So delay loading the first parameter until after the perturbation calculation which needs two
1316      * temps is done.
1317      */
1318     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1319     shader_addline(buffer, "SWZ TA, bumpenvmat%d, x, z, 0, 0;\n", sampler_code);
1320     shader_addline(buffer, "DP3 TC.r, TA, %s;\n", src_name[1]);
1321     shader_addline(buffer, "SWZ TA, bumpenvmat%d, y, w, 0, 0;\n", sampler_code);
1322     shader_addline(buffer, "DP3 TC.g, TA, %s;\n", src_name[1]);
1323
1324     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1325     shader_addline(buffer, "ADD %s, %s, TC;\n", dst_name, src_name[0]);
1326 }
1327
1328 static void pshader_hw_cnd(const struct wined3d_shader_instruction *ins)
1329 {
1330     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1331     SHADER_BUFFER *buffer = ins->ctx->buffer;
1332     char dst_name[50];
1333     char src_name[3][50];
1334     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1335             ins->ctx->reg_maps->shader_version.minor);
1336     BOOL is_color;
1337
1338     shader_arb_get_dst_param(ins, dst, dst_name);
1339     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1340
1341     /* The coissue flag changes the semantic of the cnd instruction in <= 1.3 shaders */
1342     if (shader_version <= WINED3D_SHADER_VERSION(1, 3) && ins->coissue)
1343     {
1344         shader_addline(buffer, "MOV%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[1]);
1345     } else {
1346         shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1347         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1348         shader_addline(buffer, "ADD TA, -%s, coefdiv.x;\n", src_name[0]);
1349         /* No modifiers supported on CMP */
1350         shader_addline(buffer, "CMP %s, TA, %s, %s;\n", dst_name, src_name[1], src_name[2]);
1351
1352         /* _SAT on CMP doesn't make much sense, but it is not a pure NOP */
1353         if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
1354         {
1355             shader_arb_get_register_name(ins, &dst->reg, src_name[0], &is_color);
1356             shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, dst_name);
1357         }
1358     }
1359 }
1360
1361 static void pshader_hw_cmp(const struct wined3d_shader_instruction *ins)
1362 {
1363     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1364     SHADER_BUFFER *buffer = ins->ctx->buffer;
1365     char dst_name[50];
1366     char src_name[3][50];
1367     BOOL is_color;
1368
1369     shader_arb_get_dst_param(ins, dst, dst_name);
1370
1371     /* Generate input register names (with modifiers) */
1372     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1373     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1374     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1375
1376     /* No modifiers are supported on CMP */
1377     shader_addline(buffer, "CMP %s, %s, %s, %s;\n", dst_name,
1378                    src_name[0], src_name[2], src_name[1]);
1379
1380     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
1381     {
1382         shader_arb_get_register_name(ins, &dst->reg, src_name[0], &is_color);
1383         shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name[0]);
1384     }
1385 }
1386
1387 /** Process the WINED3DSIO_DP2ADD instruction in ARB.
1388  * dst = dot2(src0, src1) + src2 */
1389 static void pshader_hw_dp2add(const struct wined3d_shader_instruction *ins)
1390 {
1391     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1392     SHADER_BUFFER *buffer = ins->ctx->buffer;
1393     char dst_name[50];
1394     char src_name[3][50];
1395     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1396
1397     shader_arb_get_dst_param(ins, dst, dst_name);
1398     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
1399     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
1400
1401     if(ctx->target_version >= NV3)
1402     {
1403         /* GL_NV_fragment_program2 has a 1:1 matching instruction */
1404         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1405         shader_addline(buffer, "DP2A%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1406                        dst_name, src_name[0], src_name[1], src_name[2]);
1407     }
1408     else if(ctx->target_version >= NV2)
1409     {
1410         /* dst.x = src2.?, src0.x, src1.x + src0.y * src1.y
1411          * dst.y = src2.?, src0.x, src1.z + src0.y * src1.w
1412          * dst.z = src2.?, src0.x, src1.x + src0.y * src1.y
1413          * dst.z = src2.?, src0.x, src1.z + src0.y * src1.w
1414          *
1415          * Make sure that src1.zw = src1.xy, then we get a classic dp2add
1416          *
1417          * .xyxy and other swizzles that we could get with this are not valid in
1418          * plain ARBfp, but luckily the NV extension grammar lifts this limitation.
1419          */
1420         struct wined3d_shader_src_param tmp_param = ins->src[1];
1421         DWORD swizzle = tmp_param.swizzle & 0xf; /* Selects .xy */
1422         tmp_param.swizzle = swizzle | (swizzle << 4); /* Creates .xyxy */
1423
1424         shader_arb_get_src_param(ins, &tmp_param, 1, src_name[1]);
1425
1426         shader_addline(buffer, "X2D%s %s, %s, %s, %s;\n", shader_arb_get_modifier(ins),
1427                        dst_name, src_name[2], src_name[0], src_name[1]);
1428     }
1429     else
1430     {
1431         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
1432         /* Emulate a DP2 with a DP3 and 0.0. Don't use the dest as temp register, it could be src[1] or src[2]
1433         * src_name[0] can be TA, but TA is a private temp for modifiers, so it is save to overwrite
1434         */
1435         shader_addline(buffer, "MOV TA, %s;\n", src_name[0]);
1436         shader_addline(buffer, "MOV TA.z, 0.0;\n");
1437         shader_addline(buffer, "DP3 TA, TA, %s;\n", src_name[1]);
1438         shader_addline(buffer, "ADD%s %s, TA, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name[2]);
1439     }
1440 }
1441
1442 /* Map the opcode 1-to-1 to the GL code */
1443 static void shader_hw_map2gl(const struct wined3d_shader_instruction *ins)
1444 {
1445     SHADER_BUFFER *buffer = ins->ctx->buffer;
1446     const char *instruction;
1447     char arguments[256], dst_str[50];
1448     unsigned int i;
1449     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1450
1451     switch (ins->handler_idx)
1452     {
1453         case WINED3DSIH_ABS: instruction = "ABS"; break;
1454         case WINED3DSIH_ADD: instruction = "ADD"; break;
1455         case WINED3DSIH_CRS: instruction = "XPD"; break;
1456         case WINED3DSIH_DP3: instruction = "DP3"; break;
1457         case WINED3DSIH_DP4: instruction = "DP4"; break;
1458         case WINED3DSIH_DST: instruction = "DST"; break;
1459         case WINED3DSIH_EXP: instruction = "EX2"; break;
1460         case WINED3DSIH_EXPP: instruction = "EXP"; break;
1461         case WINED3DSIH_FRC: instruction = "FRC"; break;
1462         case WINED3DSIH_LIT: instruction = "LIT"; break;
1463         case WINED3DSIH_LOG: instruction = "LG2"; break;
1464         case WINED3DSIH_LOGP: instruction = "LOG"; break;
1465         case WINED3DSIH_LRP: instruction = "LRP"; break;
1466         case WINED3DSIH_MAD: instruction = "MAD"; break;
1467         case WINED3DSIH_MAX: instruction = "MAX"; break;
1468         case WINED3DSIH_MIN: instruction = "MIN"; break;
1469         case WINED3DSIH_MOV: instruction = "MOV"; break;
1470         case WINED3DSIH_MUL: instruction = "MUL"; break;
1471         case WINED3DSIH_POW: instruction = "POW"; break;
1472         case WINED3DSIH_SGE: instruction = "SGE"; break;
1473         case WINED3DSIH_SLT: instruction = "SLT"; break;
1474         case WINED3DSIH_SUB: instruction = "SUB"; break;
1475         case WINED3DSIH_MOVA:instruction = "ARR"; break;
1476         case WINED3DSIH_SGN: instruction = "SSG"; break;
1477         case WINED3DSIH_DSX: instruction = "DDX"; break;
1478         default: instruction = "";
1479             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
1480             break;
1481     }
1482
1483     /* Note that shader_arb_add_dst_param() adds spaces. */
1484     arguments[0] = '\0';
1485     shader_arb_get_dst_param(ins, dst, dst_str);
1486     for (i = 0; i < ins->src_count; ++i)
1487     {
1488         char operand[100];
1489         strcat(arguments, ", ");
1490         shader_arb_get_src_param(ins, &ins->src[i], i, operand);
1491         strcat(arguments, operand);
1492     }
1493     shader_addline(buffer, "%s%s %s%s;\n", instruction, shader_arb_get_modifier(ins), dst_str, arguments);
1494 }
1495
1496 static void shader_hw_nop(const struct wined3d_shader_instruction *ins)
1497 {
1498     SHADER_BUFFER *buffer = ins->ctx->buffer;
1499     shader_addline(buffer, "NOP;\n");
1500 }
1501
1502 static void shader_hw_mov(const struct wined3d_shader_instruction *ins)
1503 {
1504     IWineD3DBaseShaderImpl *shader = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
1505     BOOL pshader = shader_is_pshader_version(shader->baseShader.reg_maps.shader_version.type);
1506     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
1507
1508     SHADER_BUFFER *buffer = ins->ctx->buffer;
1509     char src0_param[256];
1510
1511     if(ins->handler_idx == WINED3DSIH_MOVA) {
1512         struct wined3d_shader_src_param tmp_src = ins->src[0];
1513         char write_mask[6];
1514
1515         if(ctx->target_version >= NV2) {
1516             shader_hw_map2gl(ins);
1517             return;
1518         }
1519         tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1520         shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1521         shader_arb_get_write_mask(ins, &ins->dst[0], write_mask);
1522
1523         /* This implements the mova formula used in GLSL. The first two instructions
1524          * prepare the sign() part. Note that it is fine to have my_sign(0.0) = 1.0
1525          * in this case:
1526          * mova A0.x, 0.0
1527          *
1528          * A0.x = arl(floor(abs(0.0) + 0.5) * 1.0) = floor(0.5) = 0.0 since arl does a floor
1529          *
1530          * The ARL is performed when A0 is used - the requested component is read from A0_SHADOW into
1531          * A0.x. We can use the overwritten component of A0_shadow as temporary storage for the sign.
1532          */
1533         shader_addline(buffer, "SGE A0_SHADOW%s, %s, mova_const.y;\n", write_mask, src0_param);
1534         shader_addline(buffer, "MAD A0_SHADOW%s, A0_SHADOW, mova_const.z, -mova_const.w;\n", write_mask);
1535
1536         shader_addline(buffer, "ABS TA%s, %s;\n", write_mask, src0_param);
1537         shader_addline(buffer, "ADD TA%s, TA, mova_const.x;\n", write_mask);
1538         shader_addline(buffer, "FLR TA%s, TA;\n", write_mask);
1539         if (((IWineD3DVertexShaderImpl *)shader)->rel_offset)
1540         {
1541             shader_addline(buffer, "ADD TA%s, TA, helper_const.z;\n", write_mask);
1542         }
1543         shader_addline(buffer, "MUL A0_SHADOW%s, TA, A0_SHADOW;\n", write_mask);
1544
1545         ((struct shader_arb_ctx_priv *)ins->ctx->backend_data)->addr_reg[0] = '\0';
1546     } else if (ins->ctx->reg_maps->shader_version.major == 1
1547           && !shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)
1548           && ins->dst[0].reg.type == WINED3DSPR_ADDR)
1549     {
1550         src0_param[0] = '\0';
1551         if (((IWineD3DVertexShaderImpl *)shader)->rel_offset)
1552         {
1553             shader_arb_get_src_param(ins, &ins->src[0], 0, src0_param);
1554             shader_addline(buffer, "ADD TA.x, %s, helper_const.z;\n", src0_param);
1555             shader_addline(buffer, "ARL A0.x, TA.x;\n");
1556         }
1557         else
1558         {
1559             /* Apple's ARB_vertex_program implementation does not accept an ARL source argument
1560              * with more than one component. Thus replicate the first source argument over all
1561              * 4 components. For example, .xyzw -> .x (or better: .xxxx), .zwxy -> .z, etc) */
1562             struct wined3d_shader_src_param tmp_src = ins->src[0];
1563             tmp_src.swizzle = (tmp_src.swizzle & 0x3) * 0x55;
1564             shader_arb_get_src_param(ins, &tmp_src, 0, src0_param);
1565             shader_addline(buffer, "ARL A0.x, %s;\n", src0_param);
1566         }
1567     }
1568     else if(ins->dst[0].reg.type == WINED3DSPR_COLOROUT && ins->dst[0].reg.idx == 0 && pshader)
1569     {
1570         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) shader;
1571         if(ctx->cur_ps_args->super.srgb_correction && ps->color0_mov)
1572         {
1573             shader_addline(buffer, "#mov handled in srgb write code\n");
1574             return;
1575         }
1576         shader_hw_map2gl(ins);
1577     }
1578     else
1579     {
1580         shader_hw_map2gl(ins);
1581     }
1582 }
1583
1584 static void pshader_hw_texkill(const struct wined3d_shader_instruction *ins)
1585 {
1586     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1587     SHADER_BUFFER *buffer = ins->ctx->buffer;
1588     char reg_dest[40];
1589
1590     /* No swizzles are allowed in d3d's texkill. PS 1.x ignores the 4th component as documented,
1591      * but >= 2.0 honors it(undocumented, but tested by the d3d9 testsuit)
1592      */
1593     shader_arb_get_dst_param(ins, dst, reg_dest);
1594
1595     if (ins->ctx->reg_maps->shader_version.major >= 2)
1596     {
1597         /* The arb backend doesn't claim ps 2.0 support, but try to eat what the app feeds to us */
1598         shader_arb_get_dst_param(ins, dst, reg_dest);
1599         shader_addline(buffer, "KIL %s;\n", reg_dest);
1600     } else {
1601         /* ARB fp doesn't like swizzles on the parameter of the KIL instruction. To mask the 4th component,
1602          * copy the register into our general purpose TMP variable, overwrite .w and pass TMP to KIL
1603          *
1604          * ps_1_3 shaders use the texcoord incarnation of the Tx register. ps_1_4 shaders can use the same,
1605          * or pass in any temporary register(in shader phase 2)
1606          */
1607         if(ins->ctx->reg_maps->shader_version.minor <= 3) {
1608             sprintf(reg_dest, "fragment.texcoord[%u]", dst->reg.idx);
1609         } else {
1610             shader_arb_get_dst_param(ins, dst, reg_dest);
1611         }
1612         shader_addline(buffer, "SWZ TA, %s, x, y, z, 1;\n", reg_dest);
1613         shader_addline(buffer, "KIL TA;\n");
1614     }
1615 }
1616
1617 static void pshader_hw_tex(const struct wined3d_shader_instruction *ins)
1618 {
1619     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1620     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1621     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1622     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1623             ins->ctx->reg_maps->shader_version.minor);
1624     struct wined3d_shader_src_param src;
1625
1626     char reg_dest[40];
1627     char reg_coord[40];
1628     DWORD reg_sampler_code;
1629     DWORD myflags = 0;
1630
1631     /* All versions have a destination register */
1632     shader_arb_get_dst_param(ins, dst, reg_dest);
1633
1634     /* 1.0-1.4: Use destination register number as texture code.
1635        2.0+: Use provided sampler number as texure code. */
1636     if (shader_version < WINED3D_SHADER_VERSION(2,0))
1637         reg_sampler_code = dst->reg.idx;
1638     else
1639         reg_sampler_code = ins->src[1].reg.idx;
1640
1641     /* 1.0-1.3: Use the texcoord varying.
1642        1.4+: Use provided coordinate source register. */
1643     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1644         sprintf(reg_coord, "fragment.texcoord[%u]", reg_sampler_code);
1645     else {
1646         /* TEX is the only instruction that can handle DW and DZ natively */
1647         src = ins->src[0];
1648         if(src.modifiers == WINED3DSPSM_DW) src.modifiers = WINED3DSPSM_NONE;
1649         if(src.modifiers == WINED3DSPSM_DZ) src.modifiers = WINED3DSPSM_NONE;
1650         shader_arb_get_src_param(ins, &src, 0, reg_coord);
1651     }
1652
1653     /* projection flag:
1654      * 1.1, 1.2, 1.3: Use WINED3DTSS_TEXTURETRANSFORMFLAGS
1655      * 1.4: Use WINED3DSPSM_DZ or WINED3DSPSM_DW on src[0]
1656      * 2.0+: Use WINED3DSI_TEXLD_PROJECT on the opcode
1657      */
1658     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1659     {
1660         DWORD flags = 0;
1661         if(reg_sampler_code < MAX_TEXTURES) {
1662             flags = deviceImpl->stateBlock->textureState[reg_sampler_code][WINED3DTSS_TEXTURETRANSFORMFLAGS];
1663         }
1664         if (flags & WINED3DTTFF_PROJECTED) {
1665             myflags |= TEX_PROJ;
1666         }
1667     }
1668     else if (shader_version < WINED3D_SHADER_VERSION(2,0))
1669     {
1670         DWORD src_mod = ins->src[0].modifiers;
1671         if (src_mod == WINED3DSPSM_DZ) {
1672             /* TXP cannot handle DZ natively, so move the z coordinate to .w. reg_coord is a read-only
1673              * varying register, so we need a temp reg
1674              */
1675             shader_addline(ins->ctx->buffer, "SWZ TA, %s, x, y, z, z;\n", reg_coord);
1676             strcpy(reg_coord, "TA");
1677             myflags |= TEX_PROJ;
1678         } else if(src_mod == WINED3DSPSM_DW) {
1679             myflags |= TEX_PROJ;
1680         }
1681     } else {
1682         if (ins->flags & WINED3DSI_TEXLD_PROJECT) myflags |= TEX_PROJ;
1683         if (ins->flags & WINED3DSI_TEXLD_BIAS) myflags |= TEX_BIAS;
1684     }
1685     shader_hw_sample(ins, reg_sampler_code, reg_dest, reg_coord, myflags, NULL, NULL);
1686 }
1687
1688 static void pshader_hw_texcoord(const struct wined3d_shader_instruction *ins)
1689 {
1690     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1691     SHADER_BUFFER *buffer = ins->ctx->buffer;
1692     DWORD shader_version = WINED3D_SHADER_VERSION(ins->ctx->reg_maps->shader_version.major,
1693             ins->ctx->reg_maps->shader_version.minor);
1694     char dst_str[50];
1695
1696     if (shader_version < WINED3D_SHADER_VERSION(1,4))
1697     {
1698         DWORD reg = dst->reg.idx;
1699
1700         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1701         shader_addline(buffer, "MOV_SAT %s, fragment.texcoord[%u];\n", dst_str, reg);
1702     } else {
1703         char reg_src[40];
1704
1705         shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src);
1706         shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1707         shader_addline(buffer, "MOV %s, %s;\n", dst_str, reg_src);
1708    }
1709 }
1710
1711 static void pshader_hw_texreg2ar(const struct wined3d_shader_instruction *ins)
1712 {
1713      SHADER_BUFFER *buffer = ins->ctx->buffer;
1714      IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1715      IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1716      DWORD flags;
1717
1718      DWORD reg1 = ins->dst[0].reg.idx;
1719      char dst_str[50];
1720      char src_str[50];
1721
1722      /* Note that texreg2ar treats Tx as a temporary register, not as a varying */
1723      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1724      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1725      /* Move .x first in case src_str is "TA" */
1726      shader_addline(buffer, "MOV TA.y, %s.x;\n", src_str);
1727      shader_addline(buffer, "MOV TA.x, %s.w;\n", src_str);
1728      flags = reg1 < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg1][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1729      shader_hw_sample(ins, reg1, dst_str, "TA", flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1730 }
1731
1732 static void pshader_hw_texreg2gb(const struct wined3d_shader_instruction *ins)
1733 {
1734      SHADER_BUFFER *buffer = ins->ctx->buffer;
1735
1736      DWORD reg1 = ins->dst[0].reg.idx;
1737      char dst_str[50];
1738      char src_str[50];
1739
1740      /* Note that texreg2gb treats Tx as a temporary register, not as a varying */
1741      shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1742      shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1743      shader_addline(buffer, "MOV TA.x, %s.y;\n", src_str);
1744      shader_addline(buffer, "MOV TA.y, %s.z;\n", src_str);
1745      shader_hw_sample(ins, reg1, dst_str, "TA", 0, NULL, NULL);
1746 }
1747
1748 static void pshader_hw_texreg2rgb(const struct wined3d_shader_instruction *ins)
1749 {
1750     DWORD reg1 = ins->dst[0].reg.idx;
1751     char dst_str[50];
1752     char src_str[50];
1753
1754     /* Note that texreg2rg treats Tx as a temporary register, not as a varying */
1755     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1756     shader_arb_get_src_param(ins, &ins->src[0], 0, src_str);
1757     shader_hw_sample(ins, reg1, dst_str, src_str, 0, NULL, NULL);
1758 }
1759
1760 static void pshader_hw_texbem(const struct wined3d_shader_instruction *ins)
1761 {
1762     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1763     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1764     SHADER_BUFFER *buffer = ins->ctx->buffer;
1765     char reg_coord[40], dst_reg[50], src_reg[50];
1766     DWORD reg_dest_code;
1767
1768     /* All versions have a destination register. The Tx where the texture coordinates come
1769      * from is the varying incarnation of the texture register
1770      */
1771     reg_dest_code = dst->reg.idx;
1772     shader_arb_get_dst_param(ins, &ins->dst[0], dst_reg);
1773     shader_arb_get_src_param(ins, &ins->src[0], 0, src_reg);
1774     sprintf(reg_coord, "fragment.texcoord[%u]", reg_dest_code);
1775
1776     /* Sampling the perturbation map in Tsrc was done already, including the signedness correction if needed
1777      * The Tx in which the perturbation map is stored is the tempreg incarnation of the texture register
1778      *
1779      * GL_NV_fragment_program_option could handle this in one instruction via X2D:
1780      * X2D TA.xy, fragment.texcoord, T%u, bumpenvmat%u.xzyw
1781      *
1782      * However, the NV extensions are never enabled for <= 2.0 shaders because of the performance penalty that
1783      * comes with it, and texbem is an 1.x only instruction. No 1.x instruction forces us to enable the NV
1784      * extension.
1785      */
1786     shader_addline(buffer, "SWZ TB, bumpenvmat%d, x, z, 0, 0;\n", reg_dest_code);
1787     shader_addline(buffer, "DP3 TA.x, TB, %s;\n", src_reg);
1788     shader_addline(buffer, "SWZ TB, bumpenvmat%d, y, w, 0, 0;\n", reg_dest_code);
1789     shader_addline(buffer, "DP3 TA.y, TB, %s;\n", src_reg);
1790
1791     /* with projective textures, texbem only divides the static texture coord, not the displacement,
1792      * so we can't let the GL handle this.
1793      */
1794     if (((IWineD3DDeviceImpl*) This->baseShader.device)->stateBlock->textureState[reg_dest_code][WINED3DTSS_TEXTURETRANSFORMFLAGS]
1795             & WINED3DTTFF_PROJECTED) {
1796         shader_addline(buffer, "RCP TB.w, %s.w;\n", reg_coord);
1797         shader_addline(buffer, "MUL TB.xy, %s, TB.w;\n", reg_coord);
1798         shader_addline(buffer, "ADD TA.xy, TA, TB;\n");
1799     } else {
1800         shader_addline(buffer, "ADD TA.xy, TA, %s;\n", reg_coord);
1801     }
1802
1803     shader_hw_sample(ins, reg_dest_code, dst_reg, "TA", 0, NULL, NULL);
1804
1805     if (ins->handler_idx == WINED3DSIH_TEXBEML)
1806     {
1807         /* No src swizzles are allowed, so this is ok */
1808         shader_addline(buffer, "MAD TA, %s.z, luminance%d.x, luminance%d.y;\n",
1809                        src_reg, reg_dest_code, reg_dest_code);
1810         shader_addline(buffer, "MUL %s, %s, TA;\n", dst_reg, dst_reg);
1811     }
1812 }
1813
1814 static void pshader_hw_texm3x2pad(const struct wined3d_shader_instruction *ins)
1815 {
1816     DWORD reg = ins->dst[0].reg.idx;
1817     SHADER_BUFFER *buffer = ins->ctx->buffer;
1818     char src0_name[50], dst_name[50];
1819     BOOL is_color;
1820     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1821
1822     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1823     /* The next instruction will be a texm3x2tex or texm3x2depth that writes to the uninitialized
1824      * T<reg+1> register. Use this register to store the calculated vector
1825      */
1826     tmp_reg.idx = reg + 1;
1827     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1828     shader_addline(buffer, "DP3 %s.x, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1829 }
1830
1831 static void pshader_hw_texm3x2tex(const struct wined3d_shader_instruction *ins)
1832 {
1833     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1834     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1835     DWORD flags;
1836     DWORD reg = ins->dst[0].reg.idx;
1837     SHADER_BUFFER *buffer = ins->ctx->buffer;
1838     char dst_str[50];
1839     char src0_name[50];
1840     char dst_reg[50];
1841     BOOL is_color;
1842
1843     /* We know that we're writing to the uninitialized T<reg> register, so use it for temporary storage */
1844     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1845
1846     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1847     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1848     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1849     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1850     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1851 }
1852
1853 static void pshader_hw_texm3x3pad(const struct wined3d_shader_instruction *ins)
1854 {
1855     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1856     DWORD reg = ins->dst[0].reg.idx;
1857     SHADER_BUFFER *buffer = ins->ctx->buffer;
1858     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1859     char src0_name[50], dst_name[50];
1860     struct wined3d_shader_register tmp_reg = ins->dst[0].reg;
1861     BOOL is_color;
1862
1863     /* There are always 2 texm3x3pad instructions followed by one texm3x3[tex,vspec, ...] instruction, with
1864      * incrementing ins->dst[0].register_idx numbers. So the pad instruction already knows the final destination
1865      * register, and this register is uninitialized(otherwise the assembler complains that it is 'redeclared')
1866      */
1867     tmp_reg.idx = reg + 2 - current_state->current_row;
1868     shader_arb_get_register_name(ins, &tmp_reg, dst_name, &is_color);
1869
1870     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1871     shader_addline(buffer, "DP3 %s.%c, fragment.texcoord[%u], %s;\n",
1872                    dst_name, 'x' + current_state->current_row, reg, src0_name);
1873     current_state->texcoord_w[current_state->current_row++] = reg;
1874 }
1875
1876 static void pshader_hw_texm3x3tex(const struct wined3d_shader_instruction *ins)
1877 {
1878     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1879     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1880     DWORD flags;
1881     DWORD reg = ins->dst[0].reg.idx;
1882     SHADER_BUFFER *buffer = ins->ctx->buffer;
1883     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1884     char dst_str[50];
1885     char src0_name[50], dst_name[50];
1886     BOOL is_color;
1887
1888     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
1889     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1890     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, reg, src0_name);
1891
1892     /* Sample the texture using the calculated coordinates */
1893     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1894     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1895     shader_hw_sample(ins, reg, dst_str, dst_name, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1896     current_state->current_row = 0;
1897 }
1898
1899 static void pshader_hw_texm3x3vspec(const struct wined3d_shader_instruction *ins)
1900 {
1901     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1902     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1903     DWORD flags;
1904     DWORD reg = ins->dst[0].reg.idx;
1905     SHADER_BUFFER *buffer = ins->ctx->buffer;
1906     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1907     char dst_str[50];
1908     char src0_name[50];
1909     char dst_reg[8];
1910     BOOL is_color;
1911
1912     /* Get the dst reg without writemask strings. We know this register is uninitialized, so we can use all
1913      * components for temporary data storage
1914      */
1915     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1916     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1917     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1918
1919     /* Construct the eye-ray vector from w coordinates */
1920     shader_addline(buffer, "MOV TB.x, fragment.texcoord[%u].w;\n", current_state->texcoord_w[0]);
1921     shader_addline(buffer, "MOV TB.y, fragment.texcoord[%u].w;\n", current_state->texcoord_w[1]);
1922     shader_addline(buffer, "MOV TB.z, fragment.texcoord[%u].w;\n", reg);
1923
1924     /* Calculate reflection vector
1925      */
1926     shader_addline(buffer, "DP3 %s.w, %s, TB;\n", dst_reg, dst_reg);
1927     /* The .w is ignored when sampling, so I can use TB.w to calculate dot(N, N) */
1928     shader_addline(buffer, "DP3 TB.w, %s, %s;\n", dst_reg, dst_reg);
1929     shader_addline(buffer, "RCP TB.w, TB.w;\n");
1930     shader_addline(buffer, "MUL %s.w, %s.w, TB.w;\n", dst_reg, dst_reg);
1931     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
1932     shader_addline(buffer, "MAD %s, coefmul.x, %s, -TB;\n", dst_reg, dst_reg);
1933
1934     /* Sample the texture using the calculated coordinates */
1935     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1936     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1937     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1938     current_state->current_row = 0;
1939 }
1940
1941 static void pshader_hw_texm3x3spec(const struct wined3d_shader_instruction *ins)
1942 {
1943     IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *)ins->ctx->shader;
1944     IWineD3DDeviceImpl* deviceImpl = (IWineD3DDeviceImpl*) This->baseShader.device;
1945     DWORD flags;
1946     DWORD reg = ins->dst[0].reg.idx;
1947     SHADER_PARSE_STATE* current_state = &This->baseShader.parse_state;
1948     SHADER_BUFFER *buffer = ins->ctx->buffer;
1949     char dst_str[50];
1950     char src0_name[50];
1951     char src1_name[50];
1952     char dst_reg[8];
1953     BOOL is_color;
1954
1955     shader_arb_get_src_param(ins, &ins->src[0], 0, src0_name);
1956     shader_arb_get_src_param(ins, &ins->src[0], 1, src1_name);
1957     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_reg, &is_color);
1958     /* Note: dst_reg.xy is input here, generated by two texm3x3pad instructions */
1959     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_reg, reg, src0_name);
1960
1961     /* Calculate reflection vector.
1962      *
1963      *                   dot(N, E)
1964      * dst_reg.xyz = 2 * --------- * N - E
1965      *                   dot(N, N)
1966      *
1967      * Which normalizes the normal vector
1968      */
1969     shader_addline(buffer, "DP3 %s.w, %s, %s;\n", dst_reg, dst_reg, src1_name);
1970     shader_addline(buffer, "DP3 TC.w, %s, %s;\n", dst_reg, dst_reg);
1971     shader_addline(buffer, "RCP TC.w, TC.w;\n");
1972     shader_addline(buffer, "MUL %s.w, %s.w, TC.w;\n", dst_reg, dst_reg);
1973     shader_addline(buffer, "MUL %s, %s.w, %s;\n", dst_reg, dst_reg, dst_reg);
1974     shader_addline(buffer, "MAD %s, coefmul.x, %s, -%s;\n", dst_reg, dst_reg, src1_name);
1975
1976     /* Sample the texture using the calculated coordinates */
1977     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
1978     flags = reg < MAX_TEXTURES ? deviceImpl->stateBlock->textureState[reg][WINED3DTSS_TEXTURETRANSFORMFLAGS] : 0;
1979     shader_hw_sample(ins, reg, dst_str, dst_reg, flags & WINED3DTTFF_PROJECTED ? TEX_PROJ : 0, NULL, NULL);
1980     current_state->current_row = 0;
1981 }
1982
1983 static void pshader_hw_texdepth(const struct wined3d_shader_instruction *ins)
1984 {
1985     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
1986     SHADER_BUFFER *buffer = ins->ctx->buffer;
1987     char dst_name[50];
1988
1989     /* texdepth has an implicit destination, the fragment depth value. It's only parameter,
1990      * which is essentially an input, is the destination register because it is the first
1991      * parameter. According to the msdn, this must be register r5, but let's keep it more flexible
1992      * here(writemasks/swizzles are not valid on texdepth)
1993      */
1994     shader_arb_get_dst_param(ins, dst, dst_name);
1995
1996     /* According to the msdn, the source register(must be r5) is unusable after
1997      * the texdepth instruction, so we're free to modify it
1998      */
1999     shader_addline(buffer, "MIN %s.y, %s.y, one.y;\n", dst_name, dst_name);
2000
2001     /* How to deal with the special case dst_name.g == 0? if r != 0, then
2002      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
2003      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
2004      */
2005     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
2006     shader_addline(buffer, "MUL TA.x, %s.x, %s.y;\n", dst_name, dst_name);
2007     shader_addline(buffer, "MIN TA.x, TA.x, one.x;\n");
2008     shader_addline(buffer, "MAX result.depth, TA.x, 0.0;\n");
2009 }
2010
2011 /** Process the WINED3DSIO_TEXDP3TEX instruction in ARB:
2012  * Take a 3-component dot product of the TexCoord[dstreg] and src,
2013  * then perform a 1D texture lookup from stage dstregnum, place into dst. */
2014 static void pshader_hw_texdp3tex(const struct wined3d_shader_instruction *ins)
2015 {
2016     SHADER_BUFFER *buffer = ins->ctx->buffer;
2017     DWORD sampler_idx = ins->dst[0].reg.idx;
2018     char src0[50];
2019     char dst_str[50];
2020
2021     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2022     shader_addline(buffer, "MOV TB, 0.0;\n");
2023     shader_addline(buffer, "DP3 TB.x, fragment.texcoord[%u], %s;\n", sampler_idx, src0);
2024
2025     shader_arb_get_dst_param(ins, &ins->dst[0], dst_str);
2026     shader_hw_sample(ins, sampler_idx, dst_str, "TB", 0 /* Only one coord, can't be projected */, NULL, NULL);
2027 }
2028
2029 /** Process the WINED3DSIO_TEXDP3 instruction in ARB:
2030  * Take a 3-component dot product of the TexCoord[dstreg] and src. */
2031 static void pshader_hw_texdp3(const struct wined3d_shader_instruction *ins)
2032 {
2033     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2034     char src0[50];
2035     char dst_str[50];
2036     SHADER_BUFFER *buffer = ins->ctx->buffer;
2037
2038     /* Handle output register */
2039     shader_arb_get_dst_param(ins, dst, dst_str);
2040     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2041     shader_addline(buffer, "DP3 %s, fragment.texcoord[%u], %s;\n", dst_str, dst->reg.idx, src0);
2042 }
2043
2044 /** Process the WINED3DSIO_TEXM3X3 instruction in ARB
2045  * Perform the 3rd row of a 3x3 matrix multiply */
2046 static void pshader_hw_texm3x3(const struct wined3d_shader_instruction *ins)
2047 {
2048     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2049     SHADER_BUFFER *buffer = ins->ctx->buffer;
2050     char dst_str[50], dst_name[50];
2051     char src0[50];
2052     BOOL is_color;
2053
2054     shader_arb_get_dst_param(ins, dst, dst_str);
2055     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2056     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2057     shader_addline(buffer, "DP3 %s.z, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2058     shader_addline(buffer, "MOV %s, %s;\n", dst_str, dst_name);
2059 }
2060
2061 /** Process the WINED3DSIO_TEXM3X2DEPTH instruction in ARB:
2062  * Last row of a 3x2 matrix multiply, use the result to calculate the depth:
2063  * Calculate tmp0.y = TexCoord[dstreg] . src.xyz;  (tmp0.x has already been calculated)
2064  * depth = (tmp0.y == 0.0) ? 1.0 : tmp0.x / tmp0.y
2065  */
2066 static void pshader_hw_texm3x2depth(const struct wined3d_shader_instruction *ins)
2067 {
2068     SHADER_BUFFER *buffer = ins->ctx->buffer;
2069     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2070     char src0[50], dst_name[50];
2071     BOOL is_color;
2072
2073     shader_arb_get_src_param(ins, &ins->src[0], 0, src0);
2074     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2075     shader_addline(buffer, "DP3 %s.y, fragment.texcoord[%u], %s;\n", dst_name, dst->reg.idx, src0);
2076
2077     /* How to deal with the special case dst_name.g == 0? if r != 0, then
2078      * the r * (1 / 0) will give infinity, which is clamped to 1.0, the correct
2079      * result. But if r = 0.0, then 0 * inf = 0, which is incorrect.
2080      */
2081     shader_addline(buffer, "RCP %s.y, %s.y;\n", dst_name, dst_name);
2082     shader_addline(buffer, "MUL %s.x, %s.x, %s.y;\n", dst_name, dst_name, dst_name);
2083     shader_addline(buffer, "MIN %s.x, %s.x, one.x;\n", dst_name, dst_name);
2084     shader_addline(buffer, "MAX result.depth, %s.x, 0.0;\n", dst_name);
2085 }
2086
2087 /** Handles transforming all WINED3DSIO_M?x? opcodes for
2088     Vertex/Pixel shaders to ARB_vertex_program codes */
2089 static void shader_hw_mnxn(const struct wined3d_shader_instruction *ins)
2090 {
2091     int i;
2092     int nComponents = 0;
2093     struct wined3d_shader_dst_param tmp_dst = {{0}};
2094     struct wined3d_shader_src_param tmp_src[2] = {{{0}}};
2095     struct wined3d_shader_instruction tmp_ins;
2096
2097     memset(&tmp_ins, 0, sizeof(tmp_ins));
2098
2099     /* Set constants for the temporary argument */
2100     tmp_ins.ctx = ins->ctx;
2101     tmp_ins.dst_count = 1;
2102     tmp_ins.dst = &tmp_dst;
2103     tmp_ins.src_count = 2;
2104     tmp_ins.src = tmp_src;
2105
2106     switch(ins->handler_idx)
2107     {
2108         case WINED3DSIH_M4x4:
2109             nComponents = 4;
2110             tmp_ins.handler_idx = WINED3DSIH_DP4;
2111             break;
2112         case WINED3DSIH_M4x3:
2113             nComponents = 3;
2114             tmp_ins.handler_idx = WINED3DSIH_DP4;
2115             break;
2116         case WINED3DSIH_M3x4:
2117             nComponents = 4;
2118             tmp_ins.handler_idx = WINED3DSIH_DP3;
2119             break;
2120         case WINED3DSIH_M3x3:
2121             nComponents = 3;
2122             tmp_ins.handler_idx = WINED3DSIH_DP3;
2123             break;
2124         case WINED3DSIH_M3x2:
2125             nComponents = 2;
2126             tmp_ins.handler_idx = WINED3DSIH_DP3;
2127             break;
2128         default:
2129             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2130             break;
2131     }
2132
2133     tmp_dst = ins->dst[0];
2134     tmp_src[0] = ins->src[0];
2135     tmp_src[1] = ins->src[1];
2136     for (i = 0; i < nComponents; i++) {
2137         tmp_dst.write_mask = WINED3DSP_WRITEMASK_0 << i;
2138         shader_hw_map2gl(&tmp_ins);
2139         ++tmp_src[1].reg.idx;
2140     }
2141 }
2142
2143 static void shader_hw_rsq_rcp(const struct wined3d_shader_instruction *ins)
2144 {
2145     SHADER_BUFFER *buffer = ins->ctx->buffer;
2146     const char *instruction;
2147
2148     char dst[50];
2149     char src[50];
2150
2151     switch(ins->handler_idx)
2152     {
2153         case WINED3DSIH_RSQ: instruction = "RSQ"; break;
2154         case WINED3DSIH_RCP: instruction = "RCP"; break;
2155         default: instruction = "";
2156             FIXME("Unhandled opcode %#x\n", ins->handler_idx);
2157             break;
2158     }
2159
2160     shader_arb_get_dst_param(ins, &ins->dst[0], dst); /* Destination */
2161     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2162     if (ins->src[0].swizzle == WINED3DSP_NOSWIZZLE)
2163     {
2164         /* Dx sdk says .x is used if no swizzle is given, but our test shows that
2165          * .w is used
2166          */
2167         strcat(src, ".w");
2168     }
2169
2170     shader_addline(buffer, "%s%s %s, %s;\n", instruction, shader_arb_get_modifier(ins), dst, src);
2171 }
2172
2173 static void shader_hw_nrm(const struct wined3d_shader_instruction *ins)
2174 {
2175     SHADER_BUFFER *buffer = ins->ctx->buffer;
2176     char dst_name[50];
2177     char src_name[50];
2178     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2179     BOOL pshader = shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type);
2180
2181     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2182     shader_arb_get_src_param(ins, &ins->src[0], 1 /* Use TB */, src_name);
2183
2184     if(pshader && priv->target_version >= NV3)
2185     {
2186         shader_addline(buffer, "NRM%s %s, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name);
2187     }
2188     else
2189     {
2190         shader_addline(buffer, "DP3 TA, %s, %s;\n", src_name, src_name);
2191         shader_addline(buffer, "RSQ TA, TA.x;\n");
2192         /* dst.w = src[0].w * 1 / (src.x^2 + src.y^2 + src.z^2)^(1/2) according to msdn*/
2193         shader_addline(buffer, "MUL%s %s, %s, TA;\n", shader_arb_get_modifier(ins), dst_name,
2194                     src_name);
2195     }
2196 }
2197
2198 static void shader_hw_lrp(const struct wined3d_shader_instruction *ins)
2199 {
2200     SHADER_BUFFER *buffer = ins->ctx->buffer;
2201     char dst_name[50];
2202     char src_name[3][50];
2203
2204     /* ARB_fragment_program has a convenient LRP instruction */
2205     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
2206         shader_hw_map2gl(ins);
2207         return;
2208     }
2209
2210     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2211     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name[0]);
2212     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name[1]);
2213     shader_arb_get_src_param(ins, &ins->src[2], 2, src_name[2]);
2214
2215     shader_addline(buffer, "SUB TA, %s, %s;\n", src_name[1], src_name[2]);
2216     shader_addline(buffer, "MAD%s %s, %s, TA, %s;\n", shader_arb_get_modifier(ins),
2217                    dst_name, src_name[0], src_name[2]);
2218 }
2219
2220 static void shader_hw_sincos(const struct wined3d_shader_instruction *ins)
2221 {
2222     /* This instruction exists in ARB, but the d3d instruction takes two extra parameters which
2223      * must contain fixed constants. So we need a separate function to filter those constants and
2224      * can't use map2gl
2225      */
2226     SHADER_BUFFER *buffer = ins->ctx->buffer;
2227     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2228     const struct wined3d_shader_dst_param *dst = &ins->dst[0];
2229     char dst_name[50];
2230     char src_name0[50], src_name1[50], src_name2[50];
2231     BOOL is_color;
2232
2233     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2234     if(shader_is_pshader_version(ins->ctx->reg_maps->shader_version.type)) {
2235         shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2236         /* No modifiers are supported on SCS */
2237         shader_addline(buffer, "SCS %s, %s;\n", dst_name, src_name0);
2238
2239         if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE)
2240         {
2241             shader_arb_get_register_name(ins, &dst->reg, src_name0, &is_color);
2242             shader_addline(buffer, "MOV_SAT %s, %s;\n", dst_name, src_name0);
2243         }
2244     } else if(priv->target_version >= NV2) {
2245         shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
2246
2247         /* Sincos writemask must be .x, .y or .xy */
2248         if(dst->write_mask & WINED3DSP_WRITEMASK_0)
2249             shader_addline(buffer, "COS%s %s.x, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
2250         if(dst->write_mask & WINED3DSP_WRITEMASK_1)
2251             shader_addline(buffer, "SIN%s %s.y, %s;\n", shader_arb_get_modifier(ins), dst_name, src_name0);
2252     } else {
2253         /* Approximate sine and cosine with a taylor series, as per math textbook. The application passes 8
2254          * helper constants(D3DSINCOSCONST1 and D3DSINCOSCONST2) in src1 and src2.
2255          *
2256          * sin(x) = x - x^3/3! + x^5/5! - x^7/7! + ...
2257          * cos(x) = 1 - x^2/2! + x^4/4! - x^6/6! + ...
2258          *
2259          * The constants we get are:
2260          *
2261          *  +1   +1,     -1     -1     +1      +1      -1       -1
2262          *      ---- ,  ---- , ---- , ----- , ----- , ----- , ------
2263          *      1!*2    2!*4   3!*8   4!*16   5!*32   6!*64   7!*128
2264          *
2265          * If used with x^2, x^3, x^4 etc they calculate sin(x/2) and cos(x/2):
2266          *
2267          * (x/2)^2 = x^2 / 4
2268          * (x/2)^3 = x^3 / 8
2269          * (x/2)^4 = x^4 / 16
2270          * (x/2)^5 = x^5 / 32
2271          * etc
2272          *
2273          * To get the final result:
2274          * sin(x) = 2 * sin(x/2) * cos(x/2)
2275          * cos(x) = cos(x/2)^2 - sin(x/2)^2
2276          * (from sin(x+y) and cos(x+y) rules)
2277          *
2278          * As per MSDN, dst.z is undefined after the operation, and so is
2279          * dst.x and dst.y if they're masked out by the writemask. Ie
2280          * sincos dst.y, src1, c0, c1
2281          * returns the sine in dst.y. dst.x and dst.z are undefined, dst.w is not touched. The assembler
2282          * vsa.exe also stops with an error if the dest register is the same register as the source
2283          * register. This means we can use dest.xyz as temporary storage. The assembler vsa.exe output also
2284          * indicates that sincos consumes 8 instruction slots in vs_2_0(and, strangely, in vs_3_0).
2285          */
2286         shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2287         shader_arb_get_src_param(ins, &ins->src[2], 2, src_name2);
2288         shader_arb_get_register_name(ins, &dst->reg, dst_name, &is_color);
2289
2290         shader_addline(buffer, "MUL %s.x, %s, %s;\n", dst_name, src_name0, src_name0);  /* x ^ 2 */
2291         shader_addline(buffer, "MUL TA.y, %s.x, %s;\n", dst_name, src_name0);           /* x ^ 3 */
2292         shader_addline(buffer, "MUL %s.y, TA.y, %s;\n", dst_name, src_name0);           /* x ^ 4 */
2293         shader_addline(buffer, "MUL TA.z, %s.y, %s;\n", dst_name, src_name0);           /* x ^ 5 */
2294         shader_addline(buffer, "MUL %s.z, TA.z, %s;\n", dst_name, src_name0);           /* x ^ 6 */
2295         shader_addline(buffer, "MUL TA.w, %s.z, %s;\n", dst_name, src_name0);           /* x ^ 7 */
2296
2297         /* sin(x/2)
2298          *
2299          * Unfortunately we don't get the constants in a DP4-capable form. Is there a way to
2300          * properly merge that with MULs in the code above?
2301          * The swizzles .yz and xw however fit into the .yzxw swizzle added to ps_2_0. Maybe
2302          * we can merge the sine and cosine MAD rows to calculate them together.
2303          */
2304         shader_addline(buffer, "MUL TA.x, %s, %s.w;\n", src_name0, src_name2); /* x^1, +1/(1!*2) */
2305         shader_addline(buffer, "MAD TA.x, TA.y, %s.x, TA.x;\n", src_name2); /* -1/(3!*8) */
2306         shader_addline(buffer, "MAD TA.x, TA.z, %s.w, TA.x;\n", src_name1); /* +1/(5!*32) */
2307         shader_addline(buffer, "MAD TA.x, TA.w, %s.x, TA.x;\n", src_name1); /* -1/(7!*128) */
2308
2309         /* cos(x/2) */
2310         shader_addline(buffer, "MAD TA.y, %s.x, %s.y, %s.z;\n", dst_name, src_name2, src_name2); /* -1/(2!*4), +1.0 */
2311         shader_addline(buffer, "MAD TA.y, %s.y, %s.z, TA.y;\n", dst_name, src_name1); /* +1/(4!*16) */
2312         shader_addline(buffer, "MAD TA.y, %s.z, %s.y, TA.y;\n", dst_name, src_name1); /* -1/(6!*64) */
2313
2314         if(dst->write_mask & WINED3DSP_WRITEMASK_0) {
2315             /* cos x */
2316             shader_addline(buffer, "MUL TA.z, TA.y, TA.y;\n");
2317             shader_addline(buffer, "MAD %s.x, -TA.x, TA.x, TA.z;\n", dst_name);
2318         }
2319         if(dst->write_mask & WINED3DSP_WRITEMASK_1) {
2320             /* sin x */
2321             shader_addline(buffer, "MUL %s.y, TA.x, TA.y;\n", dst_name);
2322             shader_addline(buffer, "ADD %s.y, %s.y, %s.y;\n", dst_name, dst_name, dst_name);
2323         }
2324     }
2325 }
2326
2327 static void shader_hw_sgn(const struct wined3d_shader_instruction *ins)
2328 {
2329     SHADER_BUFFER *buffer = ins->ctx->buffer;
2330     char dst_name[50];
2331     char src_name[50];
2332     struct shader_arb_ctx_priv *ctx = ins->ctx->backend_data;
2333
2334     /* SGN is only valid in vertex shaders */
2335     if(ctx->target_version == NV2) {
2336         shader_hw_map2gl(ins);
2337         return;
2338     }
2339     shader_arb_get_dst_param(ins, &ins->dst[0], dst_name);
2340     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
2341
2342     /* If SRC > 0.0, -SRC < SRC = TRUE, otherwise false.
2343      * if SRC < 0.0,  SRC < -SRC = TRUE. If neither is true, src = 0.0
2344      */
2345     if(ins->dst[0].modifiers & WINED3DSPDM_SATURATE) {
2346         shader_addline(buffer, "SLT %s, -%s, %s;\n", dst_name, src_name, src_name);
2347     } else {
2348         /* src contains TA? Write to the dest first. This won't overwrite our destination.
2349          * Then use TA, and calculate the final result
2350          *
2351          * Not reading from TA? Store the first result in TA to avoid overwriting the
2352          * destination if src reg = dst reg
2353          */
2354         if(strstr(src_name, "TA"))
2355         {
2356             shader_addline(buffer, "SLT %s,  %s, -%s;\n", dst_name, src_name, src_name);
2357             shader_addline(buffer, "SLT TA, -%s, %s;\n", src_name, src_name);
2358             shader_addline(buffer, "ADD %s, %s, -TA;\n", dst_name, dst_name);
2359         }
2360         else
2361         {
2362             shader_addline(buffer, "SLT TA, -%s, %s;\n", src_name, src_name);
2363             shader_addline(buffer, "SLT %s,  %s, -%s;\n", dst_name, src_name, src_name);
2364             shader_addline(buffer, "ADD %s, TA, -%s;\n", dst_name, dst_name);
2365         }
2366     }
2367 }
2368
2369 static void shader_hw_dsy(const struct wined3d_shader_instruction *ins)
2370 {
2371     SHADER_BUFFER *buffer = ins->ctx->buffer;
2372     char src[50];
2373     char dst[50];
2374     char dst_name[50];
2375     BOOL is_color;
2376
2377     shader_arb_get_dst_param(ins, &ins->dst[0], dst);
2378     shader_arb_get_src_param(ins, &ins->src[0], 0, src);
2379     shader_arb_get_register_name(ins, &ins->dst[0].reg, dst_name, &is_color);
2380
2381     shader_addline(buffer, "DDY %s, %s;\n", dst, src);
2382     shader_addline(buffer, "MUL%s %s, %s, ycorrection.y;\n", shader_arb_get_modifier(ins), dst, dst_name);
2383 }
2384
2385 static void shader_hw_loop(const struct wined3d_shader_instruction *ins)
2386 {
2387     SHADER_BUFFER *buffer = ins->ctx->buffer;
2388     char src_name[50];
2389     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2390
2391     /* src0 is aL */
2392     shader_arb_get_src_param(ins, &ins->src[1], 0, src_name);
2393
2394     if(vshader)
2395     {
2396         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2397         struct list *e = list_head(&priv->control_frames);
2398         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2399
2400         if(priv->loop_depth > 1) shader_addline(buffer, "PUSHA aL;\n");
2401         /* The constant loader makes sure to load -1 into iX.w */
2402         shader_addline(buffer, "ARLC aL, %s.xywz;\n", src_name);
2403         shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", control_frame->loop_no);
2404         shader_addline(buffer, "loop_%u_start:\n", control_frame->loop_no);
2405     }
2406     else
2407     {
2408         shader_addline(buffer, "LOOP %s;\n", src_name);
2409     }
2410 }
2411
2412 static void shader_hw_rep(const struct wined3d_shader_instruction *ins)
2413 {
2414     SHADER_BUFFER *buffer = ins->ctx->buffer;
2415     char src_name[50];
2416     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2417
2418     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name);
2419
2420     /* The constant loader makes sure to load -1 into iX.w */
2421     if(vshader)
2422     {
2423         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2424         struct list *e = list_head(&priv->control_frames);
2425         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2426
2427         if(priv->loop_depth > 1) shader_addline(buffer, "PUSHA aL;\n");
2428
2429         shader_addline(buffer, "ARLC aL, %s.xywz;\n", src_name);
2430         shader_addline(buffer, "BRA loop_%u_end (LE.x);\n", control_frame->loop_no);
2431         shader_addline(buffer, "loop_%u_start:\n", control_frame->loop_no);
2432     }
2433     else
2434     {
2435         shader_addline(buffer, "REP %s;\n", src_name);
2436     }
2437 }
2438
2439 static void shader_hw_endloop(const struct wined3d_shader_instruction *ins)
2440 {
2441     SHADER_BUFFER *buffer = ins->ctx->buffer;
2442     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2443
2444     if(vshader)
2445     {
2446         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2447         struct list *e = list_head(&priv->control_frames);
2448         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2449
2450         shader_addline(buffer, "ARAC aL.xy, aL;\n");
2451         shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", control_frame->loop_no);
2452         shader_addline(buffer, "loop_%u_end:\n", control_frame->loop_no);
2453
2454         if(priv->loop_depth > 1) shader_addline(buffer, "POPA aL;\n");
2455     }
2456     else
2457     {
2458         shader_addline(buffer, "ENDLOOP;\n");
2459     }
2460 }
2461
2462 static void shader_hw_endrep(const struct wined3d_shader_instruction *ins)
2463 {
2464     SHADER_BUFFER *buffer = ins->ctx->buffer;
2465     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2466
2467     if(vshader)
2468     {
2469         struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2470         struct list *e = list_head(&priv->control_frames);
2471         struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2472
2473         shader_addline(buffer, "ARAC aL.xy, aL;\n");
2474         shader_addline(buffer, "BRA loop_%u_start (GT.x);\n", control_frame->loop_no);
2475         shader_addline(buffer, "loop_%u_end:\n", control_frame->loop_no);
2476
2477         if(priv->loop_depth > 1) shader_addline(buffer, "POPA aL;\n");
2478     }
2479     else
2480     {
2481         shader_addline(buffer, "ENDREP;\n");
2482     }
2483 }
2484
2485 static const struct control_frame *find_last_loop(const struct shader_arb_ctx_priv *priv)
2486 {
2487     struct control_frame *control_frame;
2488
2489     LIST_FOR_EACH_ENTRY(control_frame, &priv->control_frames, struct control_frame, entry)
2490     {
2491         if(control_frame->type == LOOP || control_frame->type == REP) return control_frame;
2492     }
2493     ERR("Could not find loop for break\n");
2494     return NULL;
2495 }
2496
2497 static void shader_hw_break(const struct wined3d_shader_instruction *ins)
2498 {
2499     SHADER_BUFFER *buffer = ins->ctx->buffer;
2500     const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
2501     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2502
2503     if(vshader)
2504     {
2505         shader_addline(buffer, "BRA loop_%u_end;\n", control_frame->loop_no);
2506     }
2507     else
2508     {
2509         shader_addline(buffer, "BRK;\n");
2510     }
2511 }
2512
2513 static const char *get_compare(COMPARISON_TYPE flags)
2514 {
2515     switch (flags)
2516     {
2517         case COMPARISON_GT: return "GT";
2518         case COMPARISON_EQ: return "EQ";
2519         case COMPARISON_GE: return "GE";
2520         case COMPARISON_LT: return "LT";
2521         case COMPARISON_NE: return "NE";
2522         case COMPARISON_LE: return "LE";
2523         default:
2524             FIXME("Unrecognized comparison value: %u\n", flags);
2525             return "(\?\?)";
2526     }
2527 }
2528
2529 static COMPARISON_TYPE invert_compare(COMPARISON_TYPE flags)
2530 {
2531     switch (flags)
2532     {
2533         case COMPARISON_GT: return COMPARISON_LE;
2534         case COMPARISON_EQ: return COMPARISON_NE;
2535         case COMPARISON_GE: return COMPARISON_LT;
2536         case COMPARISON_LT: return COMPARISON_GE;
2537         case COMPARISON_NE: return COMPARISON_EQ;
2538         case COMPARISON_LE: return COMPARISON_GT;
2539         default:
2540             FIXME("Unrecognized comparison value: %u\n", flags);
2541             return -1;
2542     }
2543 }
2544
2545 static void shader_hw_breakc(const struct wined3d_shader_instruction *ins)
2546 {
2547     SHADER_BUFFER *buffer = ins->ctx->buffer;
2548     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2549     const struct control_frame *control_frame = find_last_loop(ins->ctx->backend_data);
2550     char src_name0[50];
2551     char src_name1[50];
2552     const char *comp = get_compare(ins->flags);
2553
2554     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2555     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2556
2557     if(vshader)
2558     {
2559         /* SUBC CC, src0, src1" works only in pixel shaders, so use TA to throw
2560          * away the subtraction result
2561          */
2562         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2563         shader_addline(buffer, "BRA loop_%u_end (%s.x);\n", control_frame->loop_no, comp);
2564     }
2565     else
2566     {
2567         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2568         shader_addline(buffer, "BRK (%s.x);\n", comp);
2569     }
2570 }
2571
2572 static void shader_hw_ifc(const struct wined3d_shader_instruction *ins)
2573 {
2574     SHADER_BUFFER *buffer = ins->ctx->buffer;
2575     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2576     struct list *e = list_head(&priv->control_frames);
2577     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2578     const char *comp;
2579     char src_name0[50];
2580     char src_name1[50];
2581     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2582
2583     shader_arb_get_src_param(ins, &ins->src[0], 0, src_name0);
2584     shader_arb_get_src_param(ins, &ins->src[1], 1, src_name1);
2585
2586     if(vshader)
2587     {
2588         /* Invert the flag. We jump to the else label if the condition is NOT true */
2589         comp = get_compare(invert_compare(ins->flags));
2590         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2591         shader_addline(buffer, "BRA ifc_%u_endif (%s.x);\n", control_frame->ifc_no, comp);
2592     }
2593     else
2594     {
2595         comp = get_compare(ins->flags);
2596         shader_addline(buffer, "SUBC TA, %s, %s;\n", src_name0, src_name1);
2597         shader_addline(buffer, "IF %s.x;\n", comp);
2598     }
2599 }
2600
2601 static void shader_hw_else(const struct wined3d_shader_instruction *ins)
2602 {
2603     SHADER_BUFFER *buffer = ins->ctx->buffer;
2604     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2605     struct list *e = list_head(&priv->control_frames);
2606     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2607     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2608
2609     if(vshader)
2610     {
2611         shader_addline(buffer, "BRA ifc_%u_endif;\n", control_frame->ifc_no);
2612         shader_addline(buffer, "ifc_%u_else:\n", control_frame->ifc_no);
2613         control_frame->had_else = TRUE;
2614     }
2615     else
2616     {
2617         shader_addline(buffer, "ELSE;\n");
2618     }
2619 }
2620
2621 static void shader_hw_endif(const struct wined3d_shader_instruction *ins)
2622 {
2623     SHADER_BUFFER *buffer = ins->ctx->buffer;
2624     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
2625     struct list *e = list_head(&priv->control_frames);
2626     struct control_frame *control_frame = LIST_ENTRY(e, struct control_frame, entry);
2627     BOOL vshader = shader_is_vshader_version(ins->ctx->reg_maps->shader_version.type);
2628
2629     if(vshader)
2630     {
2631         if(control_frame->had_else)
2632         {
2633             shader_addline(buffer, "ifc_%u_endif:\n", control_frame->ifc_no);
2634         }
2635         else
2636         {
2637             shader_addline(buffer, "#No else branch. else is endif\n");
2638             shader_addline(buffer, "ifc_%u_else:\n", control_frame->ifc_no);
2639         }
2640     }
2641     else
2642     {
2643         shader_addline(buffer, "ENDIF;\n");
2644     }
2645 }
2646
2647 static void shader_hw_texldd(const struct wined3d_shader_instruction *ins)
2648 {
2649     DWORD sampler_idx = ins->src[1].reg.idx;
2650     char reg_dest[40];
2651     char reg_src[3][40];
2652     DWORD flags = TEX_DERIV;
2653
2654     shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
2655     shader_arb_get_src_param(ins, &ins->src[0], 0, reg_src[0]);
2656     shader_arb_get_src_param(ins, &ins->src[2], 1, reg_src[1]);
2657     shader_arb_get_src_param(ins, &ins->src[3], 2, reg_src[2]);
2658
2659     if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
2660     if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;
2661
2662     shader_hw_sample(ins, sampler_idx, reg_dest, reg_src[0], flags, reg_src[1], reg_src[2]);
2663 }
2664
2665 static void shader_hw_texldl(const struct wined3d_shader_instruction *ins)
2666 {
2667     DWORD sampler_idx = ins->src[1].reg.idx;
2668     char reg_dest[40];
2669     char reg_coord[40];
2670     DWORD flags = TEX_LOD;
2671
2672     shader_arb_get_dst_param(ins, &ins->dst[0], reg_dest);
2673     shader_arb_get_src_param(ins, &ins->src[0], 0, reg_coord);
2674
2675     if (ins->flags & WINED3DSI_TEXLD_PROJECT) flags |= TEX_PROJ;
2676     if (ins->flags & WINED3DSI_TEXLD_BIAS) flags |= TEX_BIAS;
2677
2678     shader_hw_sample(ins, sampler_idx, reg_dest, reg_coord, flags, NULL, NULL);
2679 }
2680
2681 /* GL locking is done by the caller */
2682 static GLuint create_arb_blt_vertex_program(const WineD3D_GL_Info *gl_info)
2683 {
2684     GLuint program_id = 0;
2685     const char *blt_vprogram =
2686         "!!ARBvp1.0\n"
2687         "PARAM c[1] = { { 1, 0.5 } };\n"
2688         "MOV result.position, vertex.position;\n"
2689         "MOV result.color, c[0].x;\n"
2690         "MOV result.texcoord[0], vertex.texcoord[0];\n"
2691         "END\n";
2692
2693     GL_EXTCALL(glGenProgramsARB(1, &program_id));
2694     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, program_id));
2695     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(blt_vprogram), blt_vprogram));
2696
2697     if (glGetError() == GL_INVALID_OPERATION) {
2698         GLint pos;
2699         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
2700         FIXME("Vertex program error at position %d: %s\n", pos,
2701             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2702     }
2703
2704     return program_id;
2705 }
2706
2707 /* GL locking is done by the caller */
2708 static GLuint create_arb_blt_fragment_program(const WineD3D_GL_Info *gl_info, enum tex_types tex_type)
2709 {
2710     GLuint program_id = 0;
2711     static const char * const blt_fprograms[tex_type_count] =
2712     {
2713         /* tex_1d */
2714         NULL,
2715         /* tex_2d */
2716         "!!ARBfp1.0\n"
2717         "TEMP R0;\n"
2718         "TEX R0.x, fragment.texcoord[0], texture[0], 2D;\n"
2719         "MOV result.depth.z, R0.x;\n"
2720         "END\n",
2721         /* tex_3d */
2722         NULL,
2723         /* tex_cube */
2724         "!!ARBfp1.0\n"
2725         "TEMP R0;\n"
2726         "TEX R0.x, fragment.texcoord[0], texture[0], CUBE;\n"
2727         "MOV result.depth.z, R0.x;\n"
2728         "END\n",
2729         /* tex_rect */
2730         "!!ARBfp1.0\n"
2731         "TEMP R0;\n"
2732         "TEX R0.x, fragment.texcoord[0], texture[0], RECT;\n"
2733         "MOV result.depth.z, R0.x;\n"
2734         "END\n",
2735     };
2736
2737     if (!blt_fprograms[tex_type])
2738     {
2739         FIXME("tex_type %#x not supported\n", tex_type);
2740         tex_type = tex_2d;
2741     }
2742
2743     GL_EXTCALL(glGenProgramsARB(1, &program_id));
2744     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, program_id));
2745     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(blt_fprograms[tex_type]), blt_fprograms[tex_type]));
2746
2747     if (glGetError() == GL_INVALID_OPERATION) {
2748         GLint pos;
2749         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
2750         FIXME("Fragment program error at position %d: %s\n", pos,
2751             debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
2752     }
2753
2754     return program_id;
2755 }
2756
2757 static void arbfp_add_sRGB_correction(SHADER_BUFFER *buffer, const char *fragcolor, const char *tmp1,
2758                                       const char *tmp2, const char *tmp3, const char *tmp4, BOOL condcode) {
2759     /* Perform sRGB write correction. See GLX_EXT_framebuffer_sRGB */
2760
2761     if(condcode)
2762     {
2763         /* Sigh. MOVC CC doesn't work, so use one of the temps as dummy dest */
2764         shader_addline(buffer, "SUBC %s, %s.x, srgb_consts1.y;\n", tmp1, fragcolor);
2765         /* Calculate the > 0.0031308 case */
2766         shader_addline(buffer, "POW %s.x (GE), %s.x, srgb_consts1.z;\n", fragcolor, fragcolor);
2767         shader_addline(buffer, "POW %s.y (GE), %s.y, srgb_consts1.z;\n", fragcolor, fragcolor);
2768         shader_addline(buffer, "POW %s.z (GE), %s.z, srgb_consts1.z;\n", fragcolor, fragcolor);
2769         shader_addline(buffer, "MUL %s.xyz (GE), %s, srgb_consts1.w;\n", fragcolor, fragcolor);
2770         shader_addline(buffer, "SUB %s.xyz (GE), %s, srgb_consts2.x;\n", fragcolor, fragcolor);
2771         /* Calculate the < case */
2772         shader_addline(buffer, "MUL %s.xyz (LT), srgb_consts1.x, %s;\n", fragcolor, fragcolor);
2773     }
2774     else
2775     {
2776         /* Calculate the > 0.0031308 case */
2777         shader_addline(buffer, "POW %s.x, %s.x, srgb_consts1.z;\n", tmp1, fragcolor);
2778         shader_addline(buffer, "POW %s.y, %s.y, srgb_consts1.z;\n", tmp1, fragcolor);
2779         shader_addline(buffer, "POW %s.z, %s.z, srgb_consts1.z;\n", tmp1, fragcolor);
2780         shader_addline(buffer, "MUL %s, %s, srgb_consts1.w;\n", tmp1, tmp1);
2781         shader_addline(buffer, "SUB %s, %s, srgb_consts2.x;\n", tmp1, tmp1);
2782         /* Calculate the < case */
2783         shader_addline(buffer, "MUL %s, srgb_consts1.x, %s;\n", tmp2, fragcolor);
2784         /* Get 1.0 / 0.0 masks for > 0.0031308 and < 0.0031308 */
2785         shader_addline(buffer, "SLT %s, srgb_consts1.y, %s;\n", tmp3, fragcolor);
2786         shader_addline(buffer, "SGE %s, srgb_consts1.y, %s;\n", tmp4, fragcolor);
2787         /* Store the components > 0.0031308 in the destination */
2788         shader_addline(buffer, "MUL %s.xyz, %s, %s;\n", fragcolor, tmp1, tmp3);
2789         /* Add the components that are < 0.0031308 */
2790         shader_addline(buffer, "MAD %s.xyz, %s, %s, %s;\n", fragcolor, tmp2, tmp4, fragcolor);
2791         /* Move everything into result.color at once. Nvidia hardware cannot handle partial
2792         * result.color writes(.rgb first, then .a), or handle overwriting already written
2793         * components. The assembler uses a temporary register in this case, which is usually
2794         * not allocated from one of our registers that were used earlier.
2795         */
2796     }
2797     shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
2798     /* [0.0;1.0] clamping. Not needed, this is done implicitly */
2799 }
2800
2801 static const DWORD *find_loop_control_values(IWineD3DBaseShaderImpl *This, DWORD idx)
2802 {
2803     const local_constant *constant;
2804
2805     LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsI, local_constant, entry)
2806     {
2807         if (constant->idx == idx)
2808         {
2809             return constant->value;
2810         }
2811     }
2812     return NULL;
2813 }
2814
2815 static void init_ps_input(const IWineD3DPixelShaderImpl *This, const struct arb_ps_compile_args *args,
2816                           struct shader_arb_ctx_priv *priv)
2817 {
2818     const char *texcoords[8] =
2819     {
2820         "fragment.texcoord[0]", "fragment.texcoord[1]", "fragment.texcoord[2]", "fragment.texcoord[3]",
2821         "fragment.texcoord[4]", "fragment.texcoord[5]", "fragment.texcoord[6]", "fragment.texcoord[7]"
2822     };
2823     unsigned int i;
2824     const struct wined3d_shader_signature_element *sig = This->input_signature;
2825     const char *semantic_name;
2826     DWORD semantic_idx;
2827
2828     switch(args->super.vp_mode)
2829     {
2830         case pretransformed:
2831         case fixedfunction:
2832             /* The pixelshader has to collect the varyings on its own. In any case properly load
2833              * color0 and color1. In the case of pretransformed vertices also load texcoords. Set
2834              * other attribs to 0.0.
2835              *
2836              * For fixedfunction this behavior is correct, according to the tests. For pretransformed
2837              * we'd either need a replacement shader that can load other attribs like BINORMAL, or
2838              * load the texcoord attrib pointers to match the pixel shader signature
2839              */
2840             for(i = 0; i < MAX_REG_INPUT; i++)
2841             {
2842                 semantic_name = sig[i].semantic_name;
2843                 semantic_idx = sig[i].semantic_idx;
2844                 if(semantic_name == NULL) continue;
2845
2846                 if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
2847                 {
2848                     if(semantic_idx == 0) priv->ps_input[i] = "fragment.color.primary";
2849                     else if(semantic_idx == 1) priv->ps_input[i] = "fragment.color.secondary";
2850                     else priv->ps_input[i] = "0.0";
2851                 }
2852                 else if(args->super.vp_mode == fixedfunction)
2853                 {
2854                     priv->ps_input[i] = "0.0";
2855                 }
2856                 else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
2857                 {
2858                     if(semantic_idx < 8) priv->ps_input[i] = texcoords[semantic_idx];
2859                     else priv->ps_input[i] = "0.0";
2860                 }
2861                 else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
2862                 {
2863                     if(semantic_idx == 0) priv->ps_input[i] = "fragment.fogcoord";
2864                     else priv->ps_input[i] = "0.0";
2865                 }
2866                 else
2867                 {
2868                     priv->ps_input[i] = "0.0";
2869                 }
2870
2871                 TRACE("v%u, semantic %s%u is %s\n", i, semantic_name, semantic_idx, priv->ps_input[i]);
2872             }
2873             break;
2874
2875         case vertexshader:
2876             /* That one is easy. The vertex shaders provide v0-v7 in fragment.texcoord and v8 and v9 in
2877              * fragment.color
2878              */
2879             for(i = 0; i < 8; i++)
2880             {
2881                 priv->ps_input[i] = texcoords[i];
2882             }
2883             priv->ps_input[8] = "fragment.color.primary";
2884             priv->ps_input[9] = "fragment.color.secondary";
2885             break;
2886     }
2887 }
2888
2889 /* GL locking is done by the caller */
2890 static GLuint shader_arb_generate_pshader(IWineD3DPixelShaderImpl *This,
2891         SHADER_BUFFER *buffer, const struct arb_ps_compile_args *args, struct arb_ps_compiled_shader *compiled)
2892 {
2893     const shader_reg_maps* reg_maps = &This->baseShader.reg_maps;
2894     CONST DWORD *function = This->baseShader.function;
2895     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)This->baseShader.device)->adapter->gl_info;
2896     const local_constant *lconst;
2897     GLuint retval;
2898     char fragcolor[16];
2899     DWORD *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This), next_local, cur;
2900     struct shader_arb_ctx_priv priv_ctx;
2901     BOOL dcl_tmp = args->super.srgb_correction, dcl_td = FALSE;
2902     BOOL want_nv_prog = FALSE;
2903     struct arb_pshader_private *shader_priv = This->backend_priv;
2904
2905     char srgbtmp[4][4];
2906     unsigned int i, found = 0;
2907
2908     for(i = 0; i < This->baseShader.limits.temporary; i++) {
2909
2910         /* Don't overwrite the color source */
2911         if(This->color0_mov && i == This->color0_reg) continue;
2912         else if(reg_maps->shader_version.major < 2 && i == 0) continue;
2913
2914         if(reg_maps->temporary[i]) {
2915             sprintf(srgbtmp[found], "R%u", i);
2916             found++;
2917             if(found == 4) break;
2918         }
2919     }
2920
2921     switch(found) {
2922         case 4: dcl_tmp = FALSE; break;
2923         case 0:
2924             sprintf(srgbtmp[0], "TA");
2925             sprintf(srgbtmp[1], "TB");
2926             sprintf(srgbtmp[2], "TC");
2927             sprintf(srgbtmp[3], "TD");
2928             dcl_td = TRUE;
2929             break;
2930         case 1:
2931             sprintf(srgbtmp[1], "TA");
2932             sprintf(srgbtmp[2], "TB");
2933             sprintf(srgbtmp[3], "TC");
2934             break;
2935         case 2:
2936             sprintf(srgbtmp[2], "TA");
2937             sprintf(srgbtmp[3], "TB");
2938             break;
2939         case 3:
2940             sprintf(srgbtmp[3], "TA");
2941             break;
2942     }
2943
2944     /*  Create the hw ARB shader */
2945     memset(&priv_ctx, 0, sizeof(priv_ctx));
2946     priv_ctx.cur_ps_args = args;
2947     priv_ctx.compiled_fprog = compiled;
2948     init_ps_input(This, args, &priv_ctx);
2949     list_init(&priv_ctx.control_frames);
2950
2951     /* Avoid enabling NV_fragment_program* if we do not need it.
2952      *
2953      * Enabling GL_NV_fragment_program_option causes the driver to occupy a temporary register,
2954      * and it slows down the shader execution noticeably(about 5%). Usually our instruction emulation
2955      * is faster than what we gain from using higher native instructions. There are some things though
2956      * that cannot be emulated. In that case enable the extensions.
2957      * If the extension is enabled, instruction handlers that support both ways will use it.
2958      *
2959      * Testing shows no performance difference between OPTION NV_fragment_program2 and NV_fragment_program.
2960      * So enable the best we can get.
2961      */
2962     if(reg_maps->usesdsx || reg_maps->usesdsy || reg_maps->loop_depth > 0 || reg_maps->usestexldd ||
2963        reg_maps->usestexldl || reg_maps->usesfacing)
2964     {
2965         want_nv_prog = TRUE;
2966     }
2967
2968     shader_addline(buffer, "!!ARBfp1.0\n");
2969     if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM2)) {
2970         shader_addline(buffer, "OPTION NV_fragment_program2;\n");
2971         priv_ctx.target_version = NV3;
2972     } else if(want_nv_prog && GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION)) {
2973         shader_addline(buffer, "OPTION NV_fragment_program;\n");
2974         priv_ctx.target_version = NV2;
2975     } else {
2976         if(want_nv_prog)
2977         {
2978             /* This is an error - either we're advertising the wrong shader version, or aren't enforcing some
2979              * limits properly
2980              */
2981             ERR("The shader requires instructions that are not available in plain GL_ARB_fragment_program\n");
2982             ERR("Try GLSL\n");
2983         }
2984         priv_ctx.target_version = ARB;
2985     }
2986
2987     if(This->baseShader.reg_maps.highest_render_target > 0)
2988     {
2989         shader_addline(buffer, "OPTION ARB_draw_buffers;\n");
2990     }
2991
2992     if (reg_maps->shader_version.major < 3)
2993     {
2994         switch(args->super.fog) {
2995             case FOG_OFF:
2996                 break;
2997             case FOG_LINEAR:
2998                 shader_addline(buffer, "OPTION ARB_fog_linear;\n");
2999                 break;
3000             case FOG_EXP:
3001                 shader_addline(buffer, "OPTION ARB_fog_exp;\n");
3002                 break;
3003             case FOG_EXP2:
3004                 shader_addline(buffer, "OPTION ARB_fog_exp2;\n");
3005                 break;
3006         }
3007     }
3008
3009     /* For now always declare the temps. At least the Nvidia assembler optimizes completely
3010      * unused temps away(but occupies them for the whole shader if they're used once). Always
3011      * declaring them avoids tricky bookkeeping work
3012      */
3013     shader_addline(buffer, "TEMP TA;\n");      /* Used for modifiers */
3014     shader_addline(buffer, "TEMP TB;\n");      /* Used for modifiers */
3015     shader_addline(buffer, "TEMP TC;\n");      /* Used for modifiers */
3016     if(dcl_td) shader_addline(buffer, "TEMP TD;\n"); /* Used for sRGB writing */
3017     shader_addline(buffer, "PARAM coefdiv = { 0.5, 0.25, 0.125, 0.0625 };\n");
3018     shader_addline(buffer, "PARAM coefmul = { 2, 4, 8, 16 };\n");
3019     shader_addline(buffer, "PARAM one = { 1.0, 1.0, 1.0, 1.0 };\n");
3020
3021     if (reg_maps->shader_version.major < 2)
3022     {
3023         strcpy(fragcolor, "R0");
3024     } else {
3025         if(args->super.srgb_correction) {
3026             if(This->color0_mov) {
3027                 sprintf(fragcolor, "R%u", This->color0_reg);
3028             } else {
3029                 shader_addline(buffer, "TEMP TMP_COLOR;\n");
3030                 strcpy(fragcolor, "TMP_COLOR");
3031             }
3032         } else {
3033             strcpy(fragcolor, "result.color");
3034         }
3035     }
3036
3037     if(args->super.srgb_correction) {
3038         shader_addline(buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
3039                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
3040         shader_addline(buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
3041                        srgb_sub_high, 0.0, 0.0, 0.0);
3042     }
3043
3044     /* Base Declarations */
3045     next_local = shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION,
3046             lconst_map, NULL, &priv_ctx);
3047
3048     for(i = 0; i < (sizeof(reg_maps->bumpmat) / sizeof(reg_maps->bumpmat[0])); i++) {
3049         if(!reg_maps->bumpmat[i]) continue;
3050
3051         cur = compiled->numbumpenvmatconsts;
3052         compiled->bumpenvmatconst[cur].const_num = WINED3D_CONST_NUM_UNUSED;
3053         compiled->bumpenvmatconst[cur].texunit = i;
3054         compiled->luminanceconst[cur].const_num = WINED3D_CONST_NUM_UNUSED;
3055         compiled->luminanceconst[cur].texunit = i;
3056
3057         /* We can fit the constants into the constant limit for sure because texbem, texbeml, bem and beml are only supported
3058          * in 1.x shaders, and GL_ARB_fragment_program has a constant limit of 24 constants. So in the worst case we're loading
3059          * 8 shader constants, 8 bump matrices and 8 luminance parameters and are perfectly fine. (No NP2 fixup on bumpmapped
3060          * textures due to conditional NP2 restrictions)
3061          *
3062          * Use local constants to load the bump env parameters, not program.env. This avoids collisions with d3d constants of
3063          * shaders in newer shader models. Since the bump env parameters have to share their space with NP2 fixup constants,
3064          * their location is shader dependent anyway and they cannot be loaded globally.
3065          */
3066         compiled->bumpenvmatconst[cur].const_num = next_local++;
3067         shader_addline(buffer, "PARAM bumpenvmat%d = program.local[%d];\n",
3068                        i, compiled->bumpenvmatconst[cur].const_num);
3069         compiled->numbumpenvmatconsts = cur + 1;
3070
3071         if(!reg_maps->luminanceparams[i]) continue;
3072
3073         compiled->luminanceconst[cur].const_num = next_local++;
3074         shader_addline(buffer, "PARAM luminance%d = program.local[%d];\n",
3075                        i, compiled->luminanceconst[cur].const_num);
3076     }
3077
3078     for(i = 0; i < MAX_CONST_I; i++)
3079     {
3080         compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
3081         if (reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
3082         {
3083             const DWORD *control_values = find_loop_control_values((IWineD3DBaseShaderImpl *) This, i);
3084
3085             if(control_values)
3086             {
3087                 shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
3088                                 control_values[0], control_values[1], control_values[2]);
3089             }
3090             else
3091             {
3092                 compiled->int_consts[i] = next_local;
3093                 compiled->num_int_consts++;
3094                 shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
3095             }
3096         }
3097     }
3098
3099     if(reg_maps->vpos || reg_maps->usesdsy)
3100     {
3101         compiled->ycorrection = next_local;
3102         shader_addline(buffer, "PARAM ycorrection = program.local[%u];\n", next_local++);
3103
3104         if(reg_maps->vpos)
3105         {
3106             shader_addline(buffer, "TEMP vpos;\n");
3107             /* ycorrection.x: Backbuffer height(onscreen) or 0(offscreen).
3108              * ycorrection.y: -1.0(onscreen), 1.0(offscreen)
3109              * ycorrection.z: 1.0
3110              * ycorrection.w: 0.0
3111              */
3112             shader_addline(buffer, "MAD vpos, fragment.position, ycorrection.zyww, ycorrection.wxww;\n");
3113             shader_addline(buffer, "FLR vpos.xy, vpos;\n");
3114         }
3115     }
3116     else
3117     {
3118         compiled->ycorrection = WINED3D_CONST_NUM_UNUSED;
3119     }
3120
3121     if(shader_priv->clipplane_emulation)
3122     {
3123         shader_addline(buffer, "KIL fragment.texcoord[%u];\n", shader_priv->clipplane_emulation - 1);
3124     }
3125
3126     /* Base Shader Body */
3127     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
3128
3129     if(args->super.srgb_correction) {
3130         arbfp_add_sRGB_correction(buffer, fragcolor, srgbtmp[0], srgbtmp[1], srgbtmp[2], srgbtmp[3],
3131                                   priv_ctx.target_version >= NV2);
3132     } else if(reg_maps->shader_version.major < 2) {
3133         shader_addline(buffer, "MOV result.color, %s;\n", fragcolor);
3134     }
3135     shader_addline(buffer, "END\n");
3136
3137     /* TODO: change to resource.glObjectHandle or something like that */
3138     GL_EXTCALL(glGenProgramsARB(1, &retval));
3139
3140     TRACE("Creating a hw pixel shader, prg=%d\n", retval);
3141     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, retval));
3142
3143     TRACE("Created hw pixel shader, prg=%d\n", retval);
3144     /* Create the program and check for errors */
3145     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3146                buffer->bsize, buffer->buffer));
3147
3148     if (glGetError() == GL_INVALID_OPERATION) {
3149         GLint errPos;
3150         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
3151         FIXME("HW PixelShader Error at position %d: %s\n",
3152               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3153         retval = 0;
3154     }
3155
3156     /* Load immediate constants */
3157     if(lconst_map) {
3158         LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
3159             const float *value = (const float *)lconst->value;
3160             GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, lconst_map[lconst->idx], value));
3161             checkGLcall("glProgramLocalParameter4fvARB");
3162         }
3163         HeapFree(GetProcessHeap(), 0, lconst_map);
3164     }
3165
3166     return retval;
3167 }
3168
3169 static int compare_sig(const struct wined3d_shader_signature_element *sig1, const struct wined3d_shader_signature_element *sig2)
3170 {
3171     unsigned int i;
3172     int ret;
3173
3174     for(i = 0; i < MAX_REG_INPUT; i++)
3175     {
3176         if(sig1[i].semantic_name == NULL || sig2[i].semantic_name == NULL)
3177         {
3178             /* Compare pointers, not contents. One string is NULL(element does not exist), the other one is not NULL */
3179             if(sig1[i].semantic_name != sig2[i].semantic_name) return sig1[i].semantic_name < sig2[i].semantic_name ? -1 : 1;
3180             continue;
3181         }
3182
3183         ret = strcmp(sig1[i].semantic_name, sig2[i].semantic_name);
3184         if(ret != 0) return ret;
3185         if(sig1[i].semantic_idx    != sig2[i].semantic_idx)    return sig1[i].semantic_idx    < sig2[i].semantic_idx    ? -1 : 1;
3186         if(sig1[i].sysval_semantic != sig2[i].sysval_semantic) return sig1[i].sysval_semantic < sig2[i].sysval_semantic ? -1 : 1;
3187         if(sig1[i].component_type  != sig2[i].component_type)  return sig1[i].sysval_semantic < sig2[i].component_type  ? -1 : 1;
3188         if(sig1[i].register_idx    != sig2[i].register_idx)    return sig1[i].register_idx    < sig2[i].register_idx    ? -1 : 1;
3189         if(sig1[i].mask            != sig2->mask)              return sig1[i].mask            < sig2[i].mask            ? -1 : 1;
3190     }
3191     return 0;
3192 }
3193
3194 static struct wined3d_shader_signature_element *clone_sig(const struct wined3d_shader_signature_element *sig)
3195 {
3196     struct wined3d_shader_signature_element *new;
3197     int i;
3198     char *name;
3199
3200     new = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*new) * MAX_REG_INPUT);
3201     for(i = 0; i < MAX_REG_INPUT; i++)
3202     {
3203         if(sig[i].semantic_name == NULL)
3204         {
3205             continue;
3206         }
3207
3208         new[i] = sig[i];
3209         /* Clone the semantic string */
3210         name = HeapAlloc(GetProcessHeap(), 0, strlen(sig[i].semantic_name) + 1);
3211         strcpy(name, sig[i].semantic_name);
3212         new[i].semantic_name = name;
3213     }
3214     return new;
3215 }
3216
3217 static DWORD find_input_signature(struct shader_arb_priv *priv, const struct wined3d_shader_signature_element *sig)
3218 {
3219     struct wine_rb_entry *entry = wine_rb_get(&priv->signature_tree, sig);
3220     struct ps_signature *found_sig;
3221
3222     if(entry != NULL)
3223     {
3224         found_sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
3225         TRACE("Found existing signature %u\n", found_sig->idx);
3226         return found_sig->idx;
3227     }
3228     found_sig = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*sig));
3229     found_sig->sig = clone_sig(sig);
3230     found_sig->idx = priv->ps_sig_number++;
3231     TRACE("New signature stored and assigned number %u\n", found_sig->idx);
3232     if(wine_rb_put(&priv->signature_tree, sig, &found_sig->entry) == -1)
3233     {
3234         ERR("Failed to insert program entry.\n");
3235     }
3236     return found_sig->idx;
3237 }
3238
3239 static void init_output_registers(IWineD3DVertexShaderImpl *shader, DWORD sig_num, struct shader_arb_ctx_priv *priv_ctx)
3240 {
3241     unsigned int i, j;
3242     static const char *texcoords[8] =
3243     {
3244         "result.texcoord[0]", "result.texcoord[1]", "result.texcoord[2]", "result.texcoord[3]",
3245         "result.texcoord[4]", "result.texcoord[5]", "result.texcoord[6]", "result.texcoord[7]"
3246     };
3247     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) shader->baseShader.device;
3248     const struct wined3d_shader_signature_element *sig;
3249     const char *semantic_name;
3250     DWORD semantic_idx, reg_idx;
3251
3252     /* Write generic input varyings 0 to 7 to result.texcoord[], varying 8 to result.color.primary
3253      * and varying 9 to result.color.secondary
3254      */
3255     const char *decl_idx_to_string[MAX_REG_INPUT] =
3256     {
3257         texcoords[0], texcoords[1], texcoords[2], texcoords[3],
3258         texcoords[4], texcoords[5], texcoords[6], texcoords[7],
3259         "result.color.primary", "result.color.secondary"
3260     };
3261
3262     if(sig_num == ~0)
3263     {
3264         TRACE("Pixel shader uses builtin varyings\n");
3265         /* Map builtins to builtins */
3266         for(i = 0; i < 8; i++)
3267         {
3268             priv_ctx->texcrd_output[i] = texcoords[i];
3269         }
3270         priv_ctx->color_output[0] = "result.color.primary";
3271         priv_ctx->color_output[1] = "result.color.secondary";
3272         priv_ctx->fog_output = "result.fogcoord";
3273
3274         /* Map declared regs to builtins. Use "TA" to /dev/null unread output */
3275         for(i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); i++)
3276         {
3277             semantic_name = shader->output_signature[i].semantic_name;
3278             if(semantic_name == NULL) continue;
3279
3280             if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_POSITION))
3281             {
3282                 TRACE("o%u is TMP_OUT\n", i);
3283                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "TMP_OUT";
3284                 else priv_ctx->vs_output[i] = "TA";
3285             }
3286             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_PSIZE))
3287             {
3288                 TRACE("o%u is result.pointsize\n", i);
3289                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "result.pointsize";
3290                 else priv_ctx->vs_output[i] = "TA";
3291             }
3292             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3293             {
3294                 TRACE("o%u is result.color.?, idx %u\n", i, shader->output_signature[i].semantic_idx);
3295                 if(shader->output_signature[i].semantic_idx == 0) priv_ctx->vs_output[i] = "result.color.primary";
3296                 else if(shader->output_signature[i].semantic_idx == 1) priv_ctx->vs_output[i] = "result.color.secondary";
3297                 else priv_ctx->vs_output[i] = "TA";
3298             }
3299             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3300             {
3301                 TRACE("o%u is %s\n", i, texcoords[shader->output_signature[i].semantic_idx]);
3302                 if(shader->output_signature[i].semantic_idx >= 8) priv_ctx->vs_output[i] = "TA";
3303                 else priv_ctx->vs_output[i] = texcoords[shader->output_signature[i].semantic_idx];
3304             }
3305             else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3306             {
3307                 TRACE("o%u is result.fogcoord\n", i);
3308                 if(shader->output_signature[i].semantic_idx > 0) priv_ctx->vs_output[i] = "TA";
3309                 else priv_ctx->vs_output[i] = "result.fogcoord";
3310             }
3311             else
3312             {
3313                 priv_ctx->vs_output[i] = "TA";
3314             }
3315         }
3316         return;
3317     }
3318
3319     /* Instead of searching for the signature in the signature list, read the one from the current pixel shader.
3320      * Its maybe not the shader where the signature came from, but it is the same signature and faster to find
3321      */
3322     sig = ((IWineD3DPixelShaderImpl *)device->stateBlock->pixelShader)->input_signature;
3323     TRACE("Pixel shader uses declared varyings\n");
3324
3325     /* Map builtin to declared. /dev/null the results by default to the TA temp reg */
3326     for(i = 0; i < 8; i++)
3327     {
3328         priv_ctx->texcrd_output[i] = "TA";
3329     }
3330     priv_ctx->color_output[0] = "TA";
3331     priv_ctx->color_output[1] = "TA";
3332     priv_ctx->fog_output = "TA";
3333
3334     for(i = 0; i < MAX_REG_INPUT; i++)
3335     {
3336         semantic_name = sig[i].semantic_name;
3337         semantic_idx = sig[i].semantic_idx;
3338         reg_idx = sig[i].register_idx;
3339         if(semantic_name == NULL) continue;
3340
3341         /* If a declared input register is not written by builtin arguments, don't write to it.
3342          * GL_NV_vertex_program makes sure the input defaults to 0.0, which is correct with D3D
3343          *
3344          * Don't care about POSITION and PSIZE here - this is a builtin vertex shader, position goes
3345          * to TMP_OUT in any case
3346          */
3347         if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_TEXCOORD))
3348         {
3349             if(semantic_idx < 8) priv_ctx->texcrd_output[semantic_idx] = decl_idx_to_string[reg_idx];
3350         }
3351         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_COLOR))
3352         {
3353             if(semantic_idx < 2) priv_ctx->color_output[semantic_idx] = decl_idx_to_string[reg_idx];
3354         }
3355         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_FOG))
3356         {
3357             if(semantic_idx == 0) priv_ctx->fog_output = decl_idx_to_string[reg_idx];
3358         }
3359     }
3360
3361     /* Map declared to declared */
3362     for(i = 0; i < (sizeof(shader->output_signature) / sizeof(*shader->output_signature)); i++)
3363     {
3364         /* Write unread output to TA to throw them away */
3365         priv_ctx->vs_output[i] = "TA";
3366         semantic_name = shader->output_signature[i].semantic_name;
3367         if(semantic_name == NULL)
3368         {
3369             continue;
3370         }
3371
3372         if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_POSITION) &&
3373            shader->output_signature[i].semantic_idx == 0)
3374         {
3375             priv_ctx->vs_output[i] = "TMP_OUT";
3376             continue;
3377         }
3378         else if(shader_match_semantic(semantic_name, WINED3DDECLUSAGE_PSIZE) &&
3379            shader->output_signature[i].semantic_idx == 0)
3380         {
3381             priv_ctx->vs_output[i] = "result.pointsize";
3382             continue;
3383         }
3384
3385         for(j = 0; j < MAX_REG_INPUT; j++)
3386         {
3387             if(sig[j].semantic_name == NULL)
3388             {
3389                 continue;
3390             }
3391
3392             if(strcmp(sig[j].semantic_name, semantic_name) == 0 &&
3393                sig[j].semantic_idx == shader->output_signature[i].semantic_idx)
3394             {
3395                 priv_ctx->vs_output[i] = decl_idx_to_string[sig[j].register_idx];
3396             }
3397         }
3398     }
3399 }
3400
3401 /* GL locking is done by the caller */
3402 static GLuint shader_arb_generate_vshader(IWineD3DVertexShaderImpl *This,
3403         SHADER_BUFFER *buffer, const struct arb_vs_compile_args *args, struct arb_vs_compiled_shader *compiled)
3404 {
3405     const shader_reg_maps *reg_maps = &This->baseShader.reg_maps;
3406     CONST DWORD *function = This->baseShader.function;
3407     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)This->baseShader.device;
3408     const WineD3D_GL_Info *gl_info = &device->adapter->gl_info;
3409     const local_constant *lconst;
3410     GLuint ret;
3411     DWORD next_local, *lconst_map = local_const_mapping((IWineD3DBaseShaderImpl *) This);
3412     struct shader_arb_ctx_priv priv_ctx;
3413     unsigned int i;
3414     DWORD num_clipplanes = 0;
3415
3416     memset(&priv_ctx, 0, sizeof(priv_ctx));
3417     priv_ctx.cur_vs_args = args;
3418     list_init(&priv_ctx.control_frames);
3419     init_output_registers(This, args->ps_signature, &priv_ctx);
3420
3421     /*  Create the hw ARB shader */
3422     shader_addline(buffer, "!!ARBvp1.0\n");
3423
3424     /* Always enable the NV extension if available. Unlike fragment shaders, there is no
3425      * mesurable performance penalty, and we can always make use of it for clipplanes.
3426      */
3427     if(GL_SUPPORT(NV_VERTEX_PROGRAM3)) {
3428         shader_addline(buffer, "OPTION NV_vertex_program3;\n");
3429         priv_ctx.target_version = NV3;
3430         shader_addline(buffer, "ADDRESS aL;\n");
3431     } else if(GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) {
3432         shader_addline(buffer, "OPTION NV_vertex_program2;\n");
3433         priv_ctx.target_version = NV2;
3434         shader_addline(buffer, "ADDRESS aL;\n");
3435     } else {
3436         priv_ctx.target_version = ARB;
3437     }
3438
3439     shader_addline(buffer, "TEMP TMP_OUT;\n");
3440     if(need_helper_const(gl_info)) {
3441         shader_addline(buffer, "PARAM helper_const = { 2.0, -1.0, %d.0, 0.0 };\n", This->rel_offset);
3442     }
3443     if(need_mova_const((IWineD3DBaseShader *) This, gl_info)) {
3444         shader_addline(buffer, "PARAM mova_const = { 0.5, 0.0, 2.0, 1.0 };\n");
3445         shader_addline(buffer, "TEMP A0_SHADOW;\n");
3446     }
3447
3448     shader_addline(buffer, "TEMP TA;\n");
3449
3450     /* Base Declarations */
3451     next_local = shader_generate_arb_declarations( (IWineD3DBaseShader*) This, reg_maps, buffer, &GLINFO_LOCATION,
3452             lconst_map, &num_clipplanes, &priv_ctx);
3453
3454     for(i = 0; i < MAX_CONST_I; i++)
3455     {
3456         compiled->int_consts[i] = WINED3D_CONST_NUM_UNUSED;
3457         if(reg_maps->integer_constants & (1 << i) && priv_ctx.target_version >= NV2)
3458         {
3459             const DWORD *control_values = find_loop_control_values((IWineD3DBaseShaderImpl *) This, i);
3460
3461             if(control_values)
3462             {
3463                 shader_addline(buffer, "PARAM I%u = {%u, %u, %u, -1};\n", i,
3464                                 control_values[0], control_values[1], control_values[2]);
3465             }
3466             else
3467             {
3468                 compiled->int_consts[i] = next_local;
3469                 compiled->num_int_consts++;
3470                 shader_addline(buffer, "PARAM I%u = program.local[%u];\n", i, next_local++);
3471             }
3472         }
3473     }
3474
3475     /* We need a constant to fixup the final position */
3476     shader_addline(buffer, "PARAM posFixup = program.local[%u];\n", next_local);
3477     compiled->pos_fixup = next_local++;
3478
3479     /* Initialize output parameters. GL_ARB_vertex_program does not require special initialization values
3480      * for output parameters. D3D in theory does not do that either, but some applications depend on a
3481      * proper initialization of the secondary color, and programs using the fixed function pipeline without
3482      * a replacement shader depend on the texcoord.w being set properly.
3483      *
3484      * GL_NV_vertex_program defines that all output values are initialized to {0.0, 0.0, 0.0, 1.0}. This
3485      * assertion is in effect even when using GL_ARB_vertex_program without any NV specific additions. So
3486      * skip this if NV_vertex_program is supported. Otherwise, initialize the secondary color. For the tex-
3487      * coords, we have a flag in the opengl caps. Many cards do not require the texcoord being set, and
3488      * this can eat a number of instructions, so skip it unless this cap is set as well
3489      */
3490     if(!GL_SUPPORT(NV_VERTEX_PROGRAM)) {
3491         shader_addline(buffer, "MOV result.color.secondary, -helper_const.wwwy;\n");
3492
3493         if((GLINFO_LOCATION).set_texcoord_w && !device->frag_pipe->ffp_proj_control) {
3494             int i;
3495             for(i = 0; i < min(8, MAX_REG_TEXCRD); i++) {
3496                 if(This->baseShader.reg_maps.texcoord_mask[i] != 0 &&
3497                 This->baseShader.reg_maps.texcoord_mask[i] != WINED3DSP_WRITEMASK_ALL) {
3498                     shader_addline(buffer, "MOV result.texcoord[%u].w, -helper_const.y;\n", i);
3499                 }
3500             }
3501         }
3502     }
3503
3504     /* Base Shader Body */
3505     shader_generate_main((IWineD3DBaseShader *)This, buffer, reg_maps, function, &priv_ctx);
3506
3507     /* The D3DRS_FOGTABLEMODE render state defines if the shader-generated fog coord is used
3508      * or if the fragment depth is used. If the fragment depth is used(FOGTABLEMODE != NONE),
3509      * the fog frag coord is thrown away. If the fog frag coord is used, but not written by
3510      * the shader, it is set to 0.0(fully fogged, since start = 1.0, end = 0.0)
3511      */
3512     if(args->super.fog_src == VS_FOG_Z) {
3513         shader_addline(buffer, "MOV result.fogcoord, TMP_OUT.z;\n");
3514     } else if (!reg_maps->fog) {
3515         /* posFixup.x is always 1.0, so we can savely use it */
3516         shader_addline(buffer, "ADD result.fogcoord, posFixup.x, -posFixup.x;\n");
3517     }
3518
3519     /* Write the final position.
3520      *
3521      * OpenGL coordinates specify the center of the pixel while d3d coords specify
3522      * the corner. The offsets are stored in z and w in posFixup. posFixup.y contains
3523      * 1.0 or -1.0 to turn the rendering upside down for offscreen rendering. PosFixup.x
3524      * contains 1.0 to allow a mad, but arb vs swizzles are too restricted for that.
3525      */
3526     shader_addline(buffer, "MUL TA, posFixup, TMP_OUT.w;\n");
3527     shader_addline(buffer, "ADD TMP_OUT.x, TMP_OUT.x, TA.z;\n");
3528     shader_addline(buffer, "MAD TMP_OUT.y, TMP_OUT.y, posFixup.y, TA.w;\n");
3529
3530     if(use_nv_clip(gl_info) && priv_ctx.target_version >= NV2)
3531     {
3532         for(i = 0; i < num_clipplanes; i++)
3533         {
3534             shader_addline(buffer, "DP4 result.clip[%u].x, TMP_OUT, state.clip[%u].plane;\n", i, i);
3535         }
3536     }
3537     else if(args->boolclip.clip_control[0])
3538     {
3539         unsigned int cur_clip = 0;
3540         char component[4] = {'x', 'y', 'z', 'w'};
3541
3542         for(i = 0; i < GL_LIMITS(clipplanes); i++)
3543         {
3544             if(args->boolclip.clip_control[1] & (1 << i))
3545             {
3546                 shader_addline(buffer, "DP4 TA.%c, TMP_OUT, state.clip[%u].plane;\n",
3547                                component[cur_clip++], i);
3548             }
3549         }
3550         switch(cur_clip)
3551         {
3552             case 0:
3553                 shader_addline(buffer, "MOV TA, -helper_const.w;\n");
3554                 break;
3555             case 1:
3556                 shader_addline(buffer, "MOV TA.yzw, -helper_const.w;\n");
3557                 break;
3558             case 2:
3559                 shader_addline(buffer, "MOV TA.zw, -helper_const.w;\n");
3560                 break;
3561             case 3:
3562                 shader_addline(buffer, "MOV TA.w, -helper_const.w;\n");
3563                 break;
3564         }
3565         shader_addline(buffer, "MOV result.texcoord[%u], TA;\n",
3566                        args->boolclip.clip_control[0] - 1);
3567     }
3568
3569     /* Z coord [0;1]->[-1;1] mapping, see comment in transform_projection in state.c
3570      * and the glsl equivalent
3571      */
3572     if(need_helper_const(gl_info)) {
3573         shader_addline(buffer, "MAD TMP_OUT.z, TMP_OUT.z, helper_const.x, -TMP_OUT.w;\n");
3574     } else {
3575         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, TMP_OUT.z;\n");
3576         shader_addline(buffer, "ADD TMP_OUT.z, TMP_OUT.z, -TMP_OUT.w;\n");
3577     }
3578
3579     shader_addline(buffer, "MOV result.position, TMP_OUT;\n");
3580
3581     shader_addline(buffer, "END\n");
3582
3583     /* TODO: change to resource.glObjectHandle or something like that */
3584     GL_EXTCALL(glGenProgramsARB(1, &ret));
3585
3586     TRACE("Creating a hw vertex shader, prg=%d\n", ret);
3587     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, ret));
3588
3589     TRACE("Created hw vertex shader, prg=%d\n", ret);
3590     /* Create the program and check for errors */
3591     GL_EXTCALL(glProgramStringARB(GL_VERTEX_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB,
3592                buffer->bsize, buffer->buffer));
3593
3594     if (glGetError() == GL_INVALID_OPERATION) {
3595         GLint errPos;
3596         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &errPos);
3597         FIXME("HW VertexShader Error at position %d: %s\n",
3598               errPos, debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
3599         ret = -1;
3600     } else {
3601         /* Load immediate constants */
3602         if(lconst_map) {
3603             LIST_FOR_EACH_ENTRY(lconst, &This->baseShader.constantsF, local_constant, entry) {
3604                 const float *value = (const float *)lconst->value;
3605                 GL_EXTCALL(glProgramLocalParameter4fvARB(GL_VERTEX_PROGRAM_ARB, lconst_map[lconst->idx], value));
3606             }
3607         }
3608     }
3609     HeapFree(GetProcessHeap(), 0, lconst_map);
3610
3611     return ret;
3612 }
3613
3614 static void find_clip_texcoord(IWineD3DPixelShaderImpl *ps)
3615 {
3616     struct arb_pshader_private *shader_priv = ps->backend_priv;
3617     int i;
3618     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)ps->baseShader.device)->adapter->gl_info;
3619
3620     /* See if we can use fragment.texcoord[7] for clipplane emulation
3621      *
3622      * Don't do this if it is not supported, or fragment.texcoord[7] is used
3623      */
3624     if(ps->baseShader.reg_maps.shader_version.major < 3)
3625     {
3626         for(i = GL_LIMITS(texture_stages); i > 0; i--)
3627         {
3628             if(!ps->baseShader.reg_maps.texcoord[i - 1])
3629             {
3630                 shader_priv->clipplane_emulation = i;
3631                 return;
3632             }
3633         }
3634         WARN("Did not find a free clip reg(2.0)\n");
3635     }
3636     else
3637     {
3638         for(i = GL_LIMITS(texture_stages); i > 0; i--)
3639         {
3640             if(!(ps->baseShader.reg_maps.input_registers & (1 << (i - 1))))
3641             {
3642                 shader_priv->clipplane_emulation = i;
3643                 return;
3644             }
3645         }
3646         WARN("Did not find a free clip reg(3.0)\n");
3647     }
3648 }
3649
3650 /* GL locking is done by the caller */
3651 static struct arb_ps_compiled_shader *find_arb_pshader(IWineD3DPixelShaderImpl *shader, const struct arb_ps_compile_args *args)
3652 {
3653     UINT i;
3654     DWORD new_size;
3655     struct arb_ps_compiled_shader *new_array;
3656     SHADER_BUFFER buffer;
3657     struct arb_pshader_private *shader_data;
3658     GLuint ret;
3659
3660     if(!shader->backend_priv) {
3661         IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) shader->baseShader.device;
3662         struct shader_arb_priv *priv = device->shader_priv;
3663
3664         shader->backend_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
3665         shader_data = shader->backend_priv;
3666         shader_data->clamp_consts = shader->baseShader.reg_maps.shader_version.major == 1;
3667
3668         if(shader->baseShader.reg_maps.shader_version.major < 3) shader_data->input_signature_idx = ~0;
3669         else shader_data->input_signature_idx = find_input_signature(priv, shader->input_signature);
3670
3671         shader_data->has_signature_idx = TRUE;
3672         TRACE("Shader got assigned input signature index %u\n", shader_data->input_signature_idx);
3673
3674         if(!device->vs_clipping) find_clip_texcoord(shader);
3675     }
3676     shader_data = shader->backend_priv;
3677
3678     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
3679      * so a linear search is more performant than a hashmap or a binary search
3680      * (cache coherency etc)
3681      */
3682     for(i = 0; i < shader_data->num_gl_shaders; i++) {
3683         if(memcmp(&shader_data->gl_shaders[i].args, args, sizeof(*args)) == 0) {
3684             return &shader_data->gl_shaders[i];
3685         }
3686     }
3687
3688     TRACE("No matching GL shader found, compiling a new shader\n");
3689     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
3690         if (shader_data->num_gl_shaders)
3691         {
3692             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
3693             new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
3694                                     new_size * sizeof(*shader_data->gl_shaders));
3695         } else {
3696             new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
3697             new_size = 1;
3698         }
3699
3700         if(!new_array) {
3701             ERR("Out of memory\n");
3702             return 0;
3703         }
3704         shader_data->gl_shaders = new_array;
3705         shader_data->shader_array_size = new_size;
3706     }
3707
3708     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
3709
3710     pixelshader_update_samplers(&shader->baseShader.reg_maps,
3711             ((IWineD3DDeviceImpl *)shader->baseShader.device)->stateBlock->textures);
3712
3713     shader_buffer_init(&buffer);
3714     ret = shader_arb_generate_pshader(shader, &buffer, args,
3715                                       &shader_data->gl_shaders[shader_data->num_gl_shaders]);
3716     shader_buffer_free(&buffer);
3717     shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
3718
3719     return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
3720 }
3721
3722 static inline BOOL vs_args_equal(const struct arb_vs_compile_args *stored, const struct arb_vs_compile_args *new,
3723                                  const DWORD use_map, BOOL skip_int) {
3724     if((stored->super.swizzle_map & use_map) != new->super.swizzle_map) return FALSE;
3725     if(stored->super.fog_src != new->super.fog_src) return FALSE;
3726     if(stored->boolclip_compare != new->boolclip_compare) return FALSE;
3727     if(stored->ps_signature != new->ps_signature) return FALSE;
3728     if(stored->vertex_samplers_compare != new->vertex_samplers_compare) return FALSE;
3729     if(skip_int) return TRUE;
3730
3731     return memcmp(stored->loop_ctrl, new->loop_ctrl, sizeof(stored->loop_ctrl)) == 0;
3732 }
3733
3734 static struct arb_vs_compiled_shader *find_arb_vshader(IWineD3DVertexShaderImpl *shader, const struct arb_vs_compile_args *args)
3735 {
3736     UINT i;
3737     DWORD new_size;
3738     struct arb_vs_compiled_shader *new_array;
3739     DWORD use_map = ((IWineD3DDeviceImpl *)shader->baseShader.device)->strided_streams.use_map;
3740     SHADER_BUFFER buffer;
3741     struct arb_vshader_private *shader_data;
3742     GLuint ret;
3743     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
3744
3745     if(!shader->backend_priv) {
3746         shader->backend_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data));
3747     }
3748     shader_data = shader->backend_priv;
3749
3750     /* Usually we have very few GL shaders for each d3d shader(just 1 or maybe 2),
3751      * so a linear search is more performant than a hashmap or a binary search
3752      * (cache coherency etc)
3753      */
3754     for(i = 0; i < shader_data->num_gl_shaders; i++) {
3755         if(vs_args_equal(&shader_data->gl_shaders[i].args, args, use_map, GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION))) {
3756             return &shader_data->gl_shaders[i];
3757         }
3758     }
3759
3760     TRACE("No matching GL shader found, compiling a new shader\n");
3761
3762     if(shader_data->shader_array_size == shader_data->num_gl_shaders) {
3763         if (shader_data->num_gl_shaders)
3764         {
3765             new_size = shader_data->shader_array_size + max(1, shader_data->shader_array_size / 2);
3766             new_array = HeapReAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, shader_data->gl_shaders,
3767                                     new_size * sizeof(*shader_data->gl_shaders));
3768         } else {
3769             new_array = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*shader_data->gl_shaders));
3770             new_size = 1;
3771         }
3772
3773         if(!new_array) {
3774             ERR("Out of memory\n");
3775             return 0;
3776         }
3777         shader_data->gl_shaders = new_array;
3778         shader_data->shader_array_size = new_size;
3779     }
3780
3781     shader_data->gl_shaders[shader_data->num_gl_shaders].args = *args;
3782
3783     shader_buffer_init(&buffer);
3784     ret = shader_arb_generate_vshader(shader, &buffer, args,
3785             &shader_data->gl_shaders[shader_data->num_gl_shaders]);
3786     shader_buffer_free(&buffer);
3787     shader_data->gl_shaders[shader_data->num_gl_shaders].prgId = ret;
3788
3789     return &shader_data->gl_shaders[shader_data->num_gl_shaders++];
3790 }
3791
3792 static inline void find_arb_ps_compile_args(IWineD3DPixelShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
3793         struct arb_ps_compile_args *args)
3794 {
3795     int i;
3796     WORD int_skip;
3797     const WineD3D_GL_Info *gl_info = &((IWineD3DDeviceImpl *)shader->baseShader.device)->adapter->gl_info;
3798     find_ps_compile_args(shader, stateblock, &args->super);
3799
3800     /* This forces all local boolean constants to 1 to make them stateblock independent */
3801     args->bools = shader->baseShader.reg_maps.local_bool_consts;
3802
3803     for(i = 0; i < MAX_CONST_B; i++)
3804     {
3805         if(stateblock->pixelShaderConstantB[i]) args->bools |= ( 1 << i);
3806     }
3807
3808     /* Skip if unused or local, or supported natively */
3809     int_skip = ~shader->baseShader.reg_maps.integer_constants | shader->baseShader.reg_maps.local_int_consts;
3810     if(int_skip == 0xffff || GL_SUPPORT(NV_FRAGMENT_PROGRAM_OPTION))
3811     {
3812         memset(&args->loop_ctrl, 0, sizeof(args->loop_ctrl));
3813         return;
3814     }
3815
3816     for(i = 0; i < MAX_CONST_I; i++)
3817     {
3818         if(int_skip & (1 << i))
3819         {
3820             args->loop_ctrl[i][0] = 0;
3821             args->loop_ctrl[i][1] = 0;
3822             args->loop_ctrl[i][2] = 0;
3823         }
3824         else
3825         {
3826             args->loop_ctrl[i][0] = stateblock->pixelShaderConstantI[i * 4];
3827             args->loop_ctrl[i][1] = stateblock->pixelShaderConstantI[i * 4 + 1];
3828             args->loop_ctrl[i][2] = stateblock->pixelShaderConstantI[i * 4 + 2];
3829         }
3830     }
3831 }
3832
3833 static inline void find_arb_vs_compile_args(IWineD3DVertexShaderImpl *shader, IWineD3DStateBlockImpl *stateblock,
3834         struct arb_vs_compile_args *args)
3835 {
3836     int i;
3837     WORD int_skip;
3838     IWineD3DDeviceImpl *dev = (IWineD3DDeviceImpl *)shader->baseShader.device;
3839     const WineD3D_GL_Info *gl_info = &dev->adapter->gl_info;
3840     find_vs_compile_args(shader, stateblock, &args->super);
3841
3842     /* This forces all local boolean constants to 1 to make them stateblock independent */
3843     args->boolclip.bools = shader->baseShader.reg_maps.local_bool_consts;
3844
3845     if(use_ps(stateblock))
3846     {
3847         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) stateblock->pixelShader;
3848         struct arb_pshader_private *shader_priv = ps->backend_priv;
3849         args->ps_signature = shader_priv->input_signature_idx;
3850
3851         args->boolclip.clip_control[0] = shader_priv->clipplane_emulation;
3852     }
3853     else
3854     {
3855         args->ps_signature = ~0;
3856         if(dev->vs_clipping)
3857         {
3858             args->boolclip.clip_control[0] = 0;
3859         }
3860         else
3861         {
3862             args->boolclip.clip_control[0] = ffp_clip_emul(stateblock) ? GL_LIMITS(texture_stages) : 0;
3863         }
3864     }
3865
3866     if(args->boolclip.clip_control[0])
3867     {
3868         if(stateblock->renderState[WINED3DRS_CLIPPING])
3869         {
3870             args->boolclip.clip_control[1] = stateblock->renderState[WINED3DRS_CLIPPLANEENABLE];
3871         }
3872         else
3873         {
3874             args->boolclip.clip_control[1] = 0;
3875         }
3876     }
3877
3878     /* TODO: Figure out if it would be better to store bool constants as bitmasks in the stateblock */
3879     for(i = 0; i < MAX_CONST_B; i++)
3880     {
3881         if(stateblock->vertexShaderConstantB[i]) args->boolclip.bools |= ( 1 << i);
3882     }
3883
3884     args->vertex_samplers[0] = dev->texUnitMap[MAX_FRAGMENT_SAMPLERS + 0];
3885     args->vertex_samplers[1] = dev->texUnitMap[MAX_FRAGMENT_SAMPLERS + 1];
3886     args->vertex_samplers[2] = dev->texUnitMap[MAX_FRAGMENT_SAMPLERS + 2];
3887     args->vertex_samplers[3] = 0;
3888
3889     /* Skip if unused or local */
3890     int_skip = ~shader->baseShader.reg_maps.integer_constants | shader->baseShader.reg_maps.local_int_consts;
3891     if(int_skip == 0xffff || GL_SUPPORT(NV_VERTEX_PROGRAM2_OPTION)) /* This is about flow control, not clipping */
3892     {
3893         memset(&args->loop_ctrl, 0, sizeof(args->loop_ctrl));
3894         return;
3895     }
3896
3897     for(i = 0; i < MAX_CONST_I; i++)
3898     {
3899         if(int_skip & (1 << i))
3900         {
3901             args->loop_ctrl[i][0] = 0;
3902             args->loop_ctrl[i][1] = 0;
3903             args->loop_ctrl[i][2] = 0;
3904         }
3905         else
3906         {
3907             args->loop_ctrl[i][0] = stateblock->vertexShaderConstantI[i * 4];
3908             args->loop_ctrl[i][1] = stateblock->vertexShaderConstantI[i * 4 + 1];
3909             args->loop_ctrl[i][2] = stateblock->vertexShaderConstantI[i * 4 + 2];
3910         }
3911     }
3912 }
3913
3914 /* GL locking is done by the caller */
3915 static void shader_arb_select(IWineD3DDevice *iface, BOOL usePS, BOOL useVS) {
3916     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
3917     struct shader_arb_priv *priv = This->shader_priv;
3918     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
3919     int i;
3920
3921     /* Deal with pixel shaders first so the vertex shader arg function has the input signature ready */
3922     if (usePS) {
3923         struct arb_ps_compile_args compile_args;
3924         struct arb_ps_compiled_shader *compiled;
3925         IWineD3DPixelShaderImpl *ps = (IWineD3DPixelShaderImpl *) This->stateBlock->pixelShader;
3926
3927         TRACE("Using pixel shader %p\n", This->stateBlock->pixelShader);
3928         find_arb_ps_compile_args(ps, This->stateBlock, &compile_args);
3929         compiled = find_arb_pshader(ps, &compile_args);
3930         priv->current_fprogram_id = compiled->prgId;
3931         priv->compiled_fprog = compiled;
3932
3933         /* Bind the fragment program */
3934         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
3935         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id);");
3936
3937         if(!priv->use_arbfp_fixed_func) {
3938             /* Enable OpenGL fragment programs */
3939             glEnable(GL_FRAGMENT_PROGRAM_ARB);
3940             checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
3941         }
3942         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
3943
3944         /* Pixel Shader 1.x constants are clamped to [-1;1], Pixel Shader 2.0 constants are not. If switching between
3945          * a 1.x and newer shader, reload the first 8 constants
3946          */
3947         if(priv->last_ps_const_clamped != ((struct arb_pshader_private *) ps->backend_priv)->clamp_consts)
3948         {
3949             priv->last_ps_const_clamped = ((struct arb_pshader_private *) ps->backend_priv)->clamp_consts;
3950             This->highest_dirty_ps_const = max(This->highest_dirty_ps_const, 8);
3951             for(i = 0; i < 8; i++)
3952             {
3953                 This->activeContext->pshader_const_dirty[i] = 1;
3954             }
3955             /* Also takes care of loading local constants */
3956             shader_arb_load_constants(iface, TRUE, FALSE);
3957         }
3958         else
3959         {
3960             shader_arb_ps_local_constants(This);
3961         }
3962     } else if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM) && !priv->use_arbfp_fixed_func) {
3963         /* Disable only if we're not using arbfp fixed function fragment processing. If this is used,
3964         * keep GL_FRAGMENT_PROGRAM_ARB enabled, and the fixed function pipeline will bind the fixed function
3965         * replacement shader
3966         */
3967         glDisable(GL_FRAGMENT_PROGRAM_ARB);
3968         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
3969         priv->current_fprogram_id = 0;
3970     }
3971
3972     if (useVS) {
3973         struct arb_vs_compile_args compile_args;
3974         struct arb_vs_compiled_shader *compiled;
3975
3976         TRACE("Using vertex shader %p\n", This->stateBlock->vertexShader);
3977         find_arb_vs_compile_args((IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader, This->stateBlock, &compile_args);
3978         compiled = find_arb_vshader((IWineD3DVertexShaderImpl *) This->stateBlock->vertexShader, &compile_args);
3979         priv->current_vprogram_id = compiled->prgId;
3980         priv->compiled_vprog = compiled;
3981
3982         /* Bind the vertex program */
3983         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
3984         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id);");
3985
3986         /* Enable OpenGL vertex programs */
3987         glEnable(GL_VERTEX_PROGRAM_ARB);
3988         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
3989         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
3990         shader_arb_vs_local_constants(This);
3991     } else if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
3992         priv->current_vprogram_id = 0;
3993         glDisable(GL_VERTEX_PROGRAM_ARB);
3994         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
3995     }
3996 }
3997
3998 /* GL locking is done by the caller */
3999 static void shader_arb_select_depth_blt(IWineD3DDevice *iface, enum tex_types tex_type) {
4000     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4001     struct shader_arb_priv *priv = This->shader_priv;
4002     GLuint *blt_fprogram = &priv->depth_blt_fprogram_id[tex_type];
4003     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
4004
4005     if (!priv->depth_blt_vprogram_id) priv->depth_blt_vprogram_id = create_arb_blt_vertex_program(gl_info);
4006     GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->depth_blt_vprogram_id));
4007     glEnable(GL_VERTEX_PROGRAM_ARB);
4008
4009     if (!*blt_fprogram) *blt_fprogram = create_arb_blt_fragment_program(gl_info, tex_type);
4010     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, *blt_fprogram));
4011     glEnable(GL_FRAGMENT_PROGRAM_ARB);
4012 }
4013
4014 /* GL locking is done by the caller */
4015 static void shader_arb_deselect_depth_blt(IWineD3DDevice *iface) {
4016     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4017     struct shader_arb_priv *priv = This->shader_priv;
4018     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
4019
4020     if (priv->current_vprogram_id) {
4021         GL_EXTCALL(glBindProgramARB(GL_VERTEX_PROGRAM_ARB, priv->current_vprogram_id));
4022         checkGLcall("glBindProgramARB(GL_VERTEX_PROGRAM_ARB, vertexShader->prgId);");
4023
4024         glEnable(GL_VERTEX_PROGRAM_ARB);
4025         checkGLcall("glEnable(GL_VERTEX_PROGRAM_ARB);");
4026
4027         TRACE("(%p) : Bound vertex program %u and enabled GL_VERTEX_PROGRAM_ARB\n", This, priv->current_vprogram_id);
4028     } else {
4029         glDisable(GL_VERTEX_PROGRAM_ARB);
4030         checkGLcall("glDisable(GL_VERTEX_PROGRAM_ARB)");
4031     }
4032
4033     if (priv->current_fprogram_id) {
4034         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, priv->current_fprogram_id));
4035         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, pixelShader->prgId);");
4036
4037         glEnable(GL_FRAGMENT_PROGRAM_ARB);
4038         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB);");
4039
4040         TRACE("(%p) : Bound fragment program %u and enabled GL_FRAGMENT_PROGRAM_ARB\n", This, priv->current_fprogram_id);
4041     } else {
4042         glDisable(GL_FRAGMENT_PROGRAM_ARB);
4043         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
4044     }
4045 }
4046
4047 static void shader_arb_destroy(IWineD3DBaseShader *iface) {
4048     IWineD3DBaseShaderImpl *baseShader = (IWineD3DBaseShaderImpl *) iface;
4049     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *)baseShader->baseShader.device;
4050     const WineD3D_GL_Info *gl_info = &device->adapter->gl_info;
4051
4052     ActivateContext(device, device->lastActiveRenderTarget, CTXUSAGE_RESOURCELOAD);
4053
4054     if (shader_is_pshader_version(baseShader->baseShader.reg_maps.shader_version.type))
4055     {
4056         IWineD3DPixelShaderImpl *This = (IWineD3DPixelShaderImpl *) iface;
4057         struct arb_pshader_private *shader_data = This->backend_priv;
4058         UINT i;
4059
4060         if(!shader_data) return; /* This can happen if a shader was never compiled */
4061         ENTER_GL();
4062         for(i = 0; i < shader_data->num_gl_shaders; i++) {
4063             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
4064             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
4065         }
4066         LEAVE_GL();
4067         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
4068         HeapFree(GetProcessHeap(), 0, shader_data);
4069         This->backend_priv = NULL;
4070     } else {
4071         IWineD3DVertexShaderImpl *This = (IWineD3DVertexShaderImpl *) iface;
4072         struct arb_vshader_private *shader_data = This->backend_priv;
4073         UINT i;
4074
4075         if(!shader_data) return; /* This can happen if a shader was never compiled */
4076         ENTER_GL();
4077         for(i = 0; i < shader_data->num_gl_shaders; i++) {
4078             GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId));
4079             checkGLcall("GL_EXTCALL(glDeleteProgramsARB(1, &shader_data->gl_shaders[i].prgId))");
4080         }
4081         LEAVE_GL();
4082         HeapFree(GetProcessHeap(), 0, shader_data->gl_shaders);
4083         HeapFree(GetProcessHeap(), 0, shader_data);
4084         This->backend_priv = NULL;
4085     }
4086 }
4087
4088 static int sig_tree_compare(const void *key, const struct wine_rb_entry *entry)
4089 {
4090     struct ps_signature *e = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
4091     return compare_sig(key, e->sig);
4092 }
4093
/* Callback table passed to wine_rb_init() for the pixel shader input
 * signature tree: three wined3d memory helpers followed by the comparison
 * callback defined above.
 */
struct wine_rb_functions sig_tree_functions =
{
    wined3d_rb_alloc,
    wined3d_rb_realloc,
    wined3d_rb_free,
    sig_tree_compare
};
4101
4102 static HRESULT shader_arb_alloc(IWineD3DDevice *iface) {
4103     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4104     struct shader_arb_priv *priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*priv));
4105     if(wine_rb_init(&priv->signature_tree, &sig_tree_functions) == -1)
4106     {
4107         ERR("RB tree init failed\n");
4108         HeapFree(GetProcessHeap(), 0, priv);
4109         return E_OUTOFMEMORY;
4110     }
4111     This->shader_priv = priv;
4112     return WINED3D_OK;
4113 }
4114
4115 static void release_signature(struct wine_rb_entry *entry, void *context)
4116 {
4117     struct ps_signature *sig = WINE_RB_ENTRY_VALUE(entry, struct ps_signature, entry);
4118     int i;
4119     for(i = 0; i < MAX_REG_INPUT; i++)
4120     {
4121         HeapFree(GetProcessHeap(), 0, (char *) sig->sig[i].semantic_name);
4122     }
4123     HeapFree(GetProcessHeap(), 0, sig->sig);
4124     HeapFree(GetProcessHeap(), 0, sig);
4125 }
4126
4127 static void shader_arb_free(IWineD3DDevice *iface) {
4128     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *)iface;
4129     const WineD3D_GL_Info *gl_info = &This->adapter->gl_info;
4130     struct shader_arb_priv *priv = This->shader_priv;
4131     int i;
4132
4133     ENTER_GL();
4134     if(priv->depth_blt_vprogram_id) {
4135         GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_vprogram_id));
4136     }
4137     for (i = 0; i < tex_type_count; ++i) {
4138         if (priv->depth_blt_fprogram_id[i]) {
4139             GL_EXTCALL(glDeleteProgramsARB(1, &priv->depth_blt_fprogram_id[i]));
4140         }
4141     }
4142     LEAVE_GL();
4143
4144     wine_rb_destroy(&priv->signature_tree, release_signature, NULL);
4145     HeapFree(GetProcessHeap(), 0, This->shader_priv);
4146 }
4147
4148 static BOOL shader_arb_dirty_const(IWineD3DDevice *iface) {
4149     return TRUE;
4150 }
4151
4152 static void shader_arb_get_caps(WINED3DDEVTYPE devtype, const WineD3D_GL_Info *gl_info, struct shader_caps *pCaps)
4153 {
4154     /* We don't have an ARB fixed function pipeline yet, so let the none backend set its caps,
4155      * then overwrite the shader specific ones
4156      */
4157     none_shader_backend.shader_get_caps(devtype, gl_info, pCaps);
4158
4159     if(GL_SUPPORT(ARB_VERTEX_PROGRAM)) {
4160         if(GL_SUPPORT(NV_VERTEX_PROGRAM3))
4161         {
4162             pCaps->VertexShaderVersion = WINED3DVS_VERSION(3,0);
4163             TRACE_(d3d_caps)("Hardware vertex shader version 3.0 enabled (NV_VERTEX_PROGRAM3)\n");
4164         }
4165         else if(GL_LIMITS(vshader_constantsF) >= 256)
4166         {
4167             /* Shader Model 2.0 requires at least 256 vertex shader constants */
4168             pCaps->VertexShaderVersion = WINED3DVS_VERSION(2,0);
4169             TRACE_(d3d_caps)("Hardware vertex shader version 2.0 enabled (ARB_PROGRAM)\n");
4170         }
4171         else
4172         {
4173             pCaps->VertexShaderVersion = WINED3DVS_VERSION(1,1);
4174             TRACE_(d3d_caps)("Hardware vertex shader version 1.1 enabled (ARB_PROGRAM)\n");
4175         }
4176         pCaps->MaxVertexShaderConst = GL_LIMITS(vshader_constantsF);
4177     }
4178
4179     if(GL_SUPPORT(ARB_FRAGMENT_PROGRAM)) {
4180         if(GL_SUPPORT(NV_FRAGMENT_PROGRAM2))
4181         {
4182             pCaps->PixelShaderVersion    = WINED3DPS_VERSION(3,0);
4183             TRACE_(d3d_caps)("Hardware pixel shader version 3.0 enabled (NV_FRAGMENT_PROGRAM2)\n");
4184         }
4185         else if(GL_LIMITS(vshader_constantsF) >= 32)
4186         {
4187             /* Shader Model 2.0 requires at least 32 pixel shader constants */
4188             pCaps->PixelShaderVersion    = WINED3DPS_VERSION(2,0);
4189             TRACE_(d3d_caps)("Hardware pixel shader version 2.0 enabled (ARB_PROGRAM)\n");
4190         }
4191         else
4192         {
4193             pCaps->PixelShaderVersion    = WINED3DPS_VERSION(1,4);
4194             TRACE_(d3d_caps)("Hardware pixel shader version 1.4 enabled (ARB_PROGRAM)\n");
4195         }
4196         pCaps->PixelShader1xMaxValue = 8.0;
4197         pCaps->MaxPixelShaderConst = GL_LIMITS(pshader_constantsF);
4198     }
4199
4200     pCaps->VSClipping = use_nv_clip(gl_info);
4201 }
4202
4203 static BOOL shader_arb_color_fixup_supported(struct color_fixup_desc fixup)
4204 {
4205     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
4206     {
4207         TRACE("Checking support for color_fixup:\n");
4208         dump_color_fixup_desc(fixup);
4209     }
4210
4211     /* We support everything except YUV conversions. */
4212     if (!is_yuv_fixup(fixup))
4213     {
4214         TRACE("[OK]\n");
4215         return TRUE;
4216     }
4217
4218     TRACE("[FAILED]\n");
4219     return FALSE;
4220 }
4221
4222 static void shader_arb_add_instruction_modifiers(const struct wined3d_shader_instruction *ins) {
4223     DWORD shift;
4224     char write_mask[20], regstr[50];
4225     SHADER_BUFFER *buffer = ins->ctx->buffer;
4226     BOOL is_color = FALSE;
4227     const struct wined3d_shader_dst_param *dst;
4228
4229     if (!ins->dst_count) return;
4230
4231     dst = &ins->dst[0];
4232     shift = dst->shift;
4233     if(shift == 0) return; /* Saturate alone is handled by the instructions */
4234
4235     shader_arb_get_write_mask(ins, dst, write_mask);
4236     shader_arb_get_register_name(ins, &dst->reg, regstr, &is_color);
4237
4238     /* Generate a line that does the output modifier computation
4239      * FIXME: _SAT vs shift? _SAT alone is already handled in the instructions, if this
4240      * maps problems in e.g. _d4_sat modify shader_arb_get_modifier
4241      */
4242     shader_addline(buffer, "MUL%s %s%s, %s, %s;\n", shader_arb_get_modifier(ins),
4243                    regstr, write_mask, regstr, shift_tab[shift]);
4244 }
4245
/* Dispatch table used by shader_arb_handle_instruction(), indexed by the instruction's
 * handler_idx (a WINED3DSIH_* value). The initializers are positional, so their order has to
 * match the order of the WINED3DSIH_* values named in the per-entry comments.
 *
 * If the lookup yields NULL the opcode is reported with a FIXME and no code is generated for
 * it. WINED3DSIH_IF never gets as far as the lookup: shader_arb_handle_instruction() resolves
 * boolean ifs itself and returns before consulting this table. */
static const SHADER_HANDLER shader_arb_instruction_handler_table[WINED3DSIH_TABLE_SIZE] =
{
    /* WINED3DSIH_ABS           */ shader_hw_map2gl,
    /* WINED3DSIH_ADD           */ shader_hw_map2gl,
    /* WINED3DSIH_BEM           */ pshader_hw_bem,
    /* WINED3DSIH_BREAK         */ shader_hw_break,
    /* WINED3DSIH_BREAKC        */ shader_hw_breakc,
    /* WINED3DSIH_BREAKP        */ NULL,
    /* WINED3DSIH_CALL          */ NULL,
    /* WINED3DSIH_CALLNZ        */ NULL,
    /* WINED3DSIH_CMP           */ pshader_hw_cmp,
    /* WINED3DSIH_CND           */ pshader_hw_cnd,
    /* WINED3DSIH_CRS           */ shader_hw_map2gl,
    /* WINED3DSIH_DCL           */ NULL,
    /* WINED3DSIH_DEF           */ NULL,
    /* WINED3DSIH_DEFB          */ NULL,
    /* WINED3DSIH_DEFI          */ NULL,
    /* WINED3DSIH_DP2ADD        */ pshader_hw_dp2add,
    /* WINED3DSIH_DP3           */ shader_hw_map2gl,
    /* WINED3DSIH_DP4           */ shader_hw_map2gl,
    /* WINED3DSIH_DST           */ shader_hw_map2gl,
    /* WINED3DSIH_DSX           */ shader_hw_map2gl,
    /* WINED3DSIH_DSY           */ shader_hw_dsy,
    /* WINED3DSIH_ELSE          */ shader_hw_else,
    /* WINED3DSIH_ENDIF         */ shader_hw_endif,
    /* WINED3DSIH_ENDLOOP       */ shader_hw_endloop,
    /* WINED3DSIH_ENDREP        */ shader_hw_endrep,
    /* WINED3DSIH_EXP           */ shader_hw_map2gl,
    /* WINED3DSIH_EXPP          */ shader_hw_map2gl,
    /* WINED3DSIH_FRC           */ shader_hw_map2gl,
    /* WINED3DSIH_IF            */ NULL /* Hardcoded into the shader */,
    /* WINED3DSIH_IFC           */ shader_hw_ifc,
    /* WINED3DSIH_LABEL         */ NULL,
    /* WINED3DSIH_LIT           */ shader_hw_map2gl,
    /* WINED3DSIH_LOG           */ shader_hw_map2gl,
    /* WINED3DSIH_LOGP          */ shader_hw_map2gl,
    /* WINED3DSIH_LOOP          */ shader_hw_loop,
    /* WINED3DSIH_LRP           */ shader_hw_lrp,
    /* WINED3DSIH_M3x2          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M3x4          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x3          */ shader_hw_mnxn,
    /* WINED3DSIH_M4x4          */ shader_hw_mnxn,
    /* WINED3DSIH_MAD           */ shader_hw_map2gl,
    /* WINED3DSIH_MAX           */ shader_hw_map2gl,
    /* WINED3DSIH_MIN           */ shader_hw_map2gl,
    /* WINED3DSIH_MOV           */ shader_hw_mov,
    /* WINED3DSIH_MOVA          */ shader_hw_mov,
    /* WINED3DSIH_MUL           */ shader_hw_map2gl,
    /* WINED3DSIH_NOP           */ shader_hw_nop,
    /* WINED3DSIH_NRM           */ shader_hw_nrm,
    /* WINED3DSIH_PHASE         */ NULL,
    /* WINED3DSIH_POW           */ shader_hw_map2gl,
    /* WINED3DSIH_RCP           */ shader_hw_rsq_rcp,
    /* WINED3DSIH_REP           */ shader_hw_rep,
    /* WINED3DSIH_RET           */ NULL,
    /* WINED3DSIH_RSQ           */ shader_hw_rsq_rcp,
    /* WINED3DSIH_SETP          */ NULL,
    /* WINED3DSIH_SGE           */ shader_hw_map2gl,
    /* WINED3DSIH_SGN           */ shader_hw_sgn,
    /* WINED3DSIH_SINCOS        */ shader_hw_sincos,
    /* WINED3DSIH_SLT           */ shader_hw_map2gl,
    /* WINED3DSIH_SUB           */ shader_hw_map2gl,
    /* WINED3DSIH_TEX           */ pshader_hw_tex,
    /* WINED3DSIH_TEXBEM        */ pshader_hw_texbem,
    /* WINED3DSIH_TEXBEML       */ pshader_hw_texbem,
    /* WINED3DSIH_TEXCOORD      */ pshader_hw_texcoord,
    /* WINED3DSIH_TEXDEPTH      */ pshader_hw_texdepth,
    /* WINED3DSIH_TEXDP3        */ pshader_hw_texdp3,
    /* WINED3DSIH_TEXDP3TEX     */ pshader_hw_texdp3tex,
    /* WINED3DSIH_TEXKILL       */ pshader_hw_texkill,
    /* WINED3DSIH_TEXLDD        */ shader_hw_texldd,
    /* WINED3DSIH_TEXLDL        */ shader_hw_texldl,
    /* WINED3DSIH_TEXM3x2DEPTH  */ pshader_hw_texm3x2depth,
    /* WINED3DSIH_TEXM3x2PAD    */ pshader_hw_texm3x2pad,
    /* WINED3DSIH_TEXM3x2TEX    */ pshader_hw_texm3x2tex,
    /* WINED3DSIH_TEXM3x3       */ pshader_hw_texm3x3,
    /* WINED3DSIH_TEXM3x3DIFF   */ NULL,
    /* WINED3DSIH_TEXM3x3PAD    */ pshader_hw_texm3x3pad,
    /* WINED3DSIH_TEXM3x3SPEC   */ pshader_hw_texm3x3spec,
    /* WINED3DSIH_TEXM3x3TEX    */ pshader_hw_texm3x3tex,
    /* WINED3DSIH_TEXM3x3VSPEC  */ pshader_hw_texm3x3vspec,
    /* WINED3DSIH_TEXREG2AR     */ pshader_hw_texreg2ar,
    /* WINED3DSIH_TEXREG2GB     */ pshader_hw_texreg2gb,
    /* WINED3DSIH_TEXREG2RGB    */ pshader_hw_texreg2rgb,
};
4332
4333 static inline BOOL get_bool_const(const struct wined3d_shader_instruction *ins, IWineD3DBaseShaderImpl *This, DWORD idx)
4334 {
4335     BOOL vshader = shader_is_vshader_version(This->baseShader.reg_maps.shader_version.type);
4336     WORD bools = 0;
4337     WORD flag = (1 << idx);
4338     const local_constant *constant;
4339     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4340
4341     if(This->baseShader.reg_maps.local_bool_consts & flag)
4342     {
4343         /* What good is a if(bool) with a hardcoded local constant? I don't know, but handle it */
4344         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsB, local_constant, entry)
4345         {
4346             if (constant->idx == idx)
4347             {
4348                 return constant->value[0];
4349             }
4350         }
4351         ERR("Local constant not found\n");
4352         return FALSE;
4353     }
4354     else
4355     {
4356         if(vshader) bools = priv->cur_vs_args->boolclip.bools;
4357         else bools = priv->cur_ps_args->bools;
4358         return bools & flag;
4359     }
4360 }
4361
4362 static void get_loop_control_const(const struct wined3d_shader_instruction *ins,
4363         IWineD3DBaseShaderImpl *This, UINT idx, struct loop_control *loop_control)
4364 {
4365     BOOL vshader = shader_is_vshader_version(This->baseShader.reg_maps.shader_version.type);
4366     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4367
4368     /* Integer constants can either be a local constant, or they can be stored in the shader
4369      * type specific compile args. */
4370     if (This->baseShader.reg_maps.local_int_consts & (1 << idx))
4371     {
4372         const local_constant *constant;
4373
4374         LIST_FOR_EACH_ENTRY(constant, &This->baseShader.constantsI, local_constant, entry)
4375         {
4376             if (constant->idx == idx)
4377             {
4378                 loop_control->count = constant->value[0];
4379                 loop_control->start = constant->value[1];
4380                 /* Step is signed. */
4381                 loop_control->step = (int)constant->value[2];
4382                 return;
4383             }
4384         }
4385         /* If this happens the flag was set incorrectly */
4386         ERR("Local constant not found\n");
4387         loop_control->count = 0;
4388         loop_control->start = 0;
4389         loop_control->step = 0;
4390         return;
4391     }
4392     else
4393     {
4394         if(vshader)
4395         {
4396             /* Count and aL start value are unsigned */
4397             loop_control->count = priv->cur_vs_args->loop_ctrl[idx][0];
4398             loop_control->start = priv->cur_vs_args->loop_ctrl[idx][1];
4399             /* Step is signed. */
4400             loop_control->step = ((char)priv->cur_vs_args->loop_ctrl[idx][2]);
4401         }
4402         else
4403         {
4404             loop_control->count = priv->cur_ps_args->loop_ctrl[idx][0];
4405             loop_control->start = priv->cur_ps_args->loop_ctrl[idx][1];
4406             loop_control->step = ((char)priv->cur_ps_args->loop_ctrl[idx][2]);
4407         }
4408         return;
4409     }
4410 }
4411
4412 static void record_instruction(struct list *list, const struct wined3d_shader_instruction *ins)
4413 {
4414     unsigned int i;
4415     struct wined3d_shader_dst_param *dst_param = NULL;
4416     struct wined3d_shader_src_param *src_param = NULL, *rel_addr = NULL;
4417     struct recorded_instruction *rec = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*rec));
4418     if(!rec)
4419     {
4420         ERR("Out of memory\n");
4421         return;
4422     }
4423
4424     rec->ins = *ins;
4425     dst_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param));
4426     if(!dst_param) goto free;
4427     *dst_param = *ins->dst;
4428     if(ins->dst->reg.rel_addr)
4429     {
4430         rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*dst_param->reg.rel_addr));
4431         if(!rel_addr) goto free;
4432         *rel_addr = *ins->dst->reg.rel_addr;
4433         dst_param->reg.rel_addr = rel_addr;
4434     }
4435     rec->ins.dst = dst_param;
4436
4437     src_param = HeapAlloc(GetProcessHeap(), 0, sizeof(*src_param) * ins->src_count);
4438     if(!src_param) goto free;
4439     for(i = 0; i < ins->src_count; i++)
4440     {
4441         src_param[i] = ins->src[i];
4442         if(ins->src[i].reg.rel_addr)
4443         {
4444             rel_addr = HeapAlloc(GetProcessHeap(), 0, sizeof(*rel_addr));
4445             if(!rel_addr) goto free;
4446             *rel_addr = *ins->src[i].reg.rel_addr;
4447             src_param[i].reg.rel_addr = rel_addr;
4448         }
4449     }
4450     rec->ins.src = src_param;
4451     list_add_tail(list, &rec->entry);
4452     return;
4453
4454 free:
4455     ERR("Out of memory\n");
4456     if(dst_param)
4457     {
4458         HeapFree(GetProcessHeap(), 0, (void *) dst_param->reg.rel_addr);
4459         HeapFree(GetProcessHeap(), 0, dst_param);
4460     }
4461     if(src_param)
4462     {
4463         for(i = 0; i < ins->src_count; i++)
4464         {
4465             HeapFree(GetProcessHeap(), 0, (void *) src_param[i].reg.rel_addr);
4466         }
4467         HeapFree(GetProcessHeap(), 0, src_param);
4468     }
4469     HeapFree(GetProcessHeap(), 0, rec);
4470 }
4471
4472 static void free_recorded_instruction(struct list *list)
4473 {
4474     struct recorded_instruction *rec_ins, *entry2;
4475     unsigned int i;
4476
4477     LIST_FOR_EACH_ENTRY_SAFE(rec_ins, entry2, list, struct recorded_instruction, entry)
4478     {
4479         list_remove(&rec_ins->entry);
4480         if(rec_ins->ins.dst)
4481         {
4482             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.dst->reg.rel_addr);
4483             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.dst);
4484         }
4485         if(rec_ins->ins.src)
4486         {
4487             for(i = 0; i < rec_ins->ins.src_count; i++)
4488             {
4489                 HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.src[i].reg.rel_addr);
4490             }
4491             HeapFree(GetProcessHeap(), 0, (void *) rec_ins->ins.src);
4492         }
4493         HeapFree(GetProcessHeap(), 0, rec_ins);
4494     }
4495 }
4496
4497 static void shader_arb_handle_instruction(const struct wined3d_shader_instruction *ins) {
4498     SHADER_HANDLER hw_fct;
4499     struct shader_arb_ctx_priv *priv = ins->ctx->backend_data;
4500     IWineD3DBaseShaderImpl *This = (IWineD3DBaseShaderImpl *)ins->ctx->shader;
4501     struct control_frame *control_frame;
4502     SHADER_BUFFER *buffer = ins->ctx->buffer;
4503
4504     if(ins->handler_idx == WINED3DSIH_LOOP || ins->handler_idx == WINED3DSIH_REP)
4505     {
4506         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4507         list_add_head(&priv->control_frames, &control_frame->entry);
4508
4509         if(ins->handler_idx == WINED3DSIH_LOOP) control_frame->type = LOOP;
4510         if(ins->handler_idx == WINED3DSIH_REP) control_frame->type = REP;
4511
4512         if(priv->target_version >= NV2)
4513         {
4514             control_frame->loop_no = priv->num_loops++;
4515             priv->loop_depth++;
4516         }
4517         else
4518         {
4519             /* Don't bother recording when we're in a not used if branch */
4520             if(priv->muted)
4521             {
4522                 return;
4523             }
4524
4525             if(!priv->recording)
4526             {
4527                 list_init(&priv->record);
4528                 priv->recording = TRUE;
4529                 control_frame->outer_loop = TRUE;
4530                 get_loop_control_const(ins, This, ins->src[0].reg.idx, &control_frame->loop_control);
4531                 return; /* Instruction is handled */
4532             }
4533             /* Record this loop in the outer loop's recording */
4534         }
4535     }
4536     else if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
4537     {
4538         if(priv->target_version >= NV2)
4539         {
4540             /* Nothing to do. The control frame is popped after the HW instr handler */
4541         }
4542         else
4543         {
4544             struct list *e = list_head(&priv->control_frames);
4545             control_frame = LIST_ENTRY(e, struct control_frame, entry);
4546             list_remove(&control_frame->entry);
4547
4548             if(control_frame->outer_loop)
4549             {
4550                 int iteration, aL = 0;
4551                 struct list copy;
4552
4553                 /* Turn off recording before playback */
4554                 priv->recording = FALSE;
4555
4556                 /* Move the recorded instructions to a separate list and get them out of the private data
4557                  * structure. If there are nested loops, the shader_arb_handle_instruction below will
4558                  * be recorded again, thus priv->record might be overwritten
4559                  */
4560                 list_init(&copy);
4561                 list_move_tail(&copy, &priv->record);
4562                 list_init(&priv->record);
4563
4564                 if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4565                 {
4566                     shader_addline(buffer, "#unrolling loop: %u iterations, aL=%u, inc %d\n",
4567                                    control_frame->loop_control.count, control_frame->loop_control.start,
4568                                    control_frame->loop_control.step);
4569                     aL = control_frame->loop_control.start;
4570                 }
4571                 else
4572                 {
4573                     shader_addline(buffer, "#unrolling rep: %u iterations\n", control_frame->loop_control.count);
4574                 }
4575
4576                 for (iteration = 0; iteration < control_frame->loop_control.count; ++iteration)
4577                 {
4578                     struct recorded_instruction *rec_ins;
4579                     if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4580                     {
4581                         priv->aL = aL;
4582                         shader_addline(buffer, "#Iteration %d, aL=%d\n", iteration, aL);
4583                     }
4584                     else
4585                     {
4586                         shader_addline(buffer, "#Iteration %d\n", iteration);
4587                     }
4588
4589                     LIST_FOR_EACH_ENTRY(rec_ins, &copy, struct recorded_instruction, entry)
4590                     {
4591                         shader_arb_handle_instruction(&rec_ins->ins);
4592                     }
4593
4594                     if(ins->handler_idx == WINED3DSIH_ENDLOOP)
4595                     {
4596                         aL += control_frame->loop_control.step;
4597                     }
4598                 }
4599                 shader_addline(buffer, "#end loop/rep\n");
4600
4601                 free_recorded_instruction(&copy);
4602                 HeapFree(GetProcessHeap(), 0, control_frame);
4603                 return; /* Instruction is handled */
4604             }
4605             else
4606             {
4607                 /* This is a nested loop. Proceed to the normal recording function */
4608                 HeapFree(GetProcessHeap(), 0, control_frame);
4609             }
4610         }
4611     }
4612
4613     if(priv->recording)
4614     {
4615         record_instruction(&priv->record, ins);
4616         return;
4617     }
4618
4619     /* boolean if */
4620     if(ins->handler_idx == WINED3DSIH_IF)
4621     {
4622         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4623         list_add_head(&priv->control_frames, &control_frame->entry);
4624         control_frame->type = IF;
4625
4626         if(!priv->muted && get_bool_const(ins, This, ins->src[0].reg.idx) == FALSE)
4627         {
4628             shader_addline(buffer, "#if(FALSE){\n");
4629             priv->muted = TRUE;
4630             control_frame->muting = TRUE;
4631         }
4632         else shader_addline(buffer, "#if(TRUE) {\n");
4633
4634         return; /* Instruction is handled */
4635     }
4636     else if(ins->handler_idx == WINED3DSIH_IFC)
4637     {
4638         /* IF(bool) and if_cond(a, b) use the same ELSE and ENDIF tokens */
4639         control_frame = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(*control_frame));
4640         control_frame->type = IFC;
4641         control_frame->ifc_no = priv->num_ifcs++;
4642         list_add_head(&priv->control_frames, &control_frame->entry);
4643     }
4644     else if(ins->handler_idx == WINED3DSIH_ELSE)
4645     {
4646         struct list *e = list_head(&priv->control_frames);
4647         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4648
4649         if(control_frame->type == IF)
4650         {
4651             shader_addline(buffer, "#} else {\n");
4652             if(!priv->muted && !control_frame->muting)
4653             {
4654                 priv->muted = TRUE;
4655                 control_frame->muting = TRUE;
4656             }
4657             else if(control_frame->muting) priv->muted = FALSE;
4658             return; /* Instruction is handled. */
4659         }
4660         /* In case of an ifc, generate a HW shader instruction */
4661     }
4662     else if(ins->handler_idx == WINED3DSIH_ENDIF)
4663     {
4664         struct list *e = list_head(&priv->control_frames);
4665         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4666
4667         if(control_frame->type == IF)
4668         {
4669             shader_addline(buffer, "#} endif\n");
4670             if(control_frame->muting) priv->muted = FALSE;
4671             list_remove(&control_frame->entry);
4672             HeapFree(GetProcessHeap(), 0, control_frame);
4673             return; /* Instruction is handled */
4674         }
4675     }
4676
4677     if(priv->muted) return;
4678
4679     /* Select handler */
4680     hw_fct = shader_arb_instruction_handler_table[ins->handler_idx];
4681
4682     /* Unhandled opcode */
4683     if (!hw_fct)
4684     {
4685         FIXME("Backend can't handle opcode %#x\n", ins->handler_idx);
4686         return;
4687     }
4688     hw_fct(ins);
4689
4690     if(ins->handler_idx == WINED3DSIH_ENDLOOP || ins->handler_idx == WINED3DSIH_ENDREP)
4691     {
4692         struct list *e = list_head(&priv->control_frames);
4693         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4694         list_remove(&control_frame->entry);
4695         HeapFree(GetProcessHeap(), 0, control_frame);
4696         priv->loop_depth--;
4697     }
4698     else if(ins->handler_idx == WINED3DSIH_ENDIF)
4699     {
4700         /* Non-ifc ENDIFs don't reach that place because of the return in the if block above */
4701         struct list *e = list_head(&priv->control_frames);
4702         control_frame = LIST_ENTRY(e, struct control_frame, entry);
4703         list_remove(&control_frame->entry);
4704         HeapFree(GetProcessHeap(), 0, control_frame);
4705     }
4706
4707
4708     shader_arb_add_instruction_modifiers(ins);
4709 }
4710
/* Shader backend function table of the ARB program implementation. The initializers are
 * positional, so they have to stay in the member order of shader_backend_t.
 *
 * The address of this object doubles as an identity check: the fragment pipeline code further
 * down compares device->shader_backend against it to find out whether the ARB shader backend
 * is the one in use. */
const shader_backend_t arb_program_shader_backend = {
    shader_arb_handle_instruction,
    shader_arb_select,
    shader_arb_select_depth_blt,
    shader_arb_deselect_depth_blt,
    shader_arb_update_float_vertex_constants,
    shader_arb_update_float_pixel_constants,
    shader_arb_load_constants,
    shader_arb_load_np2fixup_constants,
    shader_arb_destroy,
    shader_arb_alloc,
    shader_arb_free,
    shader_arb_dirty_const,
    shader_arb_get_caps,
    shader_arb_color_fixup_supported,
};
4727
/* ARB_fragment_program fixed function pipeline replacement definitions.
 *
 * Indices of the program environment parameters used by the fixed function replacement:
 *    0       texture factor
 *    1       specular enable
 *    2 -  9  ARB_FFP_CONST_CONSTANT(0..7)
 *   10 - 17  ARB_FFP_CONST_BUMPMAT(0..7)
 *   18 - 25  ARB_FFP_CONST_LUMINANCE(0..7)
 *
 * The macro argument is parenthesized so that any expression can be passed as index. */
#define ARB_FFP_CONST_TFACTOR           0
#define ARB_FFP_CONST_SPECULAR_ENABLE   ((ARB_FFP_CONST_TFACTOR) + 1)
#define ARB_FFP_CONST_CONSTANT(i)       ((ARB_FFP_CONST_SPECULAR_ENABLE) + 1 + (i))
#define ARB_FFP_CONST_BUMPMAT(i)        ((ARB_FFP_CONST_CONSTANT(7)) + 1 + (i))
#define ARB_FFP_CONST_LUMINANCE(i)      ((ARB_FFP_CONST_BUMPMAT(7)) + 1 + (i))
4734
/* One fixed function replacement program, as stored in the fragment_shaders tree of the
 * ARB private data. */
struct arbfp_ffp_desc
{
    /* Generic fragment pipeline descriptor. Its rb-tree entry (parent.entry) is what links
     * this object into the tree, see arbfp_free_ffpshader(). */
    struct ffp_frag_desc parent;
    /* GL program object name; released with glDeleteProgramsARB(). */
    GLuint shader;
    /* NOTE(review): presumably the number of texture stages the program reads - it is not
     * used in the code visible here, confirm against the program generator. */
    unsigned int num_textures_used;
};
4741
4742 static void arbfp_enable(IWineD3DDevice *iface, BOOL enable) {
4743     ENTER_GL();
4744     if(enable) {
4745         glEnable(GL_FRAGMENT_PROGRAM_ARB);
4746         checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
4747     } else {
4748         glDisable(GL_FRAGMENT_PROGRAM_ARB);
4749         checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
4750     }
4751     LEAVE_GL();
4752 }
4753
4754 static HRESULT arbfp_alloc(IWineD3DDevice *iface) {
4755     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
4756     struct shader_arb_priv *priv;
4757     /* Share private data between the shader backend and the pipeline replacement, if both
4758      * are the arb implementation. This is needed to figure out whether ARBfp should be disabled
4759      * if no pixel shader is bound or not
4760      */
4761     if(This->shader_backend == &arb_program_shader_backend) {
4762         This->fragment_priv = This->shader_priv;
4763     } else {
4764         This->fragment_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct shader_arb_priv));
4765         if(!This->fragment_priv) return E_OUTOFMEMORY;
4766     }
4767     priv = This->fragment_priv;
4768     if (wine_rb_init(&priv->fragment_shaders, &wined3d_ffp_frag_program_rb_functions) == -1)
4769     {
4770         ERR("Failed to initialize rbtree.\n");
4771         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
4772         return E_OUTOFMEMORY;
4773     }
4774     priv->use_arbfp_fixed_func = TRUE;
4775     return WINED3D_OK;
4776 }
4777
4778 static void arbfp_free_ffpshader(struct wine_rb_entry *entry, void *context)
4779 {
4780     const WineD3D_GL_Info *gl_info = context;
4781     struct arbfp_ffp_desc *entry_arb = WINE_RB_ENTRY_VALUE(entry, struct arbfp_ffp_desc, parent.entry);
4782
4783     ENTER_GL();
4784     GL_EXTCALL(glDeleteProgramsARB(1, &entry_arb->shader));
4785     checkGLcall("glDeleteProgramsARB(1, &entry_arb->shader)");
4786     HeapFree(GetProcessHeap(), 0, entry_arb);
4787     LEAVE_GL();
4788 }
4789
4790 static void arbfp_free(IWineD3DDevice *iface) {
4791     IWineD3DDeviceImpl *This = (IWineD3DDeviceImpl *) iface;
4792     struct shader_arb_priv *priv = This->fragment_priv;
4793
4794     wine_rb_destroy(&priv->fragment_shaders, arbfp_free_ffpshader, &This->adapter->gl_info);
4795     priv->use_arbfp_fixed_func = FALSE;
4796
4797     if(This->shader_backend != &arb_program_shader_backend) {
4798         HeapFree(GetProcessHeap(), 0, This->fragment_priv);
4799     }
4800 }
4801
4802 static void arbfp_get_caps(WINED3DDEVTYPE devtype, const WineD3D_GL_Info *gl_info, struct fragment_caps *caps)
4803 {
4804     caps->TextureOpCaps =  WINED3DTEXOPCAPS_DISABLE                     |
4805                            WINED3DTEXOPCAPS_SELECTARG1                  |
4806                            WINED3DTEXOPCAPS_SELECTARG2                  |
4807                            WINED3DTEXOPCAPS_MODULATE4X                  |
4808                            WINED3DTEXOPCAPS_MODULATE2X                  |
4809                            WINED3DTEXOPCAPS_MODULATE                    |
4810                            WINED3DTEXOPCAPS_ADDSIGNED2X                 |
4811                            WINED3DTEXOPCAPS_ADDSIGNED                   |
4812                            WINED3DTEXOPCAPS_ADD                         |
4813                            WINED3DTEXOPCAPS_SUBTRACT                    |
4814                            WINED3DTEXOPCAPS_ADDSMOOTH                   |
4815                            WINED3DTEXOPCAPS_BLENDCURRENTALPHA           |
4816                            WINED3DTEXOPCAPS_BLENDFACTORALPHA            |
4817                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHA           |
4818                            WINED3DTEXOPCAPS_BLENDDIFFUSEALPHA           |
4819                            WINED3DTEXOPCAPS_BLENDTEXTUREALPHAPM         |
4820                            WINED3DTEXOPCAPS_MODULATEALPHA_ADDCOLOR      |
4821                            WINED3DTEXOPCAPS_MODULATECOLOR_ADDALPHA      |
4822                            WINED3DTEXOPCAPS_MODULATEINVCOLOR_ADDALPHA   |
4823                            WINED3DTEXOPCAPS_MODULATEINVALPHA_ADDCOLOR   |
4824                            WINED3DTEXOPCAPS_DOTPRODUCT3                 |
4825                            WINED3DTEXOPCAPS_MULTIPLYADD                 |
4826                            WINED3DTEXOPCAPS_LERP                        |
4827                            WINED3DTEXOPCAPS_BUMPENVMAP                  |
4828                            WINED3DTEXOPCAPS_BUMPENVMAPLUMINANCE;
4829
4830     /* TODO: Implement WINED3DTEXOPCAPS_PREMODULATE */
4831
4832     caps->MaxTextureBlendStages   = 8;
4833     caps->MaxSimultaneousTextures = min(GL_LIMITS(fragment_samplers), 8);
4834
4835     caps->PrimitiveMiscCaps |= WINED3DPMISCCAPS_TSSARGTEMP;
4836 }
4837 #undef GLINFO_LOCATION
4838
4839 #define GLINFO_LOCATION stateblock->wineD3DDevice->adapter->gl_info
4840 static void state_texfactor_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4841     float col[4];
4842     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4843
4844     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
4845      * application provided constants
4846      */
4847     if(device->shader_backend == &arb_program_shader_backend) {
4848         if (use_ps(stateblock)) return;
4849
4850         device = stateblock->wineD3DDevice;
4851         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_TFACTOR] = 1;
4852         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_TFACTOR + 1);
4853     }
4854
4855     D3DCOLORTOGLFLOAT4(stateblock->renderState[WINED3DRS_TEXTUREFACTOR], col);
4856     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col));
4857     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_TFACTOR, col)");
4858
4859 }
4860
4861 static void state_arb_specularenable(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4862     float col[4];
4863     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4864
4865     /* Don't load the parameter if we're using an arbfp pixel shader, otherwise we'll overwrite
4866      * application provided constants
4867      */
4868     if(device->shader_backend == &arb_program_shader_backend) {
4869         if (use_ps(stateblock)) return;
4870
4871         device = stateblock->wineD3DDevice;
4872         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_SPECULAR_ENABLE] = 1;
4873         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_SPECULAR_ENABLE + 1);
4874     }
4875
4876     if(stateblock->renderState[WINED3DRS_SPECULARENABLE]) {
4877         /* The specular color has no alpha */
4878         col[0] = 1.0; col[1] = 1.0;
4879         col[2] = 1.0; col[3] = 0.0;
4880     } else {
4881         col[0] = 0.0; col[1] = 0.0;
4882         col[2] = 0.0; col[3] = 0.0;
4883     }
4884     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col));
4885     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_SPECULAR_ENABLE, col)");
4886 }
4887
4888 static void set_bumpmat_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4889     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
4890     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4891     float mat[2][2];
4892
4893     if (use_ps(stateblock))
4894     {
4895         if(stage != 0 &&
4896            ((IWineD3DPixelShaderImpl *) stateblock->pixelShader)->baseShader.reg_maps.bumpmat[stage]) {
4897             /* The pixel shader has to know the bump env matrix. Do a constants update if it isn't scheduled
4898              * anyway
4899              */
4900             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
4901                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
4902             }
4903         }
4904
4905         if(device->shader_backend == &arb_program_shader_backend) {
4906             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
4907             return;
4908         }
4909     } else if(device->shader_backend == &arb_program_shader_backend) {
4910         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_BUMPMAT(stage)] = 1;
4911         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_BUMPMAT(stage) + 1);
4912     }
4913
4914     mat[0][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT00]);
4915     mat[0][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT01]);
4916     mat[1][0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT10]);
4917     mat[1][1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVMAT11]);
4918
4919     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0]));
4920     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_BUMPMAT(stage), &mat[0][0])");
4921 }
4922
4923 static void tex_bumpenvlum_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
4924     DWORD stage = (state - STATE_TEXTURESTAGE(0, 0)) / (WINED3D_HIGHEST_TEXTURE_STATE + 1);
4925     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
4926     float param[4];
4927
4928     if (use_ps(stateblock))
4929     {
4930         if(stage != 0 &&
4931            ((IWineD3DPixelShaderImpl *) stateblock->pixelShader)->baseShader.reg_maps.luminanceparams[stage]) {
4932             /* The pixel shader has to know the luminance offset. Do a constants update if it
4933              * isn't scheduled anyway
4934              */
4935             if(!isStateDirty(context, STATE_PIXELSHADERCONSTANT)) {
4936                 device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
4937             }
4938         }
4939
4940         if(device->shader_backend == &arb_program_shader_backend) {
4941             /* Exit now, don't set the bumpmat below, otherwise we may overwrite pixel shader constants */
4942             return;
4943         }
4944     } else if(device->shader_backend == &arb_program_shader_backend) {
4945         device->activeContext->pshader_const_dirty[ARB_FFP_CONST_LUMINANCE(stage)] = 1;
4946         device->highest_dirty_ps_const = max(device->highest_dirty_ps_const, ARB_FFP_CONST_LUMINANCE(stage) + 1);
4947     }
4948
4949     param[0] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLSCALE]);
4950     param[1] = *((float *) &stateblock->textureState[stage][WINED3DTSS_BUMPENVLOFFSET]);
4951     param[2] = 0.0;
4952     param[3] = 0.0;
4953
4954     GL_EXTCALL(glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param));
4955     checkGLcall("glProgramEnvParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, ARB_FFP_CONST_LUMINANCE(stage), param)");
4956 }
4957
4958 static const char *get_argreg(SHADER_BUFFER *buffer, DWORD argnum, unsigned int stage, DWORD arg) {
4959     const char *ret;
4960
4961     if(arg == ARG_UNUSED) return "unused"; /* This is the marker for unused registers */
4962
4963     switch(arg & WINED3DTA_SELECTMASK) {
4964         case WINED3DTA_DIFFUSE:
4965             ret = "fragment.color.primary"; break;
4966
4967         case WINED3DTA_CURRENT:
4968             if(stage == 0) ret = "fragment.color.primary";
4969             else ret = "ret";
4970             break;
4971
4972         case WINED3DTA_TEXTURE:
4973             switch(stage) {
4974                 case 0: ret = "tex0"; break;
4975                 case 1: ret = "tex1"; break;
4976                 case 2: ret = "tex2"; break;
4977                 case 3: ret = "tex3"; break;
4978                 case 4: ret = "tex4"; break;
4979                 case 5: ret = "tex5"; break;
4980                 case 6: ret = "tex6"; break;
4981                 case 7: ret = "tex7"; break;
4982                 default: ret = "unknown texture";
4983             }
4984             break;
4985
4986         case WINED3DTA_TFACTOR:
4987             ret = "tfactor"; break;
4988
4989         case WINED3DTA_SPECULAR:
4990             ret = "fragment.color.secondary"; break;
4991
4992         case WINED3DTA_TEMP:
4993             ret = "tempreg"; break;
4994
4995         case WINED3DTA_CONSTANT:
4996             FIXME("Implement perstage constants\n");
4997             switch(stage) {
4998                 case 0: ret = "const0"; break;
4999                 case 1: ret = "const1"; break;
5000                 case 2: ret = "const2"; break;
5001                 case 3: ret = "const3"; break;
5002                 case 4: ret = "const4"; break;
5003                 case 5: ret = "const5"; break;
5004                 case 6: ret = "const6"; break;
5005                 case 7: ret = "const7"; break;
5006                 default: ret = "unknown constant";
5007             }
5008             break;
5009
5010         default:
5011             return "unknown";
5012     }
5013
5014     if(arg & WINED3DTA_COMPLEMENT) {
5015         shader_addline(buffer, "SUB arg%u, const.x, %s;\n", argnum, ret);
5016         if(argnum == 0) ret = "arg0";
5017         if(argnum == 1) ret = "arg1";
5018         if(argnum == 2) ret = "arg2";
5019     }
5020     if(arg & WINED3DTA_ALPHAREPLICATE) {
5021         shader_addline(buffer, "MOV arg%u, %s.w;\n", argnum, ret);
5022         if(argnum == 0) ret = "arg0";
5023         if(argnum == 1) ret = "arg1";
5024         if(argnum == 2) ret = "arg2";
5025     }
5026     return ret;
5027 }
5028
5029 static void gen_ffp_instr(SHADER_BUFFER *buffer, unsigned int stage, BOOL color, BOOL alpha,
5030                           DWORD dst, DWORD op, DWORD dw_arg0, DWORD dw_arg1, DWORD dw_arg2) {
5031     const char *dstmask, *dstreg, *arg0, *arg1, *arg2;
5032     unsigned int mul = 1;
5033     BOOL mul_final_dest = FALSE;
5034
5035     if(color && alpha) dstmask = "";
5036     else if(color) dstmask = ".xyz";
5037     else dstmask = ".w";
5038
5039     if(dst == tempreg) dstreg = "tempreg";
5040     else dstreg = "ret";
5041
5042     arg0 = get_argreg(buffer, 0, stage, dw_arg0);
5043     arg1 = get_argreg(buffer, 1, stage, dw_arg1);
5044     arg2 = get_argreg(buffer, 2, stage, dw_arg2);
5045
5046     switch(op) {
5047         case WINED3DTOP_DISABLE:
5048             if(stage == 0) shader_addline(buffer, "MOV %s%s, fragment.color.primary;\n", dstreg, dstmask);
5049             break;
5050
5051         case WINED3DTOP_SELECTARG2:
5052             arg1 = arg2;
5053         case WINED3DTOP_SELECTARG1:
5054             shader_addline(buffer, "MOV %s%s, %s;\n", dstreg, dstmask, arg1);
5055             break;
5056
5057         case WINED3DTOP_MODULATE4X:
5058             mul = 2;
5059         case WINED3DTOP_MODULATE2X:
5060             mul *= 2;
5061             if(strcmp(dstreg, "result.color") == 0) {
5062                 dstreg = "ret";
5063                 mul_final_dest = TRUE;
5064             }
5065         case WINED3DTOP_MODULATE:
5066             shader_addline(buffer, "MUL %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
5067             break;
5068
5069         case WINED3DTOP_ADDSIGNED2X:
5070             mul = 2;
5071             if(strcmp(dstreg, "result.color") == 0) {
5072                 dstreg = "ret";
5073                 mul_final_dest = TRUE;
5074             }
5075         case WINED3DTOP_ADDSIGNED:
5076             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
5077             arg2 = "arg2";
5078         case WINED3DTOP_ADD:
5079             shader_addline(buffer, "ADD_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
5080             break;
5081
5082         case WINED3DTOP_SUBTRACT:
5083             shader_addline(buffer, "SUB_SAT %s%s, %s, %s;\n", dstreg, dstmask, arg1, arg2);
5084             break;
5085
5086         case WINED3DTOP_ADDSMOOTH:
5087             shader_addline(buffer, "SUB arg1, const.x, %s;\n", arg1);
5088             shader_addline(buffer, "MAD_SAT %s%s, arg1, %s, %s;\n", dstreg, dstmask, arg2, arg1);
5089             break;
5090
5091         case WINED3DTOP_BLENDCURRENTALPHA:
5092             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_CURRENT);
5093             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5094             break;
5095         case WINED3DTOP_BLENDFACTORALPHA:
5096             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TFACTOR);
5097             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5098             break;
5099         case WINED3DTOP_BLENDTEXTUREALPHA:
5100             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
5101             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5102             break;
5103         case WINED3DTOP_BLENDDIFFUSEALPHA:
5104             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_DIFFUSE);
5105             shader_addline(buffer, "LRP %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5106             break;
5107
5108         case WINED3DTOP_BLENDTEXTUREALPHAPM:
5109             arg0 = get_argreg(buffer, 0, stage, WINED3DTA_TEXTURE);
5110             shader_addline(buffer, "SUB arg0.w, const.x, %s.w;\n", arg0);
5111             shader_addline(buffer, "MAD_SAT %s%s, %s, arg0.w, %s;\n", dstreg, dstmask, arg2, arg1);
5112             break;
5113
5114         /* D3DTOP_PREMODULATE ???? */
5115
5116         case WINED3DTOP_MODULATEINVALPHA_ADDCOLOR:
5117             shader_addline(buffer, "SUB arg0.w, const.x, %s;\n", arg1);
5118             shader_addline(buffer, "MAD_SAT %s%s, arg0.w, %s, %s;\n", dstreg, dstmask, arg2, arg1);
5119             break;
5120         case WINED3DTOP_MODULATEALPHA_ADDCOLOR:
5121             shader_addline(buffer, "MAD_SAT %s%s, %s.w, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg1);
5122             break;
5123         case WINED3DTOP_MODULATEINVCOLOR_ADDALPHA:
5124             shader_addline(buffer, "SUB arg0, const.x, %s;\n", arg1);
5125             shader_addline(buffer, "MAD_SAT %s%s, arg0, %s, %s.w;\n", dstreg, dstmask, arg2, arg1);
5126             break;
5127         case WINED3DTOP_MODULATECOLOR_ADDALPHA:
5128             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s.w;\n", dstreg, dstmask, arg1, arg2, arg1);
5129             break;
5130
5131         case WINED3DTOP_DOTPRODUCT3:
5132             mul = 4;
5133             if(strcmp(dstreg, "result.color") == 0) {
5134                 dstreg = "ret";
5135                 mul_final_dest = TRUE;
5136             }
5137             shader_addline(buffer, "SUB arg1, %s, const.w;\n", arg1);
5138             shader_addline(buffer, "SUB arg2, %s, const.w;\n", arg2);
5139             shader_addline(buffer, "DP3_SAT %s%s, arg1, arg2;\n", dstreg, dstmask);
5140             break;
5141
5142         case WINED3DTOP_MULTIPLYADD:
5143             shader_addline(buffer, "MAD_SAT %s%s, %s, %s, %s;\n", dstreg, dstmask, arg1, arg2, arg0);
5144             break;
5145
5146         case WINED3DTOP_LERP:
5147             /* The msdn is not quite right here */
5148             shader_addline(buffer, "LRP %s%s, %s, %s, %s;\n", dstreg, dstmask, arg0, arg1, arg2);
5149             break;
5150
5151         case WINED3DTOP_BUMPENVMAP:
5152         case WINED3DTOP_BUMPENVMAPLUMINANCE:
5153             /* Those are handled in the first pass of the shader(generation pass 1 and 2) already */
5154             break;
5155
5156         default:
5157             FIXME("Unhandled texture op %08x\n", op);
5158     }
5159
5160     if(mul == 2) {
5161         shader_addline(buffer, "MUL_SAT %s%s, %s, const.y;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
5162     } else if(mul == 4) {
5163         shader_addline(buffer, "MUL_SAT %s%s, %s, const.z;\n", mul_final_dest ? "result.color" : dstreg, dstmask, dstreg);
5164     }
5165 }
5166
5167 /* The stateblock is passed for GLINFO_LOCATION */
5168 static GLuint gen_arbfp_ffp_shader(const struct ffp_frag_settings *settings, IWineD3DStateBlockImpl *stateblock)
5169 {
5170     unsigned int stage;
5171     SHADER_BUFFER buffer;
5172     BOOL tex_read[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
5173     BOOL bump_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
5174     BOOL luminance_used[MAX_TEXTURES] = {FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALSE};
5175     const char *textype;
5176     const char *instr, *sat;
5177     char colorcor_dst[8];
5178     GLuint ret;
5179     DWORD arg0, arg1, arg2;
5180     BOOL tempreg_used = FALSE, tfactor_used = FALSE;
5181     BOOL op_equal;
5182     const char *final_combiner_src = "ret";
5183
5184     /* Find out which textures are read */
5185     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5186         if(settings->op[stage].cop == WINED3DTOP_DISABLE) break;
5187         arg0 = settings->op[stage].carg0 & WINED3DTA_SELECTMASK;
5188         arg1 = settings->op[stage].carg1 & WINED3DTA_SELECTMASK;
5189         arg2 = settings->op[stage].carg2 & WINED3DTA_SELECTMASK;
5190         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5191         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5192         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5193
5194         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHA) tex_read[stage] = TRUE;
5195         if(settings->op[stage].cop == WINED3DTOP_BLENDTEXTUREALPHAPM) tex_read[stage] = TRUE;
5196         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP) {
5197             bump_used[stage] = TRUE;
5198             tex_read[stage] = TRUE;
5199         }
5200         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5201             bump_used[stage] = TRUE;
5202             tex_read[stage] = TRUE;
5203             luminance_used[stage] = TRUE;
5204         } else if(settings->op[stage].cop == WINED3DTOP_BLENDFACTORALPHA) {
5205             tfactor_used = TRUE;
5206         }
5207
5208         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
5209             tfactor_used = TRUE;
5210         }
5211
5212         if(settings->op[stage].dst == tempreg) tempreg_used = TRUE;
5213         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
5214             tempreg_used = TRUE;
5215         }
5216
5217         if(settings->op[stage].aop == WINED3DTOP_DISABLE) continue;
5218         arg0 = settings->op[stage].aarg0 & WINED3DTA_SELECTMASK;
5219         arg1 = settings->op[stage].aarg1 & WINED3DTA_SELECTMASK;
5220         arg2 = settings->op[stage].aarg2 & WINED3DTA_SELECTMASK;
5221         if(arg0 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5222         if(arg1 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5223         if(arg2 == WINED3DTA_TEXTURE) tex_read[stage] = TRUE;
5224
5225         if(arg0 == WINED3DTA_TEMP || arg1 == WINED3DTA_TEMP || arg2 == WINED3DTA_TEMP) {
5226             tempreg_used = TRUE;
5227         }
5228         if(arg0 == WINED3DTA_TFACTOR || arg1 == WINED3DTA_TFACTOR || arg2 == WINED3DTA_TFACTOR) {
5229             tfactor_used = TRUE;
5230         }
5231     }
5232
5233     /* Shader header */
5234     shader_buffer_init(&buffer);
5235
5236     shader_addline(&buffer, "!!ARBfp1.0\n");
5237
5238     switch(settings->fog) {
5239         case FOG_OFF:                                                         break;
5240         case FOG_LINEAR: shader_addline(&buffer, "OPTION ARB_fog_linear;\n"); break;
5241         case FOG_EXP:    shader_addline(&buffer, "OPTION ARB_fog_exp;\n");    break;
5242         case FOG_EXP2:   shader_addline(&buffer, "OPTION ARB_fog_exp2;\n");   break;
5243         default: FIXME("Unexpected fog setting %d\n", settings->fog);
5244     }
5245
5246     shader_addline(&buffer, "PARAM const = {1, 2, 4, 0.5};\n");
5247     shader_addline(&buffer, "TEMP TMP;\n");
5248     shader_addline(&buffer, "TEMP ret;\n");
5249     if(tempreg_used || settings->sRGB_write) shader_addline(&buffer, "TEMP tempreg;\n");
5250     shader_addline(&buffer, "TEMP arg0;\n");
5251     shader_addline(&buffer, "TEMP arg1;\n");
5252     shader_addline(&buffer, "TEMP arg2;\n");
5253     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5254         if(!tex_read[stage]) continue;
5255         shader_addline(&buffer, "TEMP tex%u;\n", stage);
5256         if(!bump_used[stage]) continue;
5257         shader_addline(&buffer, "PARAM bumpmat%u = program.env[%u];\n", stage, ARB_FFP_CONST_BUMPMAT(stage));
5258         if(!luminance_used[stage]) continue;
5259         shader_addline(&buffer, "PARAM luminance%u = program.env[%u];\n", stage, ARB_FFP_CONST_LUMINANCE(stage));
5260     }
5261     if(tfactor_used) {
5262         shader_addline(&buffer, "PARAM tfactor = program.env[%u];\n", ARB_FFP_CONST_TFACTOR);
5263     }
5264         shader_addline(&buffer, "PARAM specular_enable = program.env[%u];\n", ARB_FFP_CONST_SPECULAR_ENABLE);
5265
5266     if(settings->sRGB_write) {
5267         shader_addline(&buffer, "PARAM srgb_consts1 = {%f, %f, %f, %f};\n",
5268                        srgb_mul_low, srgb_cmp, srgb_pow, srgb_mul_high);
5269         shader_addline(&buffer, "PARAM srgb_consts2 = {%f, %f, %f, %f};\n",
5270                        srgb_sub_high, 0.0, 0.0, 0.0);
5271     }
5272
5273     if(ffp_clip_emul(stateblock) && settings->emul_clipplanes) shader_addline(&buffer, "KIL fragment.texcoord[7];\n");
5274
5275     /* Generate texture sampling instructions) */
5276     for(stage = 0; stage < MAX_TEXTURES && settings->op[stage].cop != WINED3DTOP_DISABLE; stage++) {
5277         if(!tex_read[stage]) continue;
5278
5279         switch(settings->op[stage].tex_type) {
5280             case tex_1d:                    textype = "1D";     break;
5281             case tex_2d:                    textype = "2D";     break;
5282             case tex_3d:                    textype = "3D";     break;
5283             case tex_cube:                  textype = "CUBE";   break;
5284             case tex_rect:                  textype = "RECT";   break;
5285             default: textype = "unexpected_textype";   break;
5286         }
5287
5288         if(settings->op[stage].cop == WINED3DTOP_BUMPENVMAP ||
5289            settings->op[stage].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5290             sat = "";
5291         } else {
5292             sat = "_SAT";
5293         }
5294
5295         if(settings->op[stage].projected == proj_none) {
5296             instr = "TEX";
5297         } else if(settings->op[stage].projected == proj_count4 ||
5298                   settings->op[stage].projected == proj_count3) {
5299             instr = "TXP";
5300         } else {
5301             FIXME("Unexpected projection mode %d\n", settings->op[stage].projected);
5302             instr = "TXP";
5303         }
5304
5305         if(stage > 0 &&
5306            (settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAP ||
5307             settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE)) {
5308             shader_addline(&buffer, "SWZ arg1, bumpmat%u, x, z, 0, 0;\n", stage - 1);
5309             shader_addline(&buffer, "DP3 ret.x, arg1, tex%u;\n", stage - 1);
5310             shader_addline(&buffer, "SWZ arg1, bumpmat%u, y, w, 0, 0;\n", stage - 1);
5311             shader_addline(&buffer, "DP3 ret.y, arg1, tex%u;\n", stage - 1);
5312
5313             /* with projective textures, texbem only divides the static texture coord, not the displacement,
5314              * so multiply the displacement with the dividing parameter before passing it to TXP
5315              */
5316             if (settings->op[stage].projected != proj_none) {
5317                 if(settings->op[stage].projected == proj_count4) {
5318                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].w;\n", stage);
5319                     shader_addline(&buffer, "MUL ret.xyz, ret, fragment.texcoord[%u].w, fragment.texcoord[%u];\n", stage, stage);
5320                 } else {
5321                     shader_addline(&buffer, "MOV ret.w, fragment.texcoord[%u].z;\n", stage);
5322                     shader_addline(&buffer, "MAD ret.xyz, ret, fragment.texcoord[%u].z, fragment.texcoord[%u];\n", stage, stage);
5323                 }
5324             } else {
5325                 shader_addline(&buffer, "ADD ret, ret, fragment.texcoord[%u];\n", stage);
5326             }
5327
5328             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
5329                            instr, sat, stage, stage, textype);
5330             if(settings->op[stage - 1].cop == WINED3DTOP_BUMPENVMAPLUMINANCE) {
5331                 shader_addline(&buffer, "MAD_SAT ret.x, tex%u.z, luminance%u.x, luminance%u.y;\n",
5332                                stage - 1, stage - 1, stage - 1);
5333                 shader_addline(&buffer, "MUL tex%u, tex%u, ret.x;\n", stage, stage);
5334             }
5335         } else if(settings->op[stage].projected == proj_count3) {
5336             shader_addline(&buffer, "MOV ret, fragment.texcoord[%u];\n", stage);
5337             shader_addline(&buffer, "MOV ret.w, ret.z;\n");
5338             shader_addline(&buffer, "%s%s tex%u, ret, texture[%u], %s;\n",
5339                             instr, sat, stage, stage, textype);
5340         } else {
5341             shader_addline(&buffer, "%s%s tex%u, fragment.texcoord[%u], texture[%u], %s;\n",
5342                             instr, sat, stage, stage, stage, textype);
5343         }
5344
5345         sprintf(colorcor_dst, "tex%u", stage);
5346         gen_color_correction(&buffer, colorcor_dst, WINED3DSP_WRITEMASK_ALL, "const.x", "const.y",
5347                 settings->op[stage].color_fixup);
5348     }
5349
5350     /* Generate the main shader */
5351     for(stage = 0; stage < MAX_TEXTURES; stage++) {
5352         if(settings->op[stage].cop == WINED3DTOP_DISABLE) {
5353             if(stage == 0) {
5354                 final_combiner_src = "fragment.color.primary";
5355             }
5356             break;
5357         }
5358
5359         if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
5360            settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
5361             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg1;
5362         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG1 &&
5363                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
5364             op_equal = settings->op[stage].carg1 == settings->op[stage].aarg2;
5365         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
5366                   settings->op[stage].aop == WINED3DTOP_SELECTARG1) {
5367             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg1;
5368         } else if(settings->op[stage].cop == WINED3DTOP_SELECTARG2 &&
5369                   settings->op[stage].aop == WINED3DTOP_SELECTARG2) {
5370             op_equal = settings->op[stage].carg2 == settings->op[stage].aarg2;
5371         } else {
5372             op_equal = settings->op[stage].aop   == settings->op[stage].cop &&
5373                        settings->op[stage].carg0 == settings->op[stage].aarg0 &&
5374                        settings->op[stage].carg1 == settings->op[stage].aarg1 &&
5375                        settings->op[stage].carg2 == settings->op[stage].aarg2;
5376         }
5377
5378         if(settings->op[stage].aop == WINED3DTOP_DISABLE) {
5379             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
5380                           settings->op[stage].cop, settings->op[stage].carg0,
5381                           settings->op[stage].carg1, settings->op[stage].carg2);
5382             if(stage == 0) {
5383                 shader_addline(&buffer, "MOV ret.w, fragment.color.primary.w;\n");
5384             }
5385         } else if(op_equal) {
5386             gen_ffp_instr(&buffer, stage, TRUE, TRUE, settings->op[stage].dst,
5387                           settings->op[stage].cop, settings->op[stage].carg0,
5388                           settings->op[stage].carg1, settings->op[stage].carg2);
5389         } else {
5390             gen_ffp_instr(&buffer, stage, TRUE, FALSE, settings->op[stage].dst,
5391                           settings->op[stage].cop, settings->op[stage].carg0,
5392                           settings->op[stage].carg1, settings->op[stage].carg2);
5393             gen_ffp_instr(&buffer, stage, FALSE, TRUE, settings->op[stage].dst,
5394                           settings->op[stage].aop, settings->op[stage].aarg0,
5395                           settings->op[stage].aarg1, settings->op[stage].aarg2);
5396         }
5397     }
5398
5399     if(settings->sRGB_write) {
5400         shader_addline(&buffer, "MAD ret, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
5401         arbfp_add_sRGB_correction(&buffer, "ret", "arg0", "arg1", "arg2", "tempreg", FALSE);
5402     } else {
5403         shader_addline(&buffer, "MAD result.color, fragment.color.secondary, specular_enable, %s;\n", final_combiner_src);
5404     }
5405
5406     /* Footer */
5407     shader_addline(&buffer, "END\n");
5408
5409     /* Generate the shader */
5410     GL_EXTCALL(glGenProgramsARB(1, &ret));
5411     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, ret));
5412     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(buffer.buffer), buffer.buffer));
5413
5414     if (glGetError() == GL_INVALID_OPERATION) {
5415         GLint pos;
5416         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
5417         FIXME("Fragment program error at position %d: %s\n", pos,
5418               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
5419     }
5420     shader_buffer_free(&buffer);
5421     return ret;
5422 }
5423
5424 static void fragment_prog_arbfp(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
5425     IWineD3DDeviceImpl *device = stateblock->wineD3DDevice;
5426     struct shader_arb_priv *priv = device->fragment_priv;
5427     BOOL use_pshader = use_ps(stateblock);
5428     BOOL use_vshader = use_vs(stateblock);
5429     struct ffp_frag_settings settings;
5430     const struct arbfp_ffp_desc *desc;
5431     unsigned int i;
5432
5433     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
5434
5435     if(isStateDirty(context, STATE_RENDER(WINED3DRS_FOGENABLE))) {
5436         if(!use_pshader && device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
5437             /* Reload fixed function constants since they collide with the pixel shader constants */
5438             for(i = 0; i < MAX_TEXTURES; i++) {
5439                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
5440             }
5441             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
5442             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
5443         } else if(use_pshader && !isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
5444             device->shader_backend->shader_select((IWineD3DDevice *)stateblock->wineD3DDevice, use_pshader, use_vshader);
5445         }
5446         return;
5447     }
5448
5449     if(!use_pshader) {
5450         /* Find or create a shader implementing the fixed function pipeline settings, then activate it */
5451         gen_ffp_frag_op(stateblock, &settings, FALSE);
5452         desc = (const struct arbfp_ffp_desc *)find_ffp_frag_shader(&priv->fragment_shaders, &settings);
5453         if(!desc) {
5454             struct arbfp_ffp_desc *new_desc = HeapAlloc(GetProcessHeap(), 0, sizeof(*new_desc));
5455             if (!new_desc)
5456             {
5457                 ERR("Out of memory\n");
5458                 return;
5459             }
5460             new_desc->num_textures_used = 0;
5461             for(i = 0; i < GL_LIMITS(texture_stages); i++) {
5462                 if(settings.op[i].cop == WINED3DTOP_DISABLE) break;
5463                 new_desc->num_textures_used = i;
5464             }
5465
5466             memcpy(&new_desc->parent.settings, &settings, sizeof(settings));
5467             new_desc->shader = gen_arbfp_ffp_shader(&settings, stateblock);
5468             add_ffp_frag_shader(&priv->fragment_shaders, &new_desc->parent);
5469             TRACE("Allocated fixed function replacement shader descriptor %p\n", new_desc);
5470             desc = new_desc;
5471         }
5472
5473         /* Now activate the replacement program. GL_FRAGMENT_PROGRAM_ARB is already active(however, note the
5474          * comment above the shader_select call below). If e.g. GLSL is active, the shader_select call will
5475          * deactivate it.
5476          */
5477         GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader));
5478         checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, desc->shader)");
5479         priv->current_fprogram_id = desc->shader;
5480
5481         if(device->shader_backend == &arb_program_shader_backend && context->last_was_pshader) {
5482             /* Reload fixed function constants since they collide with the pixel shader constants */
5483             for(i = 0; i < MAX_TEXTURES; i++) {
5484                 set_bumpmat_arbfp(STATE_TEXTURESTAGE(i, WINED3DTSS_BUMPENVMAT00), stateblock, context);
5485             }
5486             state_texfactor_arbfp(STATE_RENDER(WINED3DRS_TEXTUREFACTOR), stateblock, context);
5487             state_arb_specularenable(STATE_RENDER(WINED3DRS_SPECULARENABLE), stateblock, context);
5488         }
5489         context->last_was_pshader = FALSE;
5490     } else {
5491         context->last_was_pshader = TRUE;
5492     }
5493
5494     /* Finally, select the shader. If a pixel shader is used, it will be set and enabled by the shader backend.
5495      * If this shader backend is arbfp(most likely), then it will simply overwrite the last fixed function replace-
5496      * ment shader. If the shader backend is not ARB, it currently is important that the opengl implementation
5497      * type overwrites GL_ARB_fragment_program. This is currently the case with GLSL. If we really want to use
5498      * atifs or nvrc pixel shaders with arb fragment programs we'd have to disable GL_FRAGMENT_PROGRAM_ARB here
5499      *
5500      * Don't call shader_select if the vertex shader is dirty, because it will be called later on by the vertex
5501      * shader handler
5502      */
5503     if(!isStateDirty(context, device->StateTable[STATE_VSHADER].representative)) {
5504         device->shader_backend->shader_select((IWineD3DDevice *)stateblock->wineD3DDevice, use_pshader, use_vshader);
5505
5506         if (!isStateDirty(context, STATE_VERTEXSHADERCONSTANT) && (use_vshader || use_pshader)) {
5507             device->StateTable[STATE_VERTEXSHADERCONSTANT].apply(STATE_VERTEXSHADERCONSTANT, stateblock, context);
5508         }
5509     }
5510     if(use_pshader) {
5511         device->StateTable[STATE_PIXELSHADERCONSTANT].apply(STATE_PIXELSHADERCONSTANT, stateblock, context);
5512     }
5513 }
5514
5515 /* We can't link the fog states to the fragment state directly since the vertex pipeline links them
5516  * to FOGENABLE. A different linking in different pipeline parts can't be expressed in the combined
5517  * state table, so we need to handle that with a forwarding function. The other invisible side effect
5518  * is that changing the fog start and fog end(which links to FOGENABLE in vertex) results in the
5519  * fragment_prog_arbfp function being called because FOGENABLE is dirty, which calls this function here
5520  */
5521 static void state_arbfp_fog(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
5522     enum fogsource new_source;
5523
5524     TRACE("state %#x, stateblock %p, context %p\n", state, stateblock, context);
5525
5526     if(!isStateDirty(context, STATE_PIXELSHADER)) {
5527         fragment_prog_arbfp(state, stateblock, context);
5528     }
5529
5530     if(!stateblock->renderState[WINED3DRS_FOGENABLE]) return;
5531
5532     if(stateblock->renderState[WINED3DRS_FOGTABLEMODE] == WINED3DFOG_NONE) {
5533         if(use_vs(stateblock)) {
5534             new_source = FOGSOURCE_VS;
5535         } else {
5536             if(stateblock->renderState[WINED3DRS_FOGVERTEXMODE] == WINED3DFOG_NONE || context->last_was_rhw) {
5537                 new_source = FOGSOURCE_COORD;
5538             } else {
5539                 new_source = FOGSOURCE_FFP;
5540             }
5541         }
5542     } else {
5543         new_source = FOGSOURCE_FFP;
5544     }
5545     if(new_source != context->fog_source) {
5546         context->fog_source = new_source;
5547         state_fogstartend(STATE_RENDER(WINED3DRS_FOGSTART), stateblock, context);
5548     }
5549 }
5550
5551 static void textransform(DWORD state, IWineD3DStateBlockImpl *stateblock, WineD3DContext *context) {
5552     if(!isStateDirty(context, STATE_PIXELSHADER)) {
5553         fragment_prog_arbfp(state, stateblock, context);
5554     }
5555 }
5556
5557 #undef GLINFO_LOCATION
5558
/* State table for the ARB fragment program replacement of the fixed-function
 * fragment pipeline.
 *
 * Every row has three columns: the state being registered, a { state, handler }
 * pair, and a GL extension tag (WINED3D_GL_EXT_NONE for all rows here). Several
 * rows name a different state in the pair than in the first column, so groups
 * of related states end up behind one shared entry:
 *   - the colour/alpha op, the colour/alpha arguments and the result argument
 *     of stages 0-7, plus WINED3DRS_SRGBWRITEENABLE, all point at
 *     STATE_PIXELSHADER / fragment_prog_arbfp;
 *   - the four BUMPENVMAT states of a stage point at that stage's
 *     BUMPENVMAT00 / set_bumpmat_arbfp;
 *   - BUMPENVLSCALE and BUMPENVLOFFSET point at BUMPENVLSCALE /
 *     tex_bumpenvlum_arbfp;
 *   - FOGENABLE, FOGTABLEMODE and FOGVERTEXMODE point at FOGENABLE /
 *     state_arbfp_fog, FOGSTART and FOGEND at FOGSTART / state_fogstartend.
 * NOTE(review): the precise meaning of the middle pair (presumably
 * "representative state" plus apply function) is defined by
 * struct StateEntryTemplate, which is not visible here - confirm there.
 *
 * The all-zero row at the end terminates the table.
 */
5559 static const struct StateEntryTemplate arbfp_fragmentstate_template[] = {
5560     {STATE_RENDER(WINED3DRS_TEXTUREFACTOR),               { STATE_RENDER(WINED3DRS_TEXTUREFACTOR),              state_texfactor_arbfp   }, WINED3D_GL_EXT_NONE             },
5561     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5562     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5563     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5564     {STATE_TEXTURESTAGE(0, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5565     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5566     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5567     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5568     {STATE_TEXTURESTAGE(0, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5569     {STATE_TEXTURESTAGE(0, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5570     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5571     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5572     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5573     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5574     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5575     {STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(0, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5576     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5577     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5578     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5579     {STATE_TEXTURESTAGE(1, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5580     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5581     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5582     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5583     {STATE_TEXTURESTAGE(1, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5584     {STATE_TEXTURESTAGE(1, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5585     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5586     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5587     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5588     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5589     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5590     {STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(1, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5591     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5592     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5593     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5594     {STATE_TEXTURESTAGE(2, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5595     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5596     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5597     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5598     {STATE_TEXTURESTAGE(2, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5599     {STATE_TEXTURESTAGE(2, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5600     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5601     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5602     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5603     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5604     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5605     {STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(2, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5606     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5607     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5608     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5609     {STATE_TEXTURESTAGE(3, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5610     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5611     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5612     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5613     {STATE_TEXTURESTAGE(3, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5614     {STATE_TEXTURESTAGE(3, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5615     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5616     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5617     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5618     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5619     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5620     {STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(3, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5621     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5622     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5623     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5624     {STATE_TEXTURESTAGE(4, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5625     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5626     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5627     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5628     {STATE_TEXTURESTAGE(4, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5629     {STATE_TEXTURESTAGE(4, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5630     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5631     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5632     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5633     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5634     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5635     {STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(4, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5636     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5637     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5638     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5639     {STATE_TEXTURESTAGE(5, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5640     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5641     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5642     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5643     {STATE_TEXTURESTAGE(5, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5644     {STATE_TEXTURESTAGE(5, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5645     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5646     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5647     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5648     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5649     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5650     {STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(5, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5651     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5652     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5653     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5654     {STATE_TEXTURESTAGE(6, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5655     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5656     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5657     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5658     {STATE_TEXTURESTAGE(6, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5659     {STATE_TEXTURESTAGE(6, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5660     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5661     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5662     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5663     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5664     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5665     {STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(6, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5666     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLOROP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5667     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5668     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5669     {STATE_TEXTURESTAGE(7, WINED3DTSS_COLORARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5670     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAOP),           { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5671     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG1),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5672     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG2),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5673     {STATE_TEXTURESTAGE(7, WINED3DTSS_ALPHAARG0),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5674     {STATE_TEXTURESTAGE(7, WINED3DTSS_RESULTARG),         { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5675     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5676     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT01),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5677     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT10),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5678     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT11),      { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVMAT00),     set_bumpmat_arbfp       }, WINED3D_GL_EXT_NONE             },
5679     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),     { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5680     {STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLOFFSET),    { STATE_TEXTURESTAGE(7, WINED3DTSS_BUMPENVLSCALE),    tex_bumpenvlum_arbfp    }, WINED3D_GL_EXT_NONE             },
5681     {STATE_SAMPLER(0),                                    { STATE_SAMPLER(0),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5682     {STATE_SAMPLER(1),                                    { STATE_SAMPLER(1),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5683     {STATE_SAMPLER(2),                                    { STATE_SAMPLER(2),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5684     {STATE_SAMPLER(3),                                    { STATE_SAMPLER(3),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5685     {STATE_SAMPLER(4),                                    { STATE_SAMPLER(4),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5686     {STATE_SAMPLER(5),                                    { STATE_SAMPLER(5),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5687     {STATE_SAMPLER(6),                                    { STATE_SAMPLER(6),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5688     {STATE_SAMPLER(7),                                    { STATE_SAMPLER(7),                                   sampler_texdim          }, WINED3D_GL_EXT_NONE             },
5689     {STATE_PIXELSHADER,                                   { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5690     {STATE_RENDER(WINED3DRS_FOGENABLE),                   { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
5691     {STATE_RENDER(WINED3DRS_FOGTABLEMODE),                { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
5692     {STATE_RENDER(WINED3DRS_FOGVERTEXMODE),               { STATE_RENDER(WINED3DRS_FOGENABLE),                  state_arbfp_fog         }, WINED3D_GL_EXT_NONE             },
5693     {STATE_RENDER(WINED3DRS_FOGSTART),                    { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
5694     {STATE_RENDER(WINED3DRS_FOGEND),                      { STATE_RENDER(WINED3DRS_FOGSTART),                   state_fogstartend       }, WINED3D_GL_EXT_NONE             },
5695     {STATE_RENDER(WINED3DRS_SRGBWRITEENABLE),             { STATE_PIXELSHADER,                                  fragment_prog_arbfp     }, WINED3D_GL_EXT_NONE             },
5696     {STATE_RENDER(WINED3DRS_FOGCOLOR),                    { STATE_RENDER(WINED3DRS_FOGCOLOR),                   state_fogcolor          }, WINED3D_GL_EXT_NONE             },
5697     {STATE_RENDER(WINED3DRS_FOGDENSITY),                  { STATE_RENDER(WINED3DRS_FOGDENSITY),                 state_fogdensity        }, WINED3D_GL_EXT_NONE             },
5698     {STATE_TEXTURESTAGE(0,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(0, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5699     {STATE_TEXTURESTAGE(1,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(1, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5700     {STATE_TEXTURESTAGE(2,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(2, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5701     {STATE_TEXTURESTAGE(3,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(3, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5702     {STATE_TEXTURESTAGE(4,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(4, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5703     {STATE_TEXTURESTAGE(5,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(5, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5704     {STATE_TEXTURESTAGE(6,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(6, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5705     {STATE_TEXTURESTAGE(7,WINED3DTSS_TEXTURETRANSFORMFLAGS),{STATE_TEXTURESTAGE(7, WINED3DTSS_TEXTURETRANSFORMFLAGS), textransform      }, WINED3D_GL_EXT_NONE             },
5706     {STATE_RENDER(WINED3DRS_SPECULARENABLE),              { STATE_RENDER(WINED3DRS_SPECULARENABLE),             state_arb_specularenable}, WINED3D_GL_EXT_NONE             },
5707     {0 /* Terminate */,                                   { 0,                                                  0                       }, WINED3D_GL_EXT_NONE             },
5708 };
5709
/* Fragment pipeline descriptor exported for the ARB fragment program backend.
 *
 * The initializer is positional: it lists this backend's enable, get_caps,
 * alloc and free functions, the shader_arb_color_fixup_supported query, the
 * arbfp_fragmentstate_template state table and a trailing boolean flag.
 * NOTE(review): the member names and their order come from
 * struct fragment_pipeline, which is not visible here - keep this list in
 * sync with that declaration.
 */
5710 const struct fragment_pipeline arbfp_fragment_pipeline = {
5711     arbfp_enable,
5712     arbfp_get_caps,
5713     arbfp_alloc,
5714     arbfp_free,
5715     shader_arb_color_fixup_supported,
5716     arbfp_fragmentstate_template,
5717     TRUE /* We can disable projected textures */
5718 };
5719
/* From here on GLINFO_LOCATION expands to device->adapter->gl_info, so any code
 * that expands it needs a local named `device` in scope. NOTE(review): it is
 * presumably consumed by GL_EXTCALL() in the blit helpers - confirm against the
 * macro definition. */
5720 #define GLINFO_LOCATION device->adapter->gl_info
5721
5722 struct arbfp_blit_priv {
5723     GLenum yuy2_rect_shader, yuy2_2d_shader;
5724     GLenum uyvy_rect_shader, uyvy_2d_shader;
5725     GLenum yv12_rect_shader, yv12_2d_shader;
5726 };
5727
5728 static HRESULT arbfp_blit_alloc(IWineD3DDevice *iface) {
5729     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
5730     device->blit_priv = HeapAlloc(GetProcessHeap(), HEAP_ZERO_MEMORY, sizeof(struct arbfp_blit_priv));
5731     if(!device->blit_priv) {
5732         ERR("Out of memory\n");
5733         return E_OUTOFMEMORY;
5734     }
5735     return WINED3D_OK;
5736 }
5737 static void arbfp_blit_free(IWineD3DDevice *iface) {
5738     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
5739     struct arbfp_blit_priv *priv = device->blit_priv;
5740
5741     ENTER_GL();
5742     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_rect_shader));
5743     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yuy2_2d_shader));
5744     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_rect_shader));
5745     GL_EXTCALL(glDeleteProgramsARB(1, &priv->uyvy_2d_shader));
5746     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_rect_shader));
5747     GL_EXTCALL(glDeleteProgramsARB(1, &priv->yv12_2d_shader));
5748     checkGLcall("Delete yuv programs\n");
5749     LEAVE_GL();
5750 }
5751
5752 static BOOL gen_planar_yuv_read(SHADER_BUFFER *buffer, enum yuv_fixup yuv_fixup, GLenum textype, char *luminance)
5753 {
5754     char chroma;
5755     const char *tex, *texinstr;
5756
5757     if (yuv_fixup == YUV_FIXUP_UYVY) {
5758         chroma = 'x';
5759         *luminance = 'w';
5760     } else {
5761         chroma = 'w';
5762         *luminance = 'x';
5763     }
5764     switch(textype) {
5765         case GL_TEXTURE_2D:             tex = "2D";     texinstr = "TXP"; break;
5766         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   texinstr = "TEX"; break;
5767         default:
5768             /* This is more tricky than just replacing the texture type - we have to navigate
5769              * properly in the texture to find the correct chroma values
5770              */
5771             FIXME("Implement yuv correction for non-2d, non-rect textures\n");
5772             return FALSE;
5773     }
5774
5775     /* First we have to read the chroma values. This means we need at least two pixels(no filtering),
5776      * or 4 pixels(with filtering). To get the unmodified chromas, we have to rid ourselves of the
5777      * filtering when we sample the texture.
5778      *
5779      * These are the rules for reading the chroma:
5780      *
5781      * Even pixel: Cr
5782      * Even pixel: U
5783      * Odd pixel: V
5784      *
5785      * So we have to get the sampling x position in non-normalized coordinates in integers
5786      */
5787     if(textype != GL_TEXTURE_RECTANGLE_ARB) {
5788         shader_addline(buffer, "MUL texcrd.xy, fragment.texcoord[0], size.x;\n");
5789         shader_addline(buffer, "MOV texcrd.w, size.x;\n");
5790     } else {
5791         shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
5792     }
5793     /* We must not allow filtering between pixel x and x+1, this would mix U and V
5794      * Vertical filtering is ok. However, bear in mind that the pixel center is at
5795      * 0.5, so add 0.5.
5796      */
5797     shader_addline(buffer, "FLR texcrd.x, texcrd.x;\n");
5798     shader_addline(buffer, "ADD texcrd.x, texcrd.x, coef.y;\n");
5799
5800     /* Divide the x coordinate by 0.5 and get the fraction. This gives 0.25 and 0.75 for the
5801      * even and odd pixels respectively
5802      */
5803     shader_addline(buffer, "MUL texcrd2, texcrd, coef.y;\n");
5804     shader_addline(buffer, "FRC texcrd2, texcrd2;\n");
5805
5806     /* Sample Pixel 1 */
5807     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
5808
5809     /* Put the value into either of the chroma values */
5810     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
5811     shader_addline(buffer, "MUL chroma.x, luminance.%c, temp.x;\n", chroma);
5812     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
5813     shader_addline(buffer, "MUL chroma.y, luminance.%c, temp.x;\n", chroma);
5814
5815     /* Sample pixel 2. If we read an even pixel(SLT above returned 1), sample
5816      * the pixel right to the current one. Otherwise, sample the left pixel.
5817      * Bias and scale the SLT result to -1;1 and add it to the texcrd.x.
5818      */
5819     shader_addline(buffer, "MAD temp.x, temp.x, coef.z, -coef.x;\n");
5820     shader_addline(buffer, "ADD texcrd.x, texcrd, temp.x;\n");
5821     shader_addline(buffer, "%s luminance, texcrd, texture[0], %s;\n", texinstr, tex);
5822
5823     /* Put the value into the other chroma */
5824     shader_addline(buffer, "SGE temp.x, texcrd2.x, coef.y;\n");
5825     shader_addline(buffer, "MAD chroma.y, luminance.%c, temp.x, chroma.y;\n", chroma);
5826     shader_addline(buffer, "SLT temp.x, texcrd2.x, coef.y;\n");
5827     shader_addline(buffer, "MAD chroma.x, luminance.%c, temp.x, chroma.x;\n", chroma);
5828
5829     /* TODO: If filtering is enabled, sample a 2nd pair of pixels left or right of
5830      * the current one and lerp the two U and V values
5831      */
5832
5833     /* This gives the correctly filtered luminance value */
5834     shader_addline(buffer, "TEX luminance, fragment.texcoord[0], texture[0], %s;\n", tex);
5835
5836     return TRUE;
5837 }
5838
5839 static BOOL gen_yv12_read(SHADER_BUFFER *buffer, GLenum textype, char *luminance)
5840 {
5841     const char *tex;
5842
5843     switch(textype) {
5844         case GL_TEXTURE_2D:             tex = "2D";     break;
5845         case GL_TEXTURE_RECTANGLE_ARB:  tex = "RECT";   break;
5846         default:
5847             FIXME("Implement yv12 correction for non-2d, non-rect textures\n");
5848             return FALSE;
5849     }
5850
5851     /* YV12 surfaces contain a WxH sized luminance plane, followed by a (W/2)x(H/2)
5852      * V and a (W/2)x(H/2) U plane, each with 8 bit per pixel. So the effective
5853      * bitdepth is 12 bits per pixel. Since the U and V planes have only half the
5854      * pitch of the luminance plane, the packing into the gl texture is a bit
5855      * unfortunate. If the whole texture is interpreted as luminance data it looks
5856      * approximately like this:
5857      *
5858      *        +----------------------------------+----
5859      *        |                                  |
5860      *        |                                  |
5861      *        |                                  |
5862      *        |                                  |
5863      *        |                                  |   2
5864      *        |            LUMINANCE             |   -
5865      *        |                                  |   3
5866      *        |                                  |
5867      *        |                                  |
5868      *        |                                  |
5869      *        |                                  |
5870      *        +----------------+-----------------+----
5871      *        |                |                 |
5872      *        |  U even rows   |  U odd rows     |
5873      *        |                |                 |   1
5874      *        +----------------+------------------   -
5875      *        |                |                 |   3
5876      *        |  V even rows   |  V odd rows     |
5877      *        |                |                 |
5878      *        +----------------+-----------------+----
5879      *        |                |                 |
5880      *        |     0.5        |       0.5       |
5881      *
5882      * So it appears as if there are 4 chroma images, but in fact the odd rows
5883      * in the chroma images are in the same row as the even ones. So its is
5884      * kinda tricky to read
5885      *
5886      * When reading from rectangle textures, keep in mind that the input y coordinates
5887      * go from 0 to d3d_height, whereas the opengl texture height is 1.5 * d3d_height
5888      */
5889     shader_addline(buffer, "PARAM yv12_coef = {%f, %f, %f, %f};\n",
5890                    2.0 / 3.0, 1.0 / 6.0, (2.0 / 3.0) + (1.0 / 6.0), 1.0 / 3.0);
5891
5892     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
5893     /* the chroma planes have only half the width */
5894     shader_addline(buffer, "MUL texcrd.x, texcrd.x, coef.y;\n");
5895
5896     /* The first value is between 2/3 and 5/6th of the texture's height, so scale+bias
5897      * the coordinate. Also read the right side of the image when reading odd lines
5898      *
5899      * Don't forget to clamp the y values in into the range, otherwise we'll get filtering
5900      * bleeding
5901      */
5902     if(textype == GL_TEXTURE_2D) {
5903
5904         shader_addline(buffer, "RCP chroma.w, size.y;\n");
5905
5906         shader_addline(buffer, "MUL texcrd2.y, texcrd.y, size.y;\n");
5907
5908         shader_addline(buffer, "FLR texcrd2.y, texcrd2.y;\n");
5909         shader_addline(buffer, "MAD texcrd.y, texcrd.y, yv12_coef.y, yv12_coef.x;\n");
5910
5911         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
5912         shader_addline(buffer, "ADD texcrd2.x, texcrd2.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
5913         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
5914         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
5915         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
5916
5917         /* clamp, keep the half pixel origin in mind */
5918         shader_addline(buffer, "MAD temp.y, coef.y, chroma.w, yv12_coef.x;\n");
5919         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
5920         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.z;\n");
5921         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
5922     } else {
5923         /* Read from [size - size+size/4] */
5924         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
5925         shader_addline(buffer, "MAD texcrd.y, texcrd.y, coef.w, size.y;\n");
5926
5927         /* Read odd lines from the right side(add size * 0.5 to the x coordinate */
5928         shader_addline(buffer, "ADD texcrd2.x, texcrd.y, yv12_coef.y;\n"); /* To avoid 0.5 == 0.5 comparisons */
5929         shader_addline(buffer, "FRC texcrd2.x, texcrd2.x;\n");
5930         shader_addline(buffer, "SGE texcrd2.x, texcrd2.x, coef.y;\n");
5931         shader_addline(buffer, "MUL texcrd2.x, texcrd2.x, size.x;\n");
5932         shader_addline(buffer, "MAD texcrd.x, texcrd2.x, coef.y, texcrd.x;\n");
5933
5934         /* Make sure to read exactly from the pixel center */
5935         shader_addline(buffer, "FLR texcrd.y, texcrd.y;\n");
5936         shader_addline(buffer, "ADD texcrd.y, texcrd.y, coef.y;\n");
5937
5938         /* Clamp */
5939         shader_addline(buffer, "MAD temp.y, size.y, coef.w, size.y;\n");
5940         shader_addline(buffer, "ADD temp.y, temp.y, -coef.y;\n");
5941         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
5942         shader_addline(buffer, "ADD temp.y, size.y, -coef.y;\n");
5943         shader_addline(buffer, "MAX texcrd.y, temp.y, texcrd.y;\n");
5944     }
5945     /* Read the texture, put the result into the output register */
5946     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
5947     shader_addline(buffer, "MOV chroma.x, temp.w;\n");
5948
5949     /* The other chroma value is 1/6th of the texture lower, from 5/6th to 6/6th
5950      * No need to clamp because we're just reusing the already clamped value from above
5951      */
5952     if(textype == GL_TEXTURE_2D) {
5953         shader_addline(buffer, "ADD texcrd.y, texcrd.y, yv12_coef.y;\n");
5954     } else {
5955         shader_addline(buffer, "MAD texcrd.y, size.y, coef.w, texcrd.y;\n");
5956     }
5957     shader_addline(buffer, "TEX temp, texcrd, texture[0], %s;\n", tex);
5958     shader_addline(buffer, "MOV chroma.y, temp.w;\n");
5959
5960     /* Sample the luminance value. It is in the top 2/3rd of the texture, so scale the y coordinate.
5961      * Clamp the y coordinate to prevent the chroma values from bleeding into the sampled luminance
5962      * values due to filtering
5963      */
5964     shader_addline(buffer, "MOV texcrd, fragment.texcoord[0];\n");
5965     if(textype == GL_TEXTURE_2D) {
5966         /* Multiply the y coordinate by 2/3 and clamp it */
5967         shader_addline(buffer, "MUL texcrd.y, texcrd.y, yv12_coef.x;\n");
5968         shader_addline(buffer, "MAD temp.y, -coef.y, chroma.w, yv12_coef.x;\n");
5969         shader_addline(buffer, "MIN texcrd.y, temp.y, texcrd.y;\n");
5970         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
5971     } else {
5972         /* Reading from texture_rectangles is pretty straightforward, just use the unmodified
5973          * texture coordinate. It is still a good idea to clamp it though, since the opengl texture
5974          * is bigger
5975          */
5976         shader_addline(buffer, "ADD temp.x, size.y, -coef.y;\n");
5977         shader_addline(buffer, "MIN texcrd.y, texcrd.y, size.x;\n");
5978         shader_addline(buffer, "TEX luminance, texcrd, texture[0], %s;\n", tex);
5979     }
5980     *luminance = 'a';
5981
5982     return TRUE;
5983 }
5984
5985 static GLuint gen_yuv_shader(IWineD3DDeviceImpl *device, enum yuv_fixup yuv_fixup, GLenum textype)
5986 {
5987     GLenum shader;
5988     SHADER_BUFFER buffer;
5989     char luminance_component;
5990     struct arbfp_blit_priv *priv = device->blit_priv;
5991
5992     /* Shader header */
5993     shader_buffer_init(&buffer);
5994
5995     ENTER_GL();
5996     GL_EXTCALL(glGenProgramsARB(1, &shader));
5997     checkGLcall("GL_EXTCALL(glGenProgramsARB(1, &shader))");
5998     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
5999     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
6000     LEAVE_GL();
6001     if(!shader) {
6002         shader_buffer_free(&buffer);
6003         return 0;
6004     }
6005
6006     /* The YUY2 and UYVY formats contain two pixels packed into a 32 bit macropixel,
6007      * giving effectively 16 bit per pixel. The color consists of a luminance(Y) and
6008      * two chroma(U and V) values. Each macropixel has two luminance values, one for
6009      * each single pixel it contains, and one U and one V value shared between both
6010      * pixels.
6011      *
6012      * The data is loaded into an A8L8 texture. With YUY2, the luminance component
6013      * contains the luminance and alpha the chroma. With UYVY it is vice versa. Thus
6014      * take the format into account when generating the read swizzles
6015      *
6016      * Reading the Y value is straightforward - just sample the texture. The hardware
6017      * takes care of filtering in the horizontal and vertical direction.
6018      *
6019      * Reading the U and V values is harder. We have to avoid filtering horizontally,
6020      * because that would mix the U and V values of one pixel or two adjacent pixels.
6021      * Thus floor the texture coordinate and add 0.5 to get an unfiltered read,
6022      * regardless of the filtering setting. Vertical filtering works automatically
6023      * though - the U and V values of two rows are mixed nicely.
6024      *
6025      * Appart of avoiding filtering issues, the code has to know which value it just
6026      * read, and where it can find the other one. To determine this, it checks if
6027      * it sampled an even or odd pixel, and shifts the 2nd read accordingly.
6028      *
6029      * Handling horizontal filtering of U and V values requires reading a 2nd pair
6030      * of pixels, extracting U and V and mixing them. This is not implemented yet.
6031      *
6032      * An alternative implementation idea is to load the texture as A8R8G8B8 texture,
6033      * with width / 2. This way one read gives all 3 values, finding U and V is easy
6034      * in an unfiltered situation. Finding the luminance on the other hand requires
6035      * finding out if it is an odd or even pixel. The real drawback of this approach
6036      * is filtering. This would have to be emulated completely in the shader, reading
6037      * up two 2 packed pixels in up to 2 rows and interpolating both horizontally and
6038      * vertically. Beyond that it would require adjustments to the texture handling
6039      * code to deal with the width scaling
6040      */
6041     shader_addline(&buffer, "!!ARBfp1.0\n");
6042     shader_addline(&buffer, "TEMP luminance;\n");
6043     shader_addline(&buffer, "TEMP temp;\n");
6044     shader_addline(&buffer, "TEMP chroma;\n");
6045     shader_addline(&buffer, "TEMP texcrd;\n");
6046     shader_addline(&buffer, "TEMP texcrd2;\n");
6047     shader_addline(&buffer, "PARAM coef = {1.0, 0.5, 2.0, 0.25};\n");
6048     shader_addline(&buffer, "PARAM yuv_coef = {1.403, 0.344, 0.714, 1.770};\n");
6049     shader_addline(&buffer, "PARAM size = program.local[0];\n");
6050
6051     switch (yuv_fixup)
6052     {
6053         case YUV_FIXUP_UYVY:
6054         case YUV_FIXUP_YUY2:
6055             if (!gen_planar_yuv_read(&buffer, yuv_fixup, textype, &luminance_component))
6056             {
6057                 shader_buffer_free(&buffer);
6058                 return 0;
6059             }
6060             break;
6061
6062         case YUV_FIXUP_YV12:
6063             if (!gen_yv12_read(&buffer, textype, &luminance_component))
6064             {
6065                 shader_buffer_free(&buffer);
6066                 return 0;
6067             }
6068             break;
6069
6070         default:
6071             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
6072             shader_buffer_free(&buffer);
6073             return 0;
6074     }
6075
6076     /* Calculate the final result. Formula is taken from
6077      * http://www.fourcc.org/fccyvrgb.php. Note that the chroma
6078      * ranges from -0.5 to 0.5
6079      */
6080     shader_addline(&buffer, "SUB chroma.xy, chroma, coef.y;\n");
6081
6082     shader_addline(&buffer, "MAD result.color.x, chroma.x, yuv_coef.x, luminance.%c;\n", luminance_component);
6083     shader_addline(&buffer, "MAD temp.x, -chroma.y, yuv_coef.y, luminance.%c;\n", luminance_component);
6084     shader_addline(&buffer, "MAD result.color.y, -chroma.x, yuv_coef.z, temp.x;\n");
6085     shader_addline(&buffer, "MAD result.color.z, chroma.y, yuv_coef.w, luminance.%c;\n", luminance_component);
6086     shader_addline(&buffer, "END\n");
6087
6088     ENTER_GL();
6089     GL_EXTCALL(glProgramStringARB(GL_FRAGMENT_PROGRAM_ARB, GL_PROGRAM_FORMAT_ASCII_ARB, strlen(buffer.buffer), buffer.buffer));
6090
6091     if (glGetError() == GL_INVALID_OPERATION) {
6092         GLint pos;
6093         glGetIntegerv(GL_PROGRAM_ERROR_POSITION_ARB, &pos);
6094         FIXME("Fragment program error at position %d: %s\n", pos,
6095               debugstr_a((const char *)glGetString(GL_PROGRAM_ERROR_STRING_ARB)));
6096     }
6097     shader_buffer_free(&buffer);
6098     LEAVE_GL();
6099
6100     switch (yuv_fixup)
6101     {
6102         case YUV_FIXUP_YUY2:
6103             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yuy2_rect_shader = shader;
6104             else priv->yuy2_2d_shader = shader;
6105             break;
6106
6107         case YUV_FIXUP_UYVY:
6108             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->uyvy_rect_shader = shader;
6109             else priv->uyvy_2d_shader = shader;
6110             break;
6111
6112         case YUV_FIXUP_YV12:
6113             if (textype == GL_TEXTURE_RECTANGLE_ARB) priv->yv12_rect_shader = shader;
6114             else priv->yv12_2d_shader = shader;
6115             break;
6116     }
6117
6118     return shader;
6119 }
6120
6121 static HRESULT arbfp_blit_set(IWineD3DDevice *iface, const struct GlPixelFormatDesc *format_desc,
6122         GLenum textype, UINT width, UINT height)
6123 {
6124     GLenum shader;
6125     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
6126     float size[4] = {width, height, 1, 1};
6127     struct arbfp_blit_priv *priv = device->blit_priv;
6128     enum yuv_fixup yuv_fixup;
6129
6130     if (!is_yuv_fixup(format_desc->color_fixup))
6131     {
6132         TRACE("Fixup:\n");
6133         dump_color_fixup_desc(format_desc->color_fixup);
6134         /* Don't bother setting up a shader for unconverted formats */
6135         ENTER_GL();
6136         glEnable(textype);
6137         checkGLcall("glEnable(textype)");
6138         LEAVE_GL();
6139         return WINED3D_OK;
6140     }
6141
6142     yuv_fixup = get_yuv_fixup(format_desc->color_fixup);
6143
6144     switch(yuv_fixup)
6145     {
6146         case YUV_FIXUP_YUY2:
6147             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yuy2_rect_shader : priv->yuy2_2d_shader;
6148             break;
6149
6150         case YUV_FIXUP_UYVY:
6151             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->uyvy_rect_shader : priv->uyvy_2d_shader;
6152             break;
6153
6154         case YUV_FIXUP_YV12:
6155             shader = textype == GL_TEXTURE_RECTANGLE_ARB ? priv->yv12_rect_shader : priv->yv12_2d_shader;
6156             break;
6157
6158         default:
6159             FIXME("Unsupported YUV fixup %#x, not setting a shader\n", yuv_fixup);
6160             ENTER_GL();
6161             glEnable(textype);
6162             checkGLcall("glEnable(textype)");
6163             LEAVE_GL();
6164             return E_NOTIMPL;
6165     }
6166
6167     if (!shader) shader = gen_yuv_shader(device, yuv_fixup, textype);
6168
6169     ENTER_GL();
6170     glEnable(GL_FRAGMENT_PROGRAM_ARB);
6171     checkGLcall("glEnable(GL_FRAGMENT_PROGRAM_ARB)");
6172     GL_EXTCALL(glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader));
6173     checkGLcall("glBindProgramARB(GL_FRAGMENT_PROGRAM_ARB, shader)");
6174     GL_EXTCALL(glProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0, size));
6175     checkGLcall("glProgramLocalParameter4fvARB");
6176     LEAVE_GL();
6177
6178     return WINED3D_OK;
6179 }
6180
6181 static void arbfp_blit_unset(IWineD3DDevice *iface) {
6182     IWineD3DDeviceImpl *device = (IWineD3DDeviceImpl *) iface;
6183
6184     ENTER_GL();
6185     glDisable(GL_FRAGMENT_PROGRAM_ARB);
6186     checkGLcall("glDisable(GL_FRAGMENT_PROGRAM_ARB)");
6187     glDisable(GL_TEXTURE_2D);
6188     checkGLcall("glDisable(GL_TEXTURE_2D)");
6189     if(GL_SUPPORT(ARB_TEXTURE_CUBE_MAP)) {
6190         glDisable(GL_TEXTURE_CUBE_MAP_ARB);
6191         checkGLcall("glDisable(GL_TEXTURE_CUBE_MAP_ARB)");
6192     }
6193     if(GL_SUPPORT(ARB_TEXTURE_RECTANGLE)) {
6194         glDisable(GL_TEXTURE_RECTANGLE_ARB);
6195         checkGLcall("glDisable(GL_TEXTURE_RECTANGLE_ARB)");
6196     }
6197     LEAVE_GL();
6198 }
6199
6200 static BOOL arbfp_blit_color_fixup_supported(struct color_fixup_desc fixup)
6201 {
6202     enum yuv_fixup yuv_fixup;
6203
6204     if (TRACE_ON(d3d_shader) && TRACE_ON(d3d))
6205     {
6206         TRACE("Checking support for fixup:\n");
6207         dump_color_fixup_desc(fixup);
6208     }
6209
6210     if (is_identity_fixup(fixup))
6211     {
6212         TRACE("[OK]\n");
6213         return TRUE;
6214     }
6215
6216     /* We only support YUV conversions. */
6217     if (!is_yuv_fixup(fixup))
6218     {
6219         TRACE("[FAILED]\n");
6220         return FALSE;
6221     }
6222
6223     yuv_fixup = get_yuv_fixup(fixup);
6224     switch(yuv_fixup)
6225     {
6226         case YUV_FIXUP_YUY2:
6227         case YUV_FIXUP_UYVY:
6228         case YUV_FIXUP_YV12:
6229             TRACE("[OK]\n");
6230             return TRUE;
6231
6232         default:
6233             FIXME("Unsupported YUV fixup %#x\n", yuv_fixup);
6234             TRACE("[FAILED]\n");
6235             return FALSE;
6236     }
6237 }
6238
6239 const struct blit_shader arbfp_blit = {
6240     arbfp_blit_alloc,
6241     arbfp_blit_free,
6242     arbfp_blit_set,
6243     arbfp_blit_unset,
6244     arbfp_blit_color_fixup_supported,
6245 };
6246
6247 #undef GLINFO_LOCATION