1 diff -urp libdv-0.104-old/libdv/dct_block_mmx.S libdv-0.104/libdv/dct_block_mmx.S
2 --- libdv-0.104-old/libdv/dct_block_mmx.S 2005-10-23 19:40:58.000000000 +0200
3 +++ libdv-0.104/libdv/dct_block_mmx.S 2005-10-24 00:11:39.000000000 +0200
4 @@ -53,6 +53,17 @@ scratch2: .quad 0
9 +# undef __i686 /* gcc define gets in our way */
10 + .section .gnu.linkonce.t.__i686.get_pc_thunk.bp,"ax",@progbits
11 +.globl __i686.get_pc_thunk.bp
12 + .hidden __i686.get_pc_thunk.bp
13 + .type __i686.get_pc_thunk.bp,@function
14 +__i686.get_pc_thunk.bp:
22 @@ -60,10 +71,14 @@ scratch4: .quad 0
29 - movl 8(%ebp), %esi # source
31 + call __i686.get_pc_thunk.bp
32 + addl $_GLOBAL_OFFSET_TABLE_, %ebp # %ebp is now the base for the @GOTOFF operands below
35 + movl 12(%esp), %esi # source: read via %esp, %ebp holds the GOT base and is not a frame pointer
38 movq 16*0(%esi), %mm0 # v0
39 @@ -86,22 +101,45 @@ _dv_dct_88_block_mmx:
41 movq 16*3(%esi), %mm5 # v3
42 movq 16*4(%esi), %mm7 # v4
44 + movq %mm7, scratch1@GOTOFF(%ebp) # scratch1: v4 ;
46 movq %mm7, scratch1 # scratch1: v4 ;
48 movq %mm5, %mm7 # duplicate v3
50 + paddw scratch1@GOTOFF(%ebp), %mm5 # v03: v3+v4
51 + psubw scratch1@GOTOFF(%ebp), %mm7 # v04: v3-v4
52 + movq %mm5, scratch2@GOTOFF(%ebp) # scratch2: v03
54 paddw scratch1, %mm5 # v03: v3+v4
55 psubw scratch1, %mm7 # v04: v3-v4
56 movq %mm5, scratch2 # scratch2: v03
58 movq %mm0, %mm5 # mm5: v00
61 + paddw scratch2@GOTOFF(%ebp), %mm0 # v10: v00+v03
62 + psubw scratch2@GOTOFF(%ebp), %mm5 # v13: v00-v03
63 + movq %mm3, scratch3@GOTOFF(%ebp) # scratch3: v02
65 paddw scratch2, %mm0 # v10: v00+v03
66 psubw scratch2, %mm5 # v13: v00-v03
67 movq %mm3, scratch3 # scratch3: v02
69 movq %mm1, %mm3 # duplicate v01
72 + paddw scratch3@GOTOFF(%ebp), %mm1 # v11: v01+v02
73 + psubw scratch3@GOTOFF(%ebp), %mm3 # v12: v01-v02
75 + movq %mm6, scratch4@GOTOFF(%ebp) # scratch4: v05
77 paddw scratch3, %mm1 # v11: v01+v02
78 psubw scratch3, %mm3 # v12: v01-v02
80 movq %mm6, scratch4 # scratch4: v05
82 movq %mm0, %mm6 # duplicate v10
84 paddw %mm1, %mm0 # v10+v11
85 @@ -111,10 +149,18 @@ _dv_dct_88_block_mmx:
86 movq %mm6, 16*4(%esi) # out4: v10-v11
88 movq %mm4, %mm0 # mm0: v06
90 + paddw scratch4@GOTOFF(%ebp), %mm4 # v15: v05+v06
92 paddw scratch4, %mm4 # v15: v05+v06
94 paddw %mm2, %mm0 # v16: v07+v06
97 + pmulhw WA3@GOTOFF(%ebp), %mm4 # v35~: WA3*v15
99 pmulhw WA3, %mm4 # v35~: WA3*v15
101 psllw $1, %mm4 # v35: compensate the coeefient scale
103 movq %mm4, %mm6 # duplicate v35
104 @@ -123,7 +169,11 @@ _dv_dct_88_block_mmx:
106 paddw %mm5, %mm3 # v22: v12+v13
109 + pmulhw WA1@GOTOFF(%ebp), %mm3 # v32~: WA1*v22
111 pmulhw WA1, %mm3 # v32~: WA1*v22
113 psllw $16-NSHIFT, %mm3 # v32: compensate the coeefient scale
114 movq %mm5, %mm6 # duplicate v13
116 @@ -134,13 +184,23 @@ _dv_dct_88_block_mmx:
117 movq %mm6, 16*6(%esi) # out6: v13-v32
121 + paddw scratch4@GOTOFF(%ebp), %mm7 # v14n: v04+v05
123 paddw scratch4, %mm7 # v14n: v04+v05
125 movq %mm0, %mm5 # duplicate v16
127 psubw %mm7, %mm0 # va1: v16-v14n
129 + pmulhw WA5@GOTOFF(%ebp), %mm0 # va0~: va1*WA5
130 + pmulhw WA4@GOTOFF(%ebp), %mm5 # v36~~: v16*WA4
131 + pmulhw WA2@GOTOFF(%ebp), %mm7 # v34~~: v14n*WA2
133 pmulhw WA5, %mm0 # va0~: va1*WA5
134 pmulhw WA4, %mm5 # v36~~: v16*WA4
135 pmulhw WA2, %mm7 # v34~~: v14n*WA2
137 psllw $16-WA4_SHIFT, %mm5 # v36: compensate the coeefient scale
138 psllw $16-NSHIFT, %mm7 # v34: compensate the coeefient scale
140 @@ -188,22 +248,45 @@ _dv_dct_88_block_mmx:
142 movq 16*3(%esi), %mm5 # v3
143 movq 16*4(%esi), %mm7 # v4
145 + movq %mm7, scratch1@GOTOFF(%ebp) # scratch1: v4 ;
147 movq %mm7, scratch1 # scratch1: v4 ;
149 movq %mm5, %mm7 # duplicate v3
151 + paddw scratch1@GOTOFF(%ebp), %mm5 # v03: v3+v4
152 + psubw scratch1@GOTOFF(%ebp), %mm7 # v04: v3-v4
153 + movq %mm5, scratch2@GOTOFF(%ebp) # scratch2: v03
155 paddw scratch1, %mm5 # v03: v3+v4
156 psubw scratch1, %mm7 # v04: v3-v4
157 movq %mm5, scratch2 # scratch2: v03
159 movq %mm0, %mm5 # mm5: v00
162 + paddw scratch2@GOTOFF(%ebp), %mm0 # v10: v00+v03
163 + psubw scratch2@GOTOFF(%ebp), %mm5 # v13: v00-v03
164 + movq %mm3, scratch3@GOTOFF(%ebp) # scratch3: v02
166 paddw scratch2, %mm0 # v10: v00+v03
167 psubw scratch2, %mm5 # v13: v00-v03
168 movq %mm3, scratch3 # scratc3: v02
170 movq %mm1, %mm3 # duplicate v01
173 + paddw scratch3@GOTOFF(%ebp), %mm1 # v11: v01+v02
174 + psubw scratch3@GOTOFF(%ebp), %mm3 # v12: v01-v02
176 + movq %mm6, scratch4@GOTOFF(%ebp) # scratch4: v05
178 paddw scratch3, %mm1 # v11: v01+v02
179 psubw scratch3, %mm3 # v12: v01-v02
181 movq %mm6, scratch4 # scratc4: v05
183 movq %mm0, %mm6 # duplicate v10
185 paddw %mm1, %mm0 # v10+v11
186 @@ -213,10 +296,18 @@ _dv_dct_88_block_mmx:
187 movq %mm6, 16*4(%esi) # out4: v10-v11
189 movq %mm4, %mm0 # mm0: v06
191 + paddw scratch4@GOTOFF(%ebp), %mm4 # v15: v05+v06
193 paddw scratch4, %mm4 # v15: v05+v06
195 paddw %mm2, %mm0 # v16: v07+v06
198 + pmulhw WA3@GOTOFF(%ebp), %mm4 # v35~: WA3*v15
200 pmulhw WA3, %mm4 # v35~: WA3*v15
202 psllw $16-NSHIFT, %mm4 # v35: compensate the coeefient scale
204 movq %mm4, %mm6 # duplicate v35
205 @@ -225,7 +316,11 @@ _dv_dct_88_block_mmx:
207 paddw %mm5, %mm3 # v22: v12+v13
210 + pmulhw WA1@GOTOFF(%ebp), %mm3 # v32~: WA1*v22
212 pmulhw WA1, %mm3 # v32~: WA3*v15
214 psllw $16-NSHIFT, %mm3 # v32: compensate the coeefient scale
215 movq %mm5, %mm6 # duplicate v13
217 @@ -235,13 +330,23 @@ _dv_dct_88_block_mmx:
218 movq %mm5, 16*2(%esi) # out2: v13+v32
219 movq %mm6, 16*6(%esi) # out6: v13-v32
222 + paddw scratch4@GOTOFF(%ebp), %mm7 # v14n: v04+v05
224 paddw scratch4, %mm7 # v14n: v04+v05
226 movq %mm0, %mm5 # duplicate v16
228 psubw %mm7, %mm0 # va1: v16-v14n
230 + pmulhw WA2@GOTOFF(%ebp), %mm7 # v34~~: v14n*WA2
231 + pmulhw WA5@GOTOFF(%ebp), %mm0 # va0~: va1*WA5
232 + pmulhw WA4@GOTOFF(%ebp), %mm5 # v36~~: v16*WA4
234 pmulhw WA2, %mm7 # v34~~: v14n*WA2
235 pmulhw WA5, %mm0 # va0~: va1*WA5
236 pmulhw WA4, %mm5 # v36~~: v16*WA4
238 psllw $16-NSHIFT, %mm7
239 psllw $16-WA4_SHIFT, %mm5 # v36: compensate the coeffient
240 # scale note that WA4 is shifted 1 bit less than the others
241 @@ -751,11 +856,15 @@ _dv_dct_block_mmx_postscale_88:
242 _dv_dct_248_block_mmx:
249 - movl 8(%ebp), %esi # source
251 + call __i686.get_pc_thunk.bp
252 + addl $_GLOBAL_OFFSET_TABLE_, %ebp # %ebp is now the base for the @GOTOFF operands below
255 + movl 16(%esp), %esi # source: read via %esp, %ebp holds the GOT base and is not a frame pointer
259 @@ -779,7 +888,11 @@ _dv_dct_248_block_mmx:
260 paddw %mm1, %mm0 # v20: v10+v11
261 psubw %mm1, %mm3 # v21: v10-v11
264 + pmulhw WA1@GOTOFF(%ebp), %mm5 # v32~: WA1*v22
266 pmulhw WA1, %mm5 # v32~: WA1*v22
269 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale
271 @@ -818,7 +931,11 @@ _dv_dct_248_block_mmx:
272 paddw %mm1, %mm0 # v20: v10+v11
273 psubw %mm1, %mm3 # v21: v10-v11
276 + pmulhw WA1@GOTOFF(%ebp), %mm5 # v32~: WA1*v22
278 pmulhw WA1, %mm5 # v32~: WA1*v22
281 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale
283 @@ -855,7 +972,11 @@ _dv_dct_248_block_mmx:
284 paddw %mm1, %mm0 # v20: v10+v11
285 psubw %mm1, %mm3 # v21: v10-v11
288 + pmulhw WA1@GOTOFF(%ebp), %mm5 # v32~: WA1*v22
290 pmulhw WA1, %mm5 # v32~: WA1*v22
293 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale
295 @@ -892,7 +1013,11 @@ _dv_dct_248_block_mmx:
296 paddw %mm1, %mm0 # v20: v10+v11
297 psubw %mm1, %mm3 # v21: v10-v11
300 + pmulhw WA1@GOTOFF(%ebp), %mm5 # v32~: WA1*v22
302 pmulhw WA1, %mm5 # v32~: WA1*v22
305 psllw $16-NSHIFT, %mm5 # v32: compensate the coeffient scale
307 diff -urp libdv-0.104-old/libdv/dv.c libdv-0.104/libdv/dv.c
308 --- libdv-0.104-old/libdv/dv.c 2004-10-20 05:49:24.000000000 +0200
309 +++ libdv-0.104/libdv/dv.c 2005-10-24 00:59:57.000000000 +0200
310 @@ -205,6 +205,9 @@ dv_reconfigure(int clamp_luma, int clamp
311 } /* dv_reconfigure */
314 +extern uint8_t dv_quant_offset[4];
315 +extern uint8_t dv_quant_shifts[22][4];
318 dv_decode_macroblock(dv_decoder_t *dv, dv_macroblock_t *mb, unsigned int quality) {
320 @@ -218,7 +221,7 @@ dv_decode_macroblock(dv_decoder_t *dv, d
321 dv_idct_248 (co248, mb->b[i].coeffs);
324 - _dv_quant_88_inverse_x86(mb->b[i].coeffs,mb->qno,mb->b[i].class_no);
325 + _dv_quant_88_inverse_x86(mb->b[i].coeffs,mb->qno,mb->b[i].class_no,dv_quant_offset,dv_quant_shifts);
326 _dv_idct_88(mb->b[i].coeffs);
328 _dv_quant_88_inverse_x86_64(mb->b[i].coeffs,mb->qno,mb->b[i].class_no);
329 @@ -250,7 +253,7 @@ dv_decode_video_segment(dv_decoder_t *dv
330 dv_idct_248 (co248, mb->b[b].coeffs);
333 - _dv_quant_88_inverse_x86(bl->coeffs,mb->qno,bl->class_no);
334 + _dv_quant_88_inverse_x86(bl->coeffs,mb->qno,bl->class_no,dv_quant_offset,dv_quant_shifts);
335 _dv_weight_88_inverse(bl->coeffs);
336 _dv_idct_88(bl->coeffs);
338 diff -urp libdv-0.104-old/libdv/encode.c libdv-0.104/libdv/encode.c
339 --- libdv-0.104-old/libdv/encode.c 2004-11-17 04:36:30.000000000 +0100
340 +++ libdv-0.104/libdv/encode.c 2005-10-24 01:17:41.000000000 +0200
341 @@ -521,7 +521,8 @@ static void reorder_block(dv_block_t *bl
344 extern unsigned long _dv_vlc_encode_block_mmx(dv_coeff_t* coeffs,
345 - dv_vlc_entry_t ** out);
346 + dv_vlc_entry_t ** out,
347 + dv_vlc_entry_t * lookup);
349 extern unsigned long _dv_vlc_encode_block_mmx_x86_64(dv_coeff_t* coeffs,
350 dv_vlc_entry_t ** out);
351 @@ -558,7 +559,7 @@ static unsigned long vlc_encode_block(dv
355 - num_bits = _dv_vlc_encode_block_mmx(coeffs, &o);
356 + num_bits = _dv_vlc_encode_block_mmx(coeffs, &o, vlc_encode_lookup);
360 @@ -574,7 +575,7 @@ static unsigned long vlc_encode_block(dv
364 -extern unsigned long _dv_vlc_num_bits_block_x86(dv_coeff_t* coeffs);
365 +extern unsigned long _dv_vlc_num_bits_block_x86(dv_coeff_t* coeffs, unsigned char* lookup);
366 extern unsigned long _dv_vlc_num_bits_block_x86_64(dv_coeff_t* coeffs);
368 extern unsigned long _dv_vlc_num_bits_block(dv_coeff_t* coeffs)
369 @@ -600,7 +601,7 @@ extern unsigned long _dv_vlc_num_bits_bl
371 return _dv_vlc_num_bits_block_x86_64(coeffs);
373 - return _dv_vlc_num_bits_block_x86(coeffs);
374 + return _dv_vlc_num_bits_block_x86(coeffs, vlc_num_bits_lookup);
378 diff -urp libdv-0.104-old/libdv/encode_x86.S libdv-0.104/libdv/encode_x86.S
379 --- libdv-0.104-old/libdv/encode_x86.S 2005-10-23 19:40:58.000000000 +0200
380 +++ libdv-0.104/libdv/encode_x86.S 2005-10-24 01:18:32.000000000 +0200
382 * The libdv homepage is http://libdv.sourceforge.net/.
386 -ALLONE: .word 1,1,1,1
387 -VLCADDMASK: .byte 255,0,0,0,255,0,0,0
391 .global _dv_vlc_encode_block_mmx
392 @@ -45,11 +41,14 @@ _dv_vlc_encode_block_mmx:
396 - movl vlc_encode_lookup, %esi
397 + movl 4+4*4+8(%esp), %esi # vlc_encode_lookup
401 - movq VLCADDMASK, %mm1
408 vlc_encode_block_mmx_loop:
409 @@ -121,7 +120,7 @@ _dv_vlc_num_bits_block_x86:
413 - movl vlc_num_bits_lookup, %esi
414 + movl 4+4*4+4(%esp), %esi # vlc_num_bits_lookup
416 vlc_num_bits_block_x86_loop:
418 @@ -579,8 +578,11 @@ _dv_need_dct_248_mmx_rows:
423 - pmaddwd ALLONE, %mm0
427 + pmaddwd (%esp), %mm0
432 diff -urp libdv-0.104-old/libdv/idct_block_mmx.S libdv-0.104/libdv/idct_block_mmx.S
433 --- libdv-0.104-old/libdv/idct_block_mmx.S 2005-10-23 19:40:58.000000000 +0200
434 +++ libdv-0.104/libdv/idct_block_mmx.S 2005-10-24 01:12:12.000000000 +0200
440 +# undef __i686 /* gcc define gets in our way */
441 + .section .gnu.linkonce.t.__i686.get_pc_thunk.bp,"ax",@progbits
442 +.globl __i686.get_pc_thunk.bp
443 + .hidden __i686.get_pc_thunk.bp
444 + .type __i686.get_pc_thunk.bp,@function
445 +__i686.get_pc_thunk.bp:
453 .globl _dv_idct_block_mmx
454 .type _dv_idct_block_mmx,@function
461 + call __i686.get_pc_thunk.bp
462 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
466 + leal preSC@GOTOFF(%ebp), %ecx
469 - movl 8(%ebp),%esi /* source matrix */
471 + movl 12(%esp),%esi /* source matrix */
474 * column 0: even part
475 @@ -35,7 +56,11 @@ _dv_idct_block_mmx:
476 movq %mm1, %mm2 /* added 11/1/96 */
477 pmulhw 8*8(%esi),%mm5 /* V8 */
478 psubsw %mm0, %mm1 /* V16 */
480 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm1 /* 23170 ->V18 */
482 pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V18 */
484 paddsw %mm0, %mm2 /* V17 */
485 movq %mm2, %mm0 /* duplicate V17 */
486 psraw $1, %mm2 /* t75=t82 */
487 @@ -76,7 +101,11 @@ _dv_idct_block_mmx:
488 paddsw %mm0, %mm3 /* V29 ; free mm0 */
489 movq %mm7, %mm1 /* duplicate V26 */
490 psraw $1, %mm3 /* t91=t94 */
492 + pmulhw x539f539f539f539f@GOTOFF(%ebp),%mm7 /* V33 */
494 pmulhw x539f539f539f539f,%mm7 /* V33 */
496 psraw $1, %mm1 /* t96 */
497 movq %mm5, %mm0 /* duplicate V2 */
498 psraw $2, %mm4 /* t85=t87 */
499 @@ -84,15 +113,27 @@ _dv_idct_block_mmx:
500 psubsw %mm4, %mm0 /* V28 ; free mm4 */
501 movq %mm0, %mm2 /* duplicate V28 */
502 psraw $1, %mm5 /* t90=t93 */
504 + pmulhw x4546454645464546@GOTOFF(%ebp),%mm0 /* V35 */
506 pmulhw x4546454645464546,%mm0 /* V35 */
508 psraw $1, %mm2 /* t97 */
509 movq %mm5, %mm4 /* duplicate t90=t93 */
510 psubsw %mm2, %mm1 /* V32 ; free mm2 */
512 + pmulhw x61f861f861f861f8@GOTOFF(%ebp),%mm1 /* V36 */
514 pmulhw x61f861f861f861f8,%mm1 /* V36 */
516 psllw $1, %mm7 /* t107 */
517 paddsw %mm3, %mm5 /* V31 */
518 psubsw %mm3, %mm4 /* V30 ; free mm3 */
520 + pmulhw x5a825a825a825a82@GOTOFF(%ebp),%mm4 /* V34 */
522 pmulhw x5a825a825a825a82,%mm4 /* V34 */
525 psubsw %mm1, %mm0 /* V38 */
526 psubsw %mm7, %mm1 /* V37 ; free mm7 */
527 @@ -159,7 +200,11 @@ _dv_idct_block_mmx:
528 psubsw %mm7, %mm1 /* V50 */
529 pmulhw 8*9(%esi), %mm5 /* V9 */
530 paddsw %mm7, %mm2 /* V51 */
532 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm1 /* 23170 ->V52 */
534 pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V52 */
536 movq %mm2, %mm6 /* duplicate V51 */
537 psraw $1, %mm2 /* t138=t144 */
538 movq %mm3, %mm4 /* duplicate V1 */
539 @@ -200,11 +245,19 @@ _dv_idct_block_mmx:
540 * even more by doing the correction step in a later stage when the number
541 * is actually multiplied by 16
544 + paddw x0005000200010001@GOTOFF(%ebp), %mm4
546 paddw x0005000200010001, %mm4
548 psubsw %mm6, %mm3 /* V60 ; free mm6 */
549 psraw $1, %mm0 /* t154=t156 */
550 movq %mm3, %mm1 /* duplicate V60 */
552 + pmulhw x539f539f539f539f@GOTOFF(%ebp), %mm1 /* V67 */
554 pmulhw x539f539f539f539f, %mm1 /* V67 */
556 movq %mm5, %mm6 /* duplicate V3 */
557 psraw $2, %mm4 /* t148=t150 */
558 paddsw %mm4, %mm5 /* V61 */
559 @@ -213,13 +266,25 @@ _dv_idct_block_mmx:
560 psllw $1, %mm1 /* t169 */
561 paddsw %mm0, %mm5 /* V65 -> result */
562 psubsw %mm0, %mm4 /* V64 ; free mm0 */
564 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm4 /* V68 */
566 pmulhw x5a825a825a825a82, %mm4 /* V68 */
568 psraw $1, %mm3 /* t158 */
569 psubsw %mm6, %mm3 /* V66 */
570 movq %mm5, %mm2 /* duplicate V65 */
572 + pmulhw x61f861f861f861f8@GOTOFF(%ebp), %mm3 /* V70 */
574 pmulhw x61f861f861f861f8, %mm3 /* V70 */
576 psllw $1, %mm6 /* t165 */
578 + pmulhw x4546454645464546@GOTOFF(%ebp), %mm6 /* V69 */
580 pmulhw x4546454645464546, %mm6 /* V69 */
582 psraw $1, %mm2 /* t172 */
583 /* moved from next block */
584 movq 8*5(%esi), %mm0 /* V56 */
585 @@ -344,7 +409,11 @@ _dv_idct_block_mmx:
586 * movq 8*13(%esi), %mm4 tmt13
588 psubsw %mm4, %mm3 /* V134 */
590 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm3 /* 23170 ->V136 */
592 pmulhw x5a825a825a825a82, %mm3 /* 23170 ->V136 */
594 movq 8*9(%esi), %mm6 /* tmt9 */
595 paddsw %mm4, %mm5 /* V135 ; mm4 free */
596 movq %mm0, %mm4 /* duplicate tmt1 */
597 @@ -373,17 +442,33 @@ _dv_idct_block_mmx:
598 psubsw %mm7, %mm0 /* V144 */
599 movq %mm0, %mm3 /* duplicate V144 */
600 paddsw %mm7, %mm2 /* V147 ; free mm7 */
602 + pmulhw x539f539f539f539f@GOTOFF(%ebp), %mm0 /* 21407-> V151 */
604 pmulhw x539f539f539f539f, %mm0 /* 21407-> V151 */
606 movq %mm1, %mm7 /* duplicate tmt3 */
607 paddsw %mm5, %mm7 /* V145 */
608 psubsw %mm5, %mm1 /* V146 ; free mm5 */
609 psubsw %mm1, %mm3 /* V150 */
610 movq %mm7, %mm5 /* duplicate V145 */
612 + pmulhw x4546454645464546@GOTOFF(%ebp), %mm1 /* 17734-> V153 */
614 pmulhw x4546454645464546, %mm1 /* 17734-> V153 */
616 psubsw %mm2, %mm5 /* V148 */
618 + pmulhw x61f861f861f861f8@GOTOFF(%ebp), %mm3 /* 25080-> V154 */
620 pmulhw x61f861f861f861f8, %mm3 /* 25080-> V154 */
622 psllw $2, %mm0 /* t311 */
624 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm5 /* 23170-> V152 */
626 pmulhw x5a825a825a825a82, %mm5 /* 23170-> V152 */
628 paddsw %mm2, %mm7 /* V149 ; free mm2 */
629 psllw $1, %mm1 /* t313 */
630 nop /* without the nop - freeze here for one clock */
631 @@ -409,7 +494,11 @@ _dv_idct_block_mmx:
632 paddsw %mm3, %mm6 /* V164 ; free mm3 */
633 movq %mm4, %mm3 /* duplicate V142 */
634 psubsw %mm5, %mm4 /* V165 ; free mm5 */
636 + movq %mm2, scratch7@GOTOFF(%ebp) /* out7 */
638 movq %mm2, scratch7 /* out7 */
642 paddsw %mm5, %mm3 /* V162 */
643 @@ -420,11 +509,19 @@ _dv_idct_block_mmx:
645 movq %mm6, 8*9(%esi) /* out9 */
646 paddsw %mm1, %mm0 /* V161 */
648 + movq %mm3, scratch5@GOTOFF(%ebp) /* out5 */
650 movq %mm3, scratch5 /* out5 */
652 psubsw %mm1, %mm5 /* V166 ; free mm1 */
653 movq %mm4, 8*11(%esi) /* out11 */
656 + movq %mm0, scratch3@GOTOFF(%ebp) /* out3 */
658 movq %mm0, scratch3 /* out3 */
660 movq %mm2, %mm4 /* duplicate V140 */
661 movq %mm5, 8*13(%esi) /* out13 */
662 paddsw %mm7, %mm2 /* V160 */
663 @@ -434,7 +531,11 @@ _dv_idct_block_mmx:
664 /* moved from the next block */
668 + movq %mm2, scratch1@GOTOFF(%ebp) /* out1 */
670 movq %mm2, scratch1 /* out1 */
672 /* moved from the next block */
674 movq %mm4, 8*15(%esi) /* out15 */
675 @@ -491,15 +592,31 @@ _dv_idct_block_mmx:
676 paddsw %mm4, %mm3 /* V113 ; free mm4 */
677 movq %mm0, %mm4 /* duplicate V110 */
678 paddsw %mm1, %mm2 /* V111 */
680 + pmulhw x539f539f539f539f@GOTOFF(%ebp), %mm0 /* 21407-> V117 */
682 pmulhw x539f539f539f539f, %mm0 /* 21407-> V117 */
684 psubsw %mm1, %mm5 /* V112 ; free mm1 */
685 psubsw %mm5, %mm4 /* V116 */
686 movq %mm2, %mm1 /* duplicate V111 */
688 + pmulhw x4546454645464546@GOTOFF(%ebp), %mm5 /* 17734-> V119 */
690 pmulhw x4546454645464546, %mm5 /* 17734-> V119 */
692 psubsw %mm3, %mm2 /* V114 */
694 + pmulhw x61f861f861f861f8@GOTOFF(%ebp), %mm4 /* 25080-> V120 */
696 pmulhw x61f861f861f861f8, %mm4 /* 25080-> V120 */
698 paddsw %mm3, %mm1 /* V115 ; free mm3 */
700 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm2 /* 23170-> V118 */
702 pmulhw x5a825a825a825a82, %mm2 /* 23170-> V118 */
704 psllw $2, %mm0 /* t266 */
705 movq %mm1, (%esi) /* save V115 */
706 psllw $1, %mm5 /* t268 */
707 @@ -517,7 +634,11 @@ _dv_idct_block_mmx:
708 movq %mm6, %mm3 /* duplicate tmt4 */
709 psubsw %mm0, %mm6 /* V100 */
710 paddsw %mm0, %mm3 /* V101 ; free mm0 */
712 + pmulhw x5a825a825a825a82@GOTOFF(%ebp), %mm6 /* 23170 ->V102 */
714 pmulhw x5a825a825a825a82, %mm6 /* 23170 ->V102 */
716 movq %mm7, %mm5 /* duplicate tmt0 */
717 movq 8*8(%esi), %mm1 /* tmt8 */
718 paddsw %mm1, %mm7 /* V103 */
719 @@ -551,10 +672,18 @@ _dv_idct_block_mmx:
720 movq 8*2(%esi), %mm3 /* V123 */
721 paddsw %mm4, %mm7 /* out0 */
722 /* moved up from next block */
724 + movq scratch3@GOTOFF(%ebp), %mm0
729 /* moved up from next block */
731 + movq scratch5@GOTOFF(%ebp), %mm6
735 psubsw %mm4, %mm1 /* out14 ; free mm4 */
736 paddsw %mm3, %mm5 /* out2 */
738 @@ -565,7 +694,11 @@ _dv_idct_block_mmx:
739 movq %mm5, 8*2(%esi) /* out2 ; free mm5 */
741 /* moved up to the prev block */
743 + movq scratch7@GOTOFF(%ebp), %mm4
747 /* moved up to the prev block */
749 movq %mm2, 8*12(%esi) /* out12 ; free mm2 */
750 @@ -579,7 +712,11 @@ _dv_idct_block_mmx:
755 + movq scratch1@GOTOFF(%ebp), %mm1
760 movq %mm0, 8*3(%esi) /* out3 */
762 diff -urp libdv-0.104-old/libdv/quant.c libdv-0.104/libdv/quant.c
763 --- libdv-0.104-old/libdv/quant.c 2004-10-20 05:49:24.000000000 +0200
764 +++ libdv-0.104/libdv/quant.c 2005-10-24 01:06:24.000000000 +0200
765 @@ -144,7 +144,7 @@ uint8_t dv_quant_offset[4] = { 6,3,0,1
766 uint32_t dv_quant_248_mul_tab [2] [22] [64];
767 uint32_t dv_quant_88_mul_tab [2] [22] [64];
769 -extern void _dv_quant_x86(dv_coeff_t *block,int qno,int klass);
770 +extern void _dv_quant_x86(dv_coeff_t *block,int qno,int klass,uint8_t dv_quant_offset[],uint8_t dv_quant_shifts[][4]); /* tables are passed in so the PIC asm needs no absolute symbol references; inner bound must be given ([][] is not valid C) */
771 extern void _dv_quant_x86_64(dv_coeff_t *block,int qno,int klass);
772 static void quant_248_inverse_std(dv_coeff_t *block,int qno,int klass,dv_248_coeff_t *co);
773 static void quant_248_inverse_mmx(dv_coeff_t *block,int qno,int klass,dv_248_coeff_t *co);
774 @@ -210,7 +210,7 @@ void _dv_quant(dv_coeff_t *block,int qno
775 _dv_quant_x86_64(block, qno, klass);
778 - _dv_quant_x86(block, qno, klass);
779 + _dv_quant_x86(block, qno, klass, dv_quant_offset, dv_quant_shifts);
783 diff -urp libdv-0.104-old/libdv/quant.h libdv-0.104/libdv/quant.h
784 --- libdv-0.104-old/libdv/quant.h 2004-10-20 05:49:24.000000000 +0200
785 +++ libdv-0.104/libdv/quant.h 2005-10-24 00:57:43.000000000 +0200
786 @@ -27,7 +27,7 @@ extern void _dv_quant(dv_coeff_t *block,
787 extern void _dv_quant_88_inverse(dv_coeff_t *block,int qno,int klass);
788 extern void (*_dv_quant_248_inverse) (dv_coeff_t *block,int qno,int klass,
790 -extern void _dv_quant_88_inverse_x86(dv_coeff_t *block,int qno,int klass);
791 +extern void _dv_quant_88_inverse_x86(dv_coeff_t *block,int qno,int klass, uint8_t offset[], uint8_t shifts[][4]); /* offset/shifts: quantizer tables handed to the asm routine; inner bound must be given ([][] is not valid C) */
792 extern void _dv_quant_88_inverse_x86_64(dv_coeff_t *block,int qno,int klass);
793 extern void dv_quant_init (void);
795 diff -urp libdv-0.104-old/libdv/quant_x86.S libdv-0.104/libdv/quant_x86.S
796 --- libdv-0.104-old/libdv/quant_x86.S 2005-10-23 19:40:58.000000000 +0200
797 +++ libdv-0.104/libdv/quant_x86.S 2005-10-24 01:10:21.000000000 +0200
798 @@ -71,10 +71,13 @@ _dv_quant_88_inverse_x86:
800 /* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */
801 movl ARGn(1),%eax /* qno */
802 + movl ARGn(3),%ebx /* dv_quant_offset */
803 + addl ARGn(2),%ebx /* class */
805 movl ARGn(2),%ebx /* class */
806 - movzbl dv_quant_offset(%ebx),%ecx
808 - leal dv_quant_shifts(,%eax,4),%edx /* edx is pq */
809 + movl ARGn(4),%edx /* dv_quant_shifts */
810 + leal (%edx,%eax,4),%edx /* edx is pq */
812 /* extra = (class == 3); */
814 @@ -212,11 +215,13 @@ _dv_quant_x86:
816 /* pq = dv_quant_shifts[qno + dv_quant_offset[class]]; */
817 movl ARGn(1),%eax /* qno */
818 + movl ARGn(3),%ebx /* offset */
819 + addl ARGn(2),%ebx /* class */
821 movl ARGn(2),%ebx /* class */
823 - movzbl dv_quant_offset(%ebx),%ecx
824 + movl ARGn(4),%edx /* shifts */
826 - leal dv_quant_shifts(,%eax,4),%edx /* edx is pq */
827 + leal (%edx,%eax,4),%edx /* edx is pq */
829 /* extra = (class == 3); */
831 diff -urp libdv-0.104-old/libdv/rgbtoyuv.S libdv-0.104/libdv/rgbtoyuv.S
832 --- libdv-0.104-old/libdv/rgbtoyuv.S 2005-10-23 19:40:58.000000000 +0200
833 +++ libdv-0.104/libdv/rgbtoyuv.S 2005-10-24 00:46:34.000000000 +0200
834 @@ -110,20 +110,30 @@ VR0GR: .long 0,0
841 +# undef __i686 /* gcc define gets in our way */
842 + .section .gnu.linkonce.t.__i686.get_pc_thunk.bp,"ax",@progbits
843 +.globl __i686.get_pc_thunk.bp
844 + .hidden __i686.get_pc_thunk.bp
845 + .type __i686.get_pc_thunk.bp,@function
846 +__i686.get_pc_thunk.bp:
861 +#define _columns 24+16
862 +#define _outyPtr 24+20
863 +#define _outuPtr 24+24
864 +#define _outvPtr 24+28
873 @@ -131,46 +141,103 @@ _dv_rgbtoycb_mmx:
878 + call __i686.get_pc_thunk.bp
879 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
883 + leal ZEROSX@GOTOFF(%ebp), %eax #This section gets around a bug
885 leal ZEROSX, %eax #This section gets around a bug
887 movq (%eax), %mm0 #unlikely to persist
889 + movq %mm0, ZEROS@GOTOFF(%ebp)
890 + leal OFFSETDX@GOTOFF(%ebp), %eax
897 + movq %mm0, OFFSETD@GOTOFF(%ebp)
898 + leal OFFSETWX@GOTOFF(%ebp), %eax
905 + movq %mm0, OFFSETW@GOTOFF(%ebp)
906 + leal OFFSETBX@GOTOFF(%ebp), %eax
913 + movq %mm0, OFFSETB@GOTOFF(%ebp)
914 + leal YR0GRX@GOTOFF(%ebp), %eax
921 + movq %mm0, YR0GR@GOTOFF(%ebp)
922 + leal YBG0BX@GOTOFF(%ebp), %eax
929 + movq %mm0, YBG0B@GOTOFF(%ebp)
930 + leal UR0GRX@GOTOFF(%ebp), %eax
937 + movq %mm0, UR0GR@GOTOFF(%ebp)
938 + leal UBG0BX@GOTOFF(%ebp), %eax
945 + movq %mm0, UBG0B@GOTOFF(%ebp)
946 + leal VR0GRX@GOTOFF(%ebp), %eax
953 + movq %mm0, VR0GR@GOTOFF(%ebp)
954 + leal VBG0BX@GOTOFF(%ebp), %eax
961 + movq %mm0, VBG0B@GOTOFF(%ebp)
965 - movl _rows(%ebp), %eax
966 - movl _columns(%ebp), %ebx
968 + movl _rows(%esp), %eax
969 + movl _columns(%esp), %ebx
970 mull %ebx #number pixels
971 shrl $3, %eax #number of loops
972 movl %eax, %edi #loop counter in edi
973 - movl _inPtr(%ebp), %eax
974 - movl _outyPtr(%ebp), %ebx
975 - movl _outuPtr(%ebp), %ecx
976 - movl _outvPtr(%ebp), %edx
977 + movl _inPtr(%esp), %eax
978 + movl _outyPtr(%esp), %ebx
979 + movl _outuPtr(%esp), %ecx
980 + movl _outvPtr(%esp), %edx
982 movq (%eax), %mm1 #load G2R2B1G1R1B0G0R0
983 pxor %mm6, %mm6 #0 -> mm6
984 @@ -184,29 +251,57 @@ rgbtoycb_mmx_loop:
985 punpcklbw %mm6, %mm1 #B1G1R1B0 -> mm1
986 movq %mm0, %mm2 #R1B0G0R0 -> mm2
989 + pmaddwd YR0GR@GOTOFF(%ebp), %mm0 #yrR1,ygG0+yrR0 -> mm0
991 pmaddwd YR0GR, %mm0 #yrR1,ygG0+yrR0 -> mm0
993 movq %mm1, %mm3 #B1G1R1B0 -> mm3
996 + pmaddwd YBG0B@GOTOFF(%ebp), %mm1 #ybB1+ygG1,ybB0 -> mm1
998 pmaddwd YBG0B, %mm1 #ybB1+ygG1,ybB0 -> mm1
1000 movq %mm2, %mm4 #R1B0G0R0 -> mm4
1003 + pmaddwd UR0GR@GOTOFF(%ebp), %mm2 #urR1,ugG0+urR0 -> mm2
1005 pmaddwd UR0GR, %mm2 #urR1,ugG0+urR0 -> mm2
1007 movq %mm3, %mm5 #B1G1R1B0 -> mm5
1010 + pmaddwd UBG0B@GOTOFF(%ebp), %mm3 #ubB1+ugG1,ubB0 -> mm3
1012 pmaddwd UBG0B, %mm3 #ubB1+ugG1,ubB0 -> mm3
1014 punpckhbw %mm6, %mm7 # 00G2R2 -> mm7
1017 + pmaddwd VR0GR@GOTOFF(%ebp), %mm4 #vrR1,vgG0+vrR0 -> mm4
1019 pmaddwd VR0GR, %mm4 #vrR1,vgG0+vrR0 -> mm4
1021 paddd %mm1, %mm0 #Y1Y0 -> mm0
1024 + pmaddwd VBG0B@GOTOFF(%ebp), %mm5 #vbB1+vgG1,vbB0 -> mm5
1026 pmaddwd VBG0B, %mm5 #vbB1+vgG1,vbB0 -> mm5
1029 movq 8(%eax), %mm1 #R5B4G4R4B3G3R3B2 -> mm1
1030 paddd %mm3, %mm2 #U1U0 -> mm2
1032 movq %mm1, %mm6 #R5B4G4R4B3G3R3B2 -> mm6
1035 + punpcklbw ZEROS@GOTOFF(%ebp), %mm1 #B3G3R3B2 -> mm1
1037 punpcklbw ZEROS, %mm1 #B3G3R3B2 -> mm1
1039 paddd %mm5, %mm4 #V1V0 -> mm4
1041 movq %mm1, %mm5 #B3G3R3B2 -> mm5
1042 @@ -214,29 +309,61 @@ rgbtoycb_mmx_loop:
1044 paddd %mm7, %mm1 #R3B200+00G2R2=R3B2G2R2->mm1
1047 + punpckhbw ZEROS@GOTOFF(%ebp), %mm6 #R5B4G4R3 -> mm6
1049 punpckhbw ZEROS, %mm6 #R5B4G4R3 -> mm6
1051 movq %mm1, %mm3 #R3B2G2R2 -> mm3
1054 + pmaddwd YR0GR@GOTOFF(%ebp), %mm1 #yrR3,ygG2+yrR2 -> mm1
1056 pmaddwd YR0GR, %mm1 #yrR3,ygG2+yrR2 -> mm1
1058 movq %mm5, %mm7 #B3G3R3B2 -> mm7
1061 + pmaddwd YBG0B@GOTOFF(%ebp), %mm5 #ybB3+ygG3,ybB2 -> mm5
1063 pmaddwd YBG0B, %mm5 #ybB3+ygG3,ybB2 -> mm5
1065 psrad $FIXPSHIFT, %mm0 #32-bit scaled Y1Y0 -> mm0
1068 + movq %mm6, TEMP0@GOTOFF(%ebp) #R5B4G4R4 -> TEMP0
1070 movq %mm6, TEMP0 #R5B4G4R4 -> TEMP0
1072 movq %mm3, %mm6 #R3B2G2R2 -> mm6
1074 + pmaddwd UR0GR@GOTOFF(%ebp), %mm6 #urR3,ugG2+urR2 -> mm6
1076 pmaddwd UR0GR, %mm6 #urR3,ugG2+urR2 -> mm6
1078 psrad $FIXPSHIFT, %mm2 #32-bit scaled U1U0 -> mm2
1080 paddd %mm5, %mm1 #Y3Y2 -> mm1
1081 movq %mm7, %mm5 #B3G3R3B2 -> mm5
1083 + pmaddwd UBG0B@GOTOFF(%ebp), %mm7 #ubB3+ugG3,ubB2
1085 pmaddwd UBG0B, %mm7 #ubB3+ugG3,ubB2
1087 psrad $FIXPSHIFT, %mm1 #32-bit scaled Y3Y2 -> mm1
1090 + pmaddwd VR0GR@GOTOFF(%ebp), %mm3 #vrR3,vgG2+vgR2
1092 pmaddwd VR0GR, %mm3 #vrR3,vgG2+vgR2
1094 packssdw %mm1, %mm0 #Y3Y2Y1Y0 -> mm0
1097 + pmaddwd VBG0B@GOTOFF(%ebp), %mm5 #vbB3+vgG3,vbB2 -> mm5
1099 pmaddwd VBG0B, %mm5 #vbB3+vgG3,vbB2 -> mm5
1101 psrad $FIXPSHIFT, %mm4 #32-bit scaled V1V0 -> mm4
1103 movq 16(%eax), %mm1 #B7G7R7B6G6R6B5G5 -> mm7
1104 @@ -251,58 +378,114 @@ rgbtoycb_mmx_loop:
1105 movq %mm7, %mm5 #R7B6G6R6B5G500 -> mm5
1106 psrad $FIXPSHIFT, %mm3 #32-bit scaled V3V2 -> mm3
1109 + paddw OFFSETY@GOTOFF(%ebp), %mm0
1113 movq %mm0, (%ebx) #store Y3Y2Y1Y0
1114 packssdw %mm6, %mm2 #32-bit scaled U3U2U1U0 -> mm2
1117 + movq TEMP0@GOTOFF(%ebp), %mm0 #R5B4G4R4 -> mm0
1119 movq TEMP0, %mm0 #R5B4G4R4 -> mm0
1125 + punpcklbw ZEROS@GOTOFF(%ebp), %mm7 #B5G500 -> mm7
1127 punpcklbw ZEROS, %mm7 #B5G500 -> mm7
1129 movq %mm0, %mm6 #R5B4G4R4 -> mm6
1132 + movq %mm2, TEMPU@GOTOFF(%ebp) #32-bit scaled U3U2U1U0 -> TEMPU
1134 movq %mm2, TEMPU #32-bit scaled U3U2U1U0 -> TEMPU
1136 psrlq $32, %mm0 #00R5B4 -> mm0
1138 paddw %mm0, %mm7 #B5G5R5B4 -> mm7
1139 movq %mm6, %mm2 #B5B4G4R4 -> mm2
1142 + pmaddwd YR0GR@GOTOFF(%ebp), %mm2 #yrR5,ygG4+yrR4 -> mm2
1144 pmaddwd YR0GR, %mm2 #yrR5,ygG4+yrR4 -> mm2
1146 movq %mm7, %mm0 #B5G5R5B4 -> mm0
1149 + pmaddwd YBG0B@GOTOFF(%ebp), %mm7 #ybB5+ygG5,ybB4 -> mm7
1151 pmaddwd YBG0B, %mm7 #ybB5+ygG5,ybB4 -> mm7
1153 packssdw %mm3, %mm4 #32-bit scaled V3V2V1V0 -> mm4
1155 addl $24, %eax #increment RGB count
1158 + movq %mm4, TEMPV@GOTOFF(%ebp) #(V3V2V1V0)/256 -> mm4
1160 movq %mm4, TEMPV #(V3V2V1V0)/256 -> mm4
1162 movq %mm6, %mm4 #B5B4G4R4 -> mm4
1165 + pmaddwd UR0GR@GOTOFF(%ebp), %mm6 #urR5,ugG4+urR4
1167 pmaddwd UR0GR, %mm6 #urR5,ugG4+urR4
1169 movq %mm0, %mm3 #B5G5R5B4 -> mm0
1172 + pmaddwd UBG0B@GOTOFF(%ebp), %mm0 #ubB5+ugG5,ubB4
1174 pmaddwd UBG0B, %mm0 #ubB5+ugG5,ubB4
1176 paddd %mm7, %mm2 #Y5Y4 -> mm2
1179 + pmaddwd VR0GR@GOTOFF(%ebp), %mm4 #vrR5,vgG4+vrR4 -> mm4
1181 pmaddwd VR0GR, %mm4 #vrR5,vgG4+vrR4 -> mm4
1183 pxor %mm7, %mm7 #0 -> mm7
1186 + pmaddwd VBG0B@GOTOFF(%ebp), %mm3 #vbB5+vgG5,vbB4 -> mm3
1188 pmaddwd VBG0B, %mm3 #vbB5+vgG5,vbB4 -> mm3
1190 punpckhbw %mm7, %mm1 #B7G7R7B6 -> mm1
1192 paddd %mm6, %mm0 #U5U4 -> mm0
1193 movq %mm1, %mm6 #B7G7R7B6 -> mm6
1196 + pmaddwd YBG0B@GOTOFF(%ebp), %mm6 #ybB7+ygG7,ybB6 -> mm6
1198 pmaddwd YBG0B, %mm6 #ybB7+ygG7,ybB6 -> mm6
1200 punpckhbw %mm7, %mm5 #R7B6G6R6 -> mm5
1202 movq %mm5, %mm7 #R7B6G6R6 -> mm7
1203 paddd %mm4, %mm3 #V5V4 -> mm3
1206 + pmaddwd YR0GR@GOTOFF(%ebp), %mm5 #yrR7,ygG6+yrR6 -> mm5
1208 pmaddwd YR0GR, %mm5 #yrR7,ygG6+yrR6 -> mm5
1210 movq %mm1, %mm4 #B7G7R7B6 -> mm4
1213 + pmaddwd UBG0B@GOTOFF(%ebp), %mm4 #ubB7+ugG7,ubB6 -> mm4
1215 pmaddwd UBG0B, %mm4 #ubB7+ugG7,ubB6 -> mm4
1217 psrad $FIXPSHIFT, %mm0 #32-bit scaled U5U4 -> mm0
1219 psrad $FIXPSHIFT, %mm2 #32-bit scaled Y5Y4 -> mm2
1220 @@ -310,25 +493,49 @@ rgbtoycb_mmx_loop:
1221 paddd %mm5, %mm6 #Y7Y6 -> mm6
1222 movq %mm7, %mm5 #R7B6G6R6 -> mm5
1225 + pmaddwd UR0GR@GOTOFF(%ebp), %mm7 #urR7,ugG6+ugR6 -> mm7
1227 pmaddwd UR0GR, %mm7 #urR7,ugG6+ugR6 -> mm7
1229 psrad $FIXPSHIFT, %mm3 #32-bit scaled V5V4 -> mm3
1232 + pmaddwd VBG0B@GOTOFF(%ebp), %mm1 #vbB7+vgG7,vbB6 -> mm1
1234 pmaddwd VBG0B, %mm1 #vbB7+vgG7,vbB6 -> mm1
1236 psrad $FIXPSHIFT, %mm6 #32-bit scaled Y7Y6 -> mm6
1238 packssdw %mm6, %mm2 #Y7Y6Y5Y4 -> mm2
1241 + pmaddwd VR0GR@GOTOFF(%ebp), %mm5 #vrR7,vgG6+vrR6 -> mm5
1243 pmaddwd VR0GR, %mm5 #vrR7,vgG6+vrR6 -> mm5
1245 paddd %mm4, %mm7 #U7U6 -> mm7
1247 psrad $FIXPSHIFT, %mm7 #32-bit scaled U7U6 -> mm7
1249 + paddw OFFSETY@GOTOFF(%ebp), %mm2
1253 movq %mm2, (%ebx) #store Y7Y6Y5Y4
1256 + movq ALLONE@GOTOFF(%ebp), %mm6
1260 packssdw %mm7, %mm0 #32-bit scaled U7U6U5U4 -> mm0
1263 + movq TEMPU@GOTOFF(%ebp), %mm4 #32-bit scaled U3U2U1U0 -> mm4
1265 movq TEMPU, %mm4 #32-bit scaled U3U2U1U0 -> mm4
1267 pmaddwd %mm6, %mm0 #U7U6U5U4 averaged -> (U7U6)(U5U4)=UU3 UU2->mm0
1269 pmaddwd %mm6, %mm4 #U3U2U1U0 averaged -> (U3U2)(U1U0)=UU1 UU0->mm4
1270 @@ -338,8 +545,12 @@ rgbtoycb_mmx_loop:
1272 psrad $FIXPSHIFT, %mm1 #32-bit scaled V7V6 -> mm1
1273 psraw $1, %mm4 #divide UU3 UU2 UU1 UU0 by 2 -> mm4
1277 + movq TEMPV@GOTOFF(%ebp), %mm5 #32-bit scaled V3V2V1V0 -> mm5
1279 movq TEMPV, %mm5 #32-bit scaled V3V2V1V0 -> mm5
1282 movq %mm4, (%ecx) # store U
1284 @@ -425,14 +636,22 @@ _dv_ppm_copy_y_block_mmx:
1285 _dv_pgm_copy_y_block_mmx:
1292 - movl 8(%ebp), %edi # dest
1293 - movl 12(%ebp), %esi # src
1296 + call __i686.get_pc_thunk.bp
1297 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1300 + movl 16(%esp), %edi # dest
1301 + movl 20(%esp), %esi # src
1304 + movq OFFSETY@GOTOFF(%ebp), %mm7
1311 @@ -567,14 +786,22 @@ _dv_pgm_copy_y_block_mmx:
1312 _dv_video_copy_y_block_mmx:
1319 - movl 8(%ebp), %edi # dest
1320 - movl 12(%ebp), %esi # src
1323 + call __i686.get_pc_thunk.bp
1324 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1327 + movl 16(%esp), %edi # dest
1328 + movl 20(%esp), %esi # src
1331 + movq OFFSETBX@GOTOFF(%ebp), %mm7
1338 @@ -855,16 +1082,23 @@ _dv_ppm_copy_pal_c_block_mmx:
1339 _dv_pgm_copy_pal_c_block_mmx:
1347 - movl 8(%ebp), %edi # dest
1348 - movl 12(%ebp), %esi # src
1351 + call __i686.get_pc_thunk.bp
1352 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1355 + movl 20(%esp), %edi # dest
1356 + movl 24(%esp), %esi # src
1359 + movq OFFSETBX@GOTOFF(%ebp), %mm7
1366 @@ -1003,15 +1237,23 @@ _dv_pgm_copy_pal_c_block_mmx:
1367 _dv_video_copy_pal_c_block_mmx:
1375 - movl 8(%ebp), %edi # dest
1376 - movl 12(%ebp), %esi # src
1379 + call __i686.get_pc_thunk.bp
1380 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1383 + movl 20(%esp), %edi # dest
1384 + movl 24(%esp), %esi # src
1387 + movq OFFSETBX@GOTOFF(%ebp), %mm7
1394 @@ -1098,18 +1340,25 @@ video_copy_pal_c_block_mmx_loop:
1395 _dv_ppm_copy_ntsc_c_block_mmx:
1403 - movl 8(%ebp), %edi # dest
1404 - movl 12(%ebp), %esi # src
1407 + call __i686.get_pc_thunk.bp
1408 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1411 + movl 20(%esp), %edi # dest
1412 + movl 24(%esp), %esi # src
1417 + movq ALLONE@GOTOFF(%ebp), %mm6
1422 ppm_copy_ntsc_c_block_mmx_loop:
1425 @@ -1171,14 +1420,22 @@ ppm_copy_ntsc_c_block_mmx_loop:
1426 _dv_pgm_copy_ntsc_c_block_mmx:
1433 - movl 8(%ebp), %edi # dest
1434 - movl 12(%ebp), %esi # src
1437 + call __i686.get_pc_thunk.bp
1438 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1441 + movl 16(%esp), %edi # dest
1442 + movl 20(%esp), %esi # src
1445 + movq OFFSETBX@GOTOFF(%ebp), %mm7
1452 @@ -1328,15 +1585,23 @@ _dv_pgm_copy_ntsc_c_block_mmx:
1453 _dv_video_copy_ntsc_c_block_mmx:
1461 - movl 8(%ebp), %edi # dest
1462 - movl 12(%ebp), %esi # src
1465 + call __i686.get_pc_thunk.bp
1466 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1469 + movl 20(%esp), %edi # dest
1470 + movl 24(%esp), %esi # src
1473 + movq OFFSETBX@GOTOFF(%ebp), %mm7
1480 diff -urp libdv-0.104-old/libdv/vlc_x86.S libdv-0.104/libdv/vlc_x86.S
1481 --- libdv-0.104-old/libdv/vlc_x86.S 2005-10-23 19:40:58.000000000 +0200
1482 +++ libdv-0.104/libdv/vlc_x86.S 2005-10-25 01:47:14.000000000 +0200
1488 +# undef __i686 /* gcc define gets in our way */
1489 + .section .gnu.linkonce.t.__i686.get_pc_thunk.bp,"ax",@progbits
1490 +.globl __i686.get_pc_thunk.bp
1491 + .hidden __i686.get_pc_thunk.bp
1492 + .type __i686.get_pc_thunk.bp,@function
1493 +__i686.get_pc_thunk.bp:
1497 + .section .gnu.linkonce.t.__i686.get_pc_thunk.si,"ax",@progbits
1498 +.globl __i686.get_pc_thunk.si
1499 + .hidden __i686.get_pc_thunk.si
1500 + .type __i686.get_pc_thunk.si,@function
1501 +__i686.get_pc_thunk.si:
1507 .globl dv_decode_vlc
1508 .type dv_decode_vlc,@function
1514 + call __i686.get_pc_thunk.bp
1515 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1518 - /* Args are at 8(%esp). */
1519 - movl 8(%esp),%eax /* %eax is bits */
1520 - movl 12(%esp),%ebx /* %ebx is maxbits */
1521 + /* Args are at 12(%esp). */
1522 + movl 12(%esp),%eax /* %eax is bits */
1523 + movl 16(%esp),%ebx /* %ebx is maxbits */
1524 andl $0x3f,%ebx /* limit index range STL*/
1527 + movl dv_vlc_class_index_mask@GOTOFF(%ebp,%ebx,4),%edx
1529 movl dv_vlc_class_index_mask(,%ebx,4),%edx
1533 + movl dv_vlc_class_index_rshift@GOTOFF(%ebp,%ebx,4),%ecx
1535 movl dv_vlc_class_index_rshift(,%ebx,4),%ecx
1539 + movl dv_vlc_classes@GOTOFF(%ebp,%ebx,4),%ecx
1541 movl dv_vlc_classes(,%ebx,4),%ecx
1543 movsbl (%ecx,%edx,1),%edx /* %edx is class */
1547 + movl dv_vlc_index_mask@GOTOFF(%ebp,%edx,4),%ebx
1548 + movl dv_vlc_index_rshift@GOTOFF(%ebp,%edx,4),%ecx
1550 movl dv_vlc_index_mask(,%edx,4),%ebx
1551 movl dv_vlc_index_rshift(,%edx,4),%ecx
1557 + movl dv_vlc_lookups@GOTOFF(%ebp,%edx,4),%edx
1559 movl dv_vlc_lookups(,%edx,4),%edx
1561 movl (%edx,%ebx,4),%edx
1563 /* Now %edx holds result, like this:
1564 @@ -42,7 +89,11 @@ dv_decode_vlc:
1569 + movl sign_mask@GOTOFF(%ebp,%ecx,4),%ebx
1571 movl sign_mask(,%ecx,4),%ebx
1576 @@ -63,14 +114,14 @@ dv_decode_vlc:
1578 Note that the 'broken' pattern is all ones (i.e. 0xffffffff)
1580 - movl 12(%esp),%ebx /* %ebx is maxbits */
1581 + movl 16(%esp),%ebx /* %ebx is maxbits */
1586 - movl 16(%esp),%eax
1587 + movl 20(%esp),%eax
1594 @@ -80,21 +131,38 @@ dv_decode_vlc:
1595 .type __dv_decode_vlc,@function
1601 + call __i686.get_pc_thunk.bp
1602 + addl $_GLOBAL_OFFSET_TABLE_, %ebp
1605 - /* Args are at 8(%esp). */
1606 - movl 8(%esp),%eax /* %eax is bits */
1607 + /* Args are at 12(%esp). */
1608 + movl 12(%esp),%eax /* %eax is bits */
1610 movl %eax,%edx /* %edx is class */
1614 + movsbl dv_vlc_class_lookup5@GOTOFF(%ebp,%edx),%edx
1616 + movl dv_vlc_index_mask@GOTOFF(%ebp,%edx,4),%ebx
1617 + movl dv_vlc_index_rshift@GOTOFF(%ebp,%edx,4),%ecx
1619 movsbl dv_vlc_class_lookup5(%edx),%edx
1622 movl dv_vlc_index_mask(,%edx,4),%ebx
1623 movl dv_vlc_index_rshift(,%edx,4),%ecx
1629 + movl dv_vlc_lookups@GOTOFF(%ebp,%edx,4),%edx
1631 movl dv_vlc_lookups(,%edx,4),%edx
1633 movl (%edx,%ebx,4),%edx
1635 /* Now %edx holds result, like this:
1636 @@ -112,7 +180,11 @@ __dv_decode_vlc:
1641 + movl sign_mask@GOTOFF(%ebp,%ecx,4),%ecx
1643 movl sign_mask(,%ecx,4),%ecx
1648 @@ -127,9 +199,9 @@ __dv_decode_vlc:
1652 - movl 12(%esp),%eax
1653 + movl 16(%esp),%eax
1660 @@ -147,6 +219,11 @@ dv_parse_ac_coeffs_pass0:
1665 + call __i686.get_pc_thunk.si
1666 + addl $_GLOBAL_OFFSET_TABLE_, %esi
1669 #define ARGn(N) (20+(4*(N)))(%esp)
1672 @@ -159,8 +236,10 @@ dv_parse_ac_coeffs_pass0:
1678 movl bitstream_t_buf(%esi),%esi
1680 movl dv_block_t_offset(%ebp),%edi
1681 movl dv_block_t_reorder(%ebp),%ebx
1683 @@ -170,7 +249,11 @@ dv_parse_ac_coeffs_pass0:
1685 movq dv_block_t_coeffs(%ebp),%mm1
1688 + pand const_f_0_0_0@GOTOFF(%esi),%mm1
1690 pand const_f_0_0_0,%mm1
1692 movq %mm1,dv_block_t_coeffs(%ebp)
1693 movq %mm0,(dv_block_t_coeffs + 8)(%ebp)
1694 movq %mm0,(dv_block_t_coeffs + 16)(%ebp)
1695 @@ -191,9 +274,17 @@ dv_parse_ac_coeffs_pass0:
1702 + movl bitstream_t_buf(%esi),%esi
1704 movzbl (%esi,%ecx,1),%eax
1705 movzbl 1(%esi,%ecx,1),%edx
1706 movzbl 2(%esi,%ecx,1),%ecx
1713 @@ -217,7 +308,11 @@ readloop:
1715 /* Attempt to use the shortcut first. If it hits, then
1716 this vlc term has been decoded. */
1718 + movl dv_vlc_class1_shortcut@GOTOFF(%esi,%ecx,4),%edx
1720 movl dv_vlc_class1_shortcut(,%ecx,4),%edx
1725 @@ -228,12 +323,19 @@ readloop:
1726 movl %ebx,dv_block_t_reorder(%ebp)
1731 + movsbl dv_vlc_class_lookup5@GOTOFF(%esi,%ecx),%ecx
1733 + movl dv_vlc_index_mask@GOTOFF(%esi,%ecx,4),%ebx
1734 + movl dv_vlc_lookups@GOTOFF(%esi,%ecx,4),%edx
1735 + movl dv_vlc_index_rshift@GOTOFF(%esi,%ecx,4),%ecx
1737 movsbl dv_vlc_class_lookup5(%ecx),%ecx
1739 movl dv_vlc_index_mask(,%ecx,4),%ebx
1740 movl dv_vlc_lookups(,%ecx,4),%edx
1741 movl dv_vlc_index_rshift(,%ecx,4),%ecx
1746 @@ -256,7 +358,11 @@ readloop:
1751 + movl sign_mask@GOTOFF(%esi,%ecx,4),%ecx
1753 movl sign_mask(,%ecx,4),%ecx
1758 @@ -326,10 +432,20 @@ alldone:
1761 /* slow path: use dv_decode_vlc */;
1764 + leal vlc@GOTOFF(%esi),%esi
1765 + xchgl %esi,(%esp) /* last parameter is &vlc */
1767 pushl $vlc /* last parameter is &vlc */
1769 pushl %edx /* bits_left */
1770 pushl %eax /* bits */
1772 + call dv_decode_vlc@PLT
1777 test $0x80,%edx /* If (vlc.run < 0) break */
1779 @@ -365,6 +481,11 @@ dv_parse_video_segment:
1784 + call __i686.get_pc_thunk.si
1785 + addl $_GLOBAL_OFFSET_TABLE_, %esi
1788 #define ARGn(N) (20+(4*(N)))(%esp)
1790 movl ARGn(1),%eax /* quality */
1791 @@ -373,7 +494,11 @@ dv_parse_video_segment:
1796 + movl %ebx,n_blocks@GOTOFF(%esi)
1803 @@ -384,15 +509,22 @@ its_mono:
1808 movl dv_videosegment_t_bs(%ebx),%esi
1809 movl bitstream_t_buf(%esi),%esi
1811 leal dv_videosegment_t_mb(%ebx),%edi
1817 + movl %eax,m@GOTOFF(%esi)
1818 + movl %ecx,mb_start@GOTOFF(%esi)
1826 @@ -400,7 +532,15 @@ macloop:
1827 /* mb->qno = bitstream_get(bs,4); */
1832 + movl dv_videosegment_t_bs(%ebx),%esi
1833 + movl bitstream_t_buf(%esi),%esi
1835 movzbl 3(%esi,%edx,1),%edx
1840 movl %edx,dv_macroblock_t_qno(%edi)
1842 @@ -411,7 +551,11 @@ macloop:
1843 movl %edx,dv_macroblock_t_eob_count(%edi)
1845 /* mb->i = (seg->i + dv_super_map_vertical[m]) % (seg->isPAL?12:10); */
1847 + movl dv_super_map_vertical@GOTOFF(%esi,%eax,4),%edx
1849 movl dv_super_map_vertical(,%eax,4),%edx
1851 movl dv_videosegment_t_i(%ebx),%ecx
1854 @@ -422,11 +566,20 @@ skarly:
1856 shll $5,%ecx /* ecx = (isPAL ? 32 : 0) */
1859 + leal mod_10@GOTOFF(%esi,%edx),%edx
1860 + movzbl (%edx,%ecx,1),%edx /* uses mod_12 for PAL */
1862 movzbl mod_10(%edx,%ecx,1),%edx /* uses mod_12 for PAL */
1864 movl %edx,dv_macroblock_t_i(%edi)
1866 /* mb->j = dv_super_map_horizontal[m]; */
1868 + movl dv_super_map_horizontal@GOTOFF(%esi,%eax,4),%edx
1870 movl dv_super_map_horizontal(,%eax,4),%edx
1872 movl %edx,dv_macroblock_t_j(%edi)
1874 /* mb->k = seg->k; */
1875 @@ -445,12 +598,29 @@ blkloop:
1876 +---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+
1878 /* dc = bitstream_get(bs,9); */
1880 + movl mb_start@GOTOFF(%esi),%ecx
1886 + movzbl blk_start@GOTOFF(%esi,%ebx),%edx
1888 movzbl blk_start(%ebx),%edx
1894 + movl dv_videosegment_t_bs(%esi),%esi
1895 + movl bitstream_t_buf(%esi),%esi
1897 movzbl (%esi,%edx,1),%eax /* hi byte */
1898 movzbl 1(%esi,%edx,1),%ecx /* lo byte */
1905 @@ -477,7 +647,11 @@ blkloop:
1907 /* bl->reorder = &dv_reorder[bl->dct_mode][1]; */
1910 + leal dv_reorder@GOTOFF+1(%esi,%eax),%eax /* PIC: eax += &dv_reorder + 1 (address, matching the non-PIC addl $imm); a memory-source addl here would add the table's contents instead */
1912 addl $(dv_reorder+1),%eax
1914 movl %eax,dv_block_t_reorder(%ebp)
1916 /* bl->reorder_sentinel = bl->reorder + 63; */
1917 @@ -485,13 +659,22 @@ blkloop:
1918 movl %eax,dv_block_t_reorder_sentinel(%ebp)
1920 /* bl->offset= mb_start + dv_parse_bit_start[b]; */
1922 + movl mb_start@GOTOFF(%esi),%ecx
1923 + movl dv_parse_bit_start@GOTOFF(%esi,%ebx,4),%eax
1926 movl dv_parse_bit_start(,%ebx,4),%eax
1929 movl %eax,dv_block_t_offset(%ebp)
1931 /* bl->end= mb_start + dv_parse_bit_end[b]; */
1933 + movl dv_parse_bit_end@GOTOFF(%esi,%ebx,4),%eax
1935 movl dv_parse_bit_end(,%ebx,4),%eax
1938 movl %eax,dv_block_t_end(%ebp)
1940 @@ -503,7 +686,11 @@ blkloop:
1941 /* no AC pass. Just zero out the remaining coeffs */
1942 movq dv_block_t_coeffs(%ebp),%mm1
1945 + pand const_f_0_0_0@GOTOFF(%esi),%mm1
1947 pand const_f_0_0_0,%mm1
1949 movq %mm1,dv_block_t_coeffs(%ebp)
1950 movq %mm0,(dv_block_t_coeffs + 8)(%ebp)
1951 movq %mm0,(dv_block_t_coeffs + 16)(%ebp)
1952 @@ -528,18 +715,31 @@ do_ac_pass:
1957 + call dv_parse_ac_coeffs_pass0@PLT
1959 call dv_parse_ac_coeffs_pass0
1965 + movl n_blocks@GOTOFF(%esi),%eax
1969 addl $dv_block_t_size,%ebp
1975 + movl m@GOTOFF(%esi),%eax
1976 + movl mb_start@GOTOFF(%esi),%ecx
1982 addl $dv_macroblock_t_size,%edi
1984 @@ -557,7 +757,11 @@ done_ac:
1986 andl $DV_QUALITY_AC_MASK,%eax
1987 cmpl $DV_QUALITY_AC_2,%eax
1989 + jz dv_parse_ac_coeffs@PLT
1991 jz dv_parse_ac_coeffs