#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW
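/* Editor's summary (not in the original source): this template is compiled
 * once per target (MMX, 3DNow!, MMXEXT, SSE2, ...), and the string macros
 * above pick the strongest instruction each target has — MOVNTQ is a
 * non-temporal "movntq" store on MMXEXT but a plain "movq" elsewhere, in
 * which case SFENCE and PREFETCH degrade to harmless no-op comments. */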
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
        "movd         (%1), %%mm0 \n\t"
        "punpckldq   3(%1), %%mm0 \n\t"
        "movd        6(%1), %%mm1 \n\t"
        "punpckldq   9(%1), %%mm1 \n\t"
        "movd       12(%1), %%mm2 \n\t"
        "punpckldq  15(%1), %%mm2 \n\t"
        "movd       18(%1), %%mm3 \n\t"
        "punpckldq  21(%1), %%mm3 \n\t"
        "por         %%mm7, %%mm0 \n\t"
        "por         %%mm7, %%mm1 \n\t"
        "por         %%mm7, %%mm2 \n\t"
        "por         %%mm7, %%mm3 \n\t"
        MOVNTQ"      %%mm2, 16(%0) \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2    \n\t" \
    "psrlq $8, %%mm3    \n\t" \
    "psrlq $8, %%mm6    \n\t" \
    "psrlq $8, %%mm7    \n\t" \
    "pand "MANGLE(mask24l)", %%mm0 \n\t" \
    "pand "MANGLE(mask24l)", %%mm1 \n\t" \
    "pand "MANGLE(mask24l)", %%mm4 \n\t" \
    "pand "MANGLE(mask24l)", %%mm5 \n\t" \
    "pand "MANGLE(mask24h)", %%mm2 \n\t" \
    "pand "MANGLE(mask24h)", %%mm3 \n\t" \
    "pand "MANGLE(mask24h)", %%mm6 \n\t" \
    "pand "MANGLE(mask24h)", %%mm7 \n\t" \
    "por %%mm2, %%mm0   \n\t" \
    "por %%mm3, %%mm1   \n\t" \
    "por %%mm6, %%mm4   \n\t" \
    "por %%mm7, %%mm5   \n\t" \
    \
    "movq %%mm1, %%mm2  \n\t" \
    "movq %%mm4, %%mm3  \n\t" \
    "psllq $48, %%mm2   \n\t" \
    "psllq $32, %%mm3   \n\t" \
    "por %%mm2, %%mm0   \n\t" \
    "psrlq $16, %%mm1   \n\t" \
    "psrlq $32, %%mm4   \n\t" \
    "psllq $16, %%mm5   \n\t" \
    "por %%mm3, %%mm1   \n\t" \
    "por %%mm5, %%mm4   \n\t" \
    \
    MOVNTQ" %%mm0,   (%0) \n\t" \
    MOVNTQ" %%mm1,  8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq    (%1), %%mm0 \n\t"
        "movq   8(%1), %%mm1 \n\t"
        "movq  16(%1), %%mm4 \n\t"
        "movq  24(%1), %%mm5 \n\t"
        "movq   %%mm0, %%mm2 \n\t"
        "movq   %%mm1, %%mm3 \n\t"
        "movq   %%mm4, %%mm6 \n\t"
        "movq   %%mm5, %%mm7 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
        "movq    (%1), %%mm0 \n\t"
        "movq   8(%1), %%mm2 \n\t"
        "movq   %%mm0, %%mm1 \n\t"
        "movq   %%mm2, %%mm3 \n\t"
        "pand   %%mm4, %%mm0 \n\t"
        "pand   %%mm4, %%mm2 \n\t"
        "paddw  %%mm1, %%mm0 \n\t"
        "paddw  %%mm3, %%mm2 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
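    /* Editor's note: adding (x & 0x7FE0) to the RGB555 value doubles the
     * green and red fields, i.e. shifts them up one bit: blue stays in bits
     * 0-4, green becomes the 6-bit 565 field (low bit zero) and red lands in
     * bits 11-15. The paddw loop above applies the same trick to four pixels
     * at once, mask15s being the packed 0x7FE0 mask. */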
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
        "movq    (%1), %%mm0 \n\t"
        "movq   8(%1), %%mm2 \n\t"
        "movq   %%mm0, %%mm1 \n\t"
        "movq   %%mm2, %%mm3 \n\t"
        "psrlq     $1, %%mm0 \n\t"
        "psrlq     $1, %%mm2 \n\t"
        "pand   %%mm7, %%mm0 \n\t"
        "pand   %%mm7, %%mm2 \n\t"
        "pand   %%mm6, %%mm1 \n\t"
        "pand   %%mm6, %%mm3 \n\t"
        "por    %%mm1, %%mm0 \n\t"
        "por    %%mm3, %%mm2 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
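    /* Editor's note: the inverse of rgb15to16 — the right shift drops the
     * low green bit and moves green/red back down one position, while the
     * 5-bit blue field passes through untouched. */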
    uint16_t *d = (uint16_t *)dst;
        "movq          %3, %%mm5 \n\t"
        "movq          %4, %%mm6 \n\t"
        "movq          %5, %%mm7 \n\t"
        "movd        (%1), %%mm0 \n\t"
        "movd       4(%1), %%mm3 \n\t"
        "punpckldq  8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq       %%mm0, %%mm1 \n\t"
        "movq       %%mm3, %%mm4 \n\t"
        "pand       %%mm6, %%mm0 \n\t"
        "pand       %%mm6, %%mm3 \n\t"
        "pmaddwd    %%mm7, %%mm0 \n\t"
        "pmaddwd    %%mm7, %%mm3 \n\t"
        "pand       %%mm5, %%mm1 \n\t"
        "pand       %%mm5, %%mm4 \n\t"
        "por        %%mm1, %%mm0 \n\t"
        "por        %%mm4, %%mm3 \n\t"
        "psrld         $5, %%mm0 \n\t"
        "pslld        $11, %%mm3 \n\t"
        "por        %%mm3, %%mm0 \n\t"
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
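/* Editor's sketch (illustrative, not part of the original file): the scalar
 * tail above, written out as a standalone helper — take the top 5/6/5 bits
 * of B, G and R from a little-endian 0x00RRGGBB dword and pack them as
 * RGB565. The name rgb32to16_one is hypothetical. */
static inline uint16_t rgb32to16_one(uint32_t rgb)
{
    return (uint16_t)(((rgb & 0x0000FF) >>  3)   /* B: bits  3-7  -> 0-4   */
                    | ((rgb & 0x00FC00) >>  5)   /* G: bits 10-15 -> 5-10  */
                    | ((rgb & 0xF80000) >>  8)); /* R: bits 19-23 -> 11-15 */
}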
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd        (%1), %%mm0 \n\t"
        "movd       4(%1), %%mm3 \n\t"
        "punpckldq  8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq       %%mm0, %%mm1 \n\t"
        "movq       %%mm0, %%mm2 \n\t"
        "movq       %%mm3, %%mm4 \n\t"
        "movq       %%mm3, %%mm5 \n\t"
        "psllq         $8, %%mm0 \n\t"
        "psllq         $8, %%mm3 \n\t"
        "pand       %%mm7, %%mm0 \n\t"
        "pand       %%mm7, %%mm3 \n\t"
        "psrlq         $5, %%mm1 \n\t"
        "psrlq         $5, %%mm4 \n\t"
        "pand       %%mm6, %%mm1 \n\t"
        "pand       %%mm6, %%mm4 \n\t"
        "psrlq        $19, %%mm2 \n\t"
        "psrlq        $19, %%mm5 \n\t"
        "pand          %2, %%mm2 \n\t"
        "pand          %2, %%mm5 \n\t"
        "por        %%mm1, %%mm0 \n\t"
        "por        %%mm4, %%mm3 \n\t"
        "por        %%mm2, %%mm0 \n\t"
        "por        %%mm5, %%mm3 \n\t"
        "psllq        $16, %%mm3 \n\t"
        "por        %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;
        "movq          %3, %%mm5 \n\t"
        "movq          %4, %%mm6 \n\t"
        "movq          %5, %%mm7 \n\t"
        "movd        (%1), %%mm0 \n\t"
        "movd       4(%1), %%mm3 \n\t"
        "punpckldq  8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq       %%mm0, %%mm1 \n\t"
        "movq       %%mm3, %%mm4 \n\t"
        "pand       %%mm6, %%mm0 \n\t"
        "pand       %%mm6, %%mm3 \n\t"
        "pmaddwd    %%mm7, %%mm0 \n\t"
        "pmaddwd    %%mm7, %%mm3 \n\t"
        "pand       %%mm5, %%mm1 \n\t"
        "pand       %%mm5, %%mm4 \n\t"
        "por        %%mm1, %%mm0 \n\t"
        "por        %%mm4, %%mm3 \n\t"
        "psrld         $6, %%mm0 \n\t"
        "pslld        $10, %%mm3 \n\t"
        "por        %%mm3, %%mm0 \n\t"
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd        (%1), %%mm0 \n\t"
        "movd       4(%1), %%mm3 \n\t"
        "punpckldq  8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq       %%mm0, %%mm1 \n\t"
        "movq       %%mm0, %%mm2 \n\t"
        "movq       %%mm3, %%mm4 \n\t"
        "movq       %%mm3, %%mm5 \n\t"
        "psllq         $7, %%mm0 \n\t"
        "psllq         $7, %%mm3 \n\t"
        "pand       %%mm7, %%mm0 \n\t"
        "pand       %%mm7, %%mm3 \n\t"
        "psrlq         $6, %%mm1 \n\t"
        "psrlq         $6, %%mm4 \n\t"
        "pand       %%mm6, %%mm1 \n\t"
        "pand       %%mm6, %%mm4 \n\t"
        "psrlq        $19, %%mm2 \n\t"
        "psrlq        $19, %%mm5 \n\t"
        "pand          %2, %%mm2 \n\t"
        "pand          %2, %%mm5 \n\t"
        "por        %%mm1, %%mm0 \n\t"
        "por        %%mm4, %%mm3 \n\t"
        "por        %%mm2, %%mm0 \n\t"
        "por        %%mm5, %%mm3 \n\t"
        "psllq        $16, %%mm3 \n\t"
        "por        %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd       (%1), %%mm0 \n\t"
        "movd      3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "movq      %%mm3, %%mm4 \n\t"
        "movq      %%mm3, %%mm5 \n\t"
        "psrlq        $3, %%mm0 \n\t"
        "psrlq        $3, %%mm3 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %2, %%mm3 \n\t"
        "psrlq        $5, %%mm1 \n\t"
        "psrlq        $5, %%mm4 \n\t"
        "pand      %%mm6, %%mm1 \n\t"
        "pand      %%mm6, %%mm4 \n\t"
        "psrlq        $8, %%mm2 \n\t"
        "psrlq        $8, %%mm5 \n\t"
        "pand      %%mm7, %%mm2 \n\t"
        "pand      %%mm7, %%mm5 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "por       %%mm5, %%mm3 \n\t"
        "psllq       $16, %%mm3 \n\t"
        "por       %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
        "movd       (%1), %%mm0 \n\t"
        "movd      3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "movq      %%mm3, %%mm4 \n\t"
        "movq      %%mm3, %%mm5 \n\t"
        "psllq        $8, %%mm0 \n\t"
        "psllq        $8, %%mm3 \n\t"
        "pand      %%mm7, %%mm0 \n\t"
        "pand      %%mm7, %%mm3 \n\t"
        "psrlq        $5, %%mm1 \n\t"
        "psrlq        $5, %%mm4 \n\t"
        "pand      %%mm6, %%mm1 \n\t"
        "pand      %%mm6, %%mm4 \n\t"
        "psrlq       $19, %%mm2 \n\t"
        "psrlq       $19, %%mm5 \n\t"
        "pand         %2, %%mm2 \n\t"
        "pand         %2, %%mm5 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "por       %%mm5, %%mm3 \n\t"
        "psllq       $16, %%mm3 \n\t"
        "por       %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd       (%1), %%mm0 \n\t"
        "movd      3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "movq      %%mm3, %%mm4 \n\t"
        "movq      %%mm3, %%mm5 \n\t"
        "psrlq        $3, %%mm0 \n\t"
        "psrlq        $3, %%mm3 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %2, %%mm3 \n\t"
        "psrlq        $6, %%mm1 \n\t"
        "psrlq        $6, %%mm4 \n\t"
        "pand      %%mm6, %%mm1 \n\t"
        "pand      %%mm6, %%mm4 \n\t"
        "psrlq        $9, %%mm2 \n\t"
        "psrlq        $9, %%mm5 \n\t"
        "pand      %%mm7, %%mm2 \n\t"
        "pand      %%mm7, %%mm5 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "por       %%mm5, %%mm3 \n\t"
        "psllq       $16, %%mm3 \n\t"
        "por       %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    uint16_t *d = (uint16_t *)dst;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
        "movd       (%1), %%mm0 \n\t"
        "movd      3(%1), %%mm3 \n\t"
        "punpckldq 6(%1), %%mm0 \n\t"
        "punpckldq 9(%1), %%mm3 \n\t"
        "movq      %%mm0, %%mm1 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "movq      %%mm3, %%mm4 \n\t"
        "movq      %%mm3, %%mm5 \n\t"
        "psllq        $7, %%mm0 \n\t"
        "psllq        $7, %%mm3 \n\t"
        "pand      %%mm7, %%mm0 \n\t"
        "pand      %%mm7, %%mm3 \n\t"
        "psrlq        $6, %%mm1 \n\t"
        "psrlq        $6, %%mm4 \n\t"
        "pand      %%mm6, %%mm1 \n\t"
        "pand      %%mm6, %%mm4 \n\t"
        "psrlq       $19, %%mm2 \n\t"
        "psrlq       $19, %%mm5 \n\t"
        "pand         %2, %%mm2 \n\t"
        "pand         %2, %%mm5 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "por       %%mm5, %%mm3 \n\t"
        "psllq       $16, %%mm3 \n\t"
        "por       %%mm3, %%mm0 \n\t"
        ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq       (%1), %%mm0 \n\t"
        "movq       (%1), %%mm1 \n\t"
        "movq       (%1), %%mm2 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %3, %%mm1 \n\t"
        "pand         %4, %%mm2 \n\t"
        "psllq        $5, %%mm0 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)",  %%mm2 \n\t"
        "movq      %%mm0, %%mm3 \n\t"
        "movq      %%mm1, %%mm4 \n\t"
        "movq      %%mm2, %%mm5 \n\t"
        "punpcklwd    %5, %%mm0 \n\t"
        "punpcklwd    %5, %%mm1 \n\t"
        "punpcklwd    %5, %%mm2 \n\t"
        "punpckhwd    %5, %%mm3 \n\t"
        "punpckhwd    %5, %%mm4 \n\t"
        "punpckhwd    %5, %%mm5 \n\t"
        "psllq        $8, %%mm1 \n\t"
        "psllq       $16, %%mm2 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "psllq        $8, %%mm4 \n\t"
        "psllq       $16, %%mm5 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm5, %%mm3 \n\t"

        "movq      %%mm0, %%mm6 \n\t"
        "movq      %%mm3, %%mm7 \n\t"

        "movq      8(%1), %%mm0 \n\t"
        "movq      8(%1), %%mm1 \n\t"
        "movq      8(%1), %%mm2 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %3, %%mm1 \n\t"
        "pand         %4, %%mm2 \n\t"
        "psllq        $5, %%mm0 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)",  %%mm2 \n\t"
        "movq      %%mm0, %%mm3 \n\t"
        "movq      %%mm1, %%mm4 \n\t"
        "movq      %%mm2, %%mm5 \n\t"
        "punpcklwd    %5, %%mm0 \n\t"
        "punpcklwd    %5, %%mm1 \n\t"
        "punpcklwd    %5, %%mm2 \n\t"
        "punpckhwd    %5, %%mm3 \n\t"
        "punpckhwd    %5, %%mm4 \n\t"
        "punpckhwd    %5, %%mm5 \n\t"
        "psllq        $8, %%mm1 \n\t"
        "psllq       $16, %%mm2 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "psllq        $8, %%mm4 \n\t"
        "psllq       $16, %%mm5 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm5, %%mm3 \n\t"
        : "r"(s), "m"(mask15b), "m"(mask15g), "m"(mask15r), "m"(mmx_null)
        "movq      %%mm0, %%mm4 \n\t"
        "movq      %%mm3, %%mm5 \n\t"
        "movq      %%mm6, %%mm0 \n\t"
        "movq      %%mm7, %%mm1 \n\t"

        "movq      %%mm4, %%mm6 \n\t"
        "movq      %%mm5, %%mm7 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "movq      %%mm1, %%mm3 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3)   | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2)  | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
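/* Editor's sketch (illustrative, not part of the original file): the scalar
 * tail widens each 5-bit field to 8 bits by replicating its top bits into
 * the low bits, so 0 -> 0 and 31 -> 255 with no bias. The helper name is
 * hypothetical; the <stdint.h> types are already used throughout this file. */
static inline uint8_t expand5to8(unsigned v) /* v is a 5-bit field value */
{
    return (uint8_t)((v << 3) | (v >> 2)); /* replicate top bits downward */
}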
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
        "movq       (%1), %%mm0 \n\t"
        "movq       (%1), %%mm1 \n\t"
        "movq       (%1), %%mm2 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %3, %%mm1 \n\t"
        "pand         %4, %%mm2 \n\t"
        "psllq        $5, %%mm0 \n\t"
        "psrlq        $1, %%mm2 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)",  %%mm2 \n\t"
        "movq      %%mm0, %%mm3 \n\t"
        "movq      %%mm1, %%mm4 \n\t"
        "movq      %%mm2, %%mm5 \n\t"
        "punpcklwd    %5, %%mm0 \n\t"
        "punpcklwd    %5, %%mm1 \n\t"
        "punpcklwd    %5, %%mm2 \n\t"
        "punpckhwd    %5, %%mm3 \n\t"
        "punpckhwd    %5, %%mm4 \n\t"
        "punpckhwd    %5, %%mm5 \n\t"
        "psllq        $8, %%mm1 \n\t"
        "psllq       $16, %%mm2 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "psllq        $8, %%mm4 \n\t"
        "psllq       $16, %%mm5 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm5, %%mm3 \n\t"

        "movq      %%mm0, %%mm6 \n\t"
        "movq      %%mm3, %%mm7 \n\t"

        "movq      8(%1), %%mm0 \n\t"
        "movq      8(%1), %%mm1 \n\t"
        "movq      8(%1), %%mm2 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %3, %%mm1 \n\t"
        "pand         %4, %%mm2 \n\t"
        "psllq        $5, %%mm0 \n\t"
        "psrlq        $1, %%mm2 \n\t"
        "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)",  %%mm2 \n\t"
        "movq      %%mm0, %%mm3 \n\t"
        "movq      %%mm1, %%mm4 \n\t"
        "movq      %%mm2, %%mm5 \n\t"
        "punpcklwd    %5, %%mm0 \n\t"
        "punpcklwd    %5, %%mm1 \n\t"
        "punpcklwd    %5, %%mm2 \n\t"
        "punpckhwd    %5, %%mm3 \n\t"
        "punpckhwd    %5, %%mm4 \n\t"
        "punpckhwd    %5, %%mm5 \n\t"
        "psllq        $8, %%mm1 \n\t"
        "psllq       $16, %%mm2 \n\t"
        "por       %%mm1, %%mm0 \n\t"
        "por       %%mm2, %%mm0 \n\t"
        "psllq        $8, %%mm4 \n\t"
        "psllq       $16, %%mm5 \n\t"
        "por       %%mm4, %%mm3 \n\t"
        "por       %%mm5, %%mm3 \n\t"
        : "r"(s), "m"(mask16b), "m"(mask16g), "m"(mask16r), "m"(mmx_null)
        "movq      %%mm0, %%mm4 \n\t"
        "movq      %%mm3, %%mm5 \n\t"
        "movq      %%mm6, %%mm0 \n\t"
        "movq      %%mm7, %%mm1 \n\t"

        "movq      %%mm4, %%mm6 \n\t"
        "movq      %%mm5, %%mm7 \n\t"
        "movq      %%mm0, %%mm2 \n\t"
        "movq      %%mm1, %%mm3 \n\t"
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3)   | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3)  | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
939 "packuswb %%mm7, %%mm0 \n\t" \
940 "packuswb %%mm7, %%mm1 \n\t" \
941 "packuswb %%mm7, %%mm2 \n\t" \
942 "punpcklbw %%mm1, %%mm0 \n\t" \
943 "punpcklbw %%mm6, %%mm2 \n\t" \
944 "movq %%mm0, %%mm3 \n\t" \
945 "punpcklwd %%mm2, %%mm0 \n\t" \
946 "punpckhwd %%mm2, %%mm3 \n\t" \
947 MOVNTQ" %%mm0, (%0) \n\t" \
948 MOVNTQ" %%mm3, 8(%0) \n\t" \
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
        "movq       (%1), %%mm0 \n\t"
        "movq       (%1), %%mm1 \n\t"
        "movq       (%1), %%mm2 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %3, %%mm1 \n\t"
        "pand         %4, %%mm2 \n\t"
        "psllq        $5, %%mm0 \n\t"
        "pmulhw       %5, %%mm0 \n\t"
        "pmulhw       %5, %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
        ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r),"m"(mul15_mid)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3)   | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2)  | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    const uint16_t *mm_end;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor    %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    while (s < mm_end) {
        "movq       (%1), %%mm0 \n\t"
        "movq       (%1), %%mm1 \n\t"
        "movq       (%1), %%mm2 \n\t"
        "pand         %2, %%mm0 \n\t"
        "pand         %3, %%mm1 \n\t"
        "pand         %4, %%mm2 \n\t"
        "psllq        $5, %%mm0 \n\t"
        "psrlq        $1, %%mm2 \n\t"
        "pmulhw       %5, %%mm0 \n\t"
        "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
        "pmulhw "MANGLE(mul15_hi)",  %%mm2 \n\t"
        ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
        register uint16_t bgr;
        *d++ = ((bgr&0x1F)<<3)   | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3)  | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
1046 "movq %3, %%mm7 \n\t"
1047 "pxor %4, %%mm7 \n\t"
1048 "movq %%mm7, %%mm6 \n\t"
1049 "pxor %5, %%mm7 \n\t"
1053 "movq (%1, %0), %%mm0 \n\t"
1054 "movq 8(%1, %0), %%mm1 \n\t"
1055 # if COMPILE_TEMPLATE_MMXEXT
1056 "pshufw $177, %%mm0, %%mm3 \n\t"
1057 "pshufw $177, %%mm1, %%mm5 \n\t"
1058 "pand %%mm7, %%mm0 \n\t"
1059 "pand %%mm6, %%mm3 \n\t"
1060 "pand %%mm7, %%mm1 \n\t"
1061 "pand %%mm6, %%mm5 \n\t"
1062 "por %%mm3, %%mm0 \n\t"
1063 "por %%mm5, %%mm1 \n\t"
1065 "movq %%mm0, %%mm2 \n\t"
1066 "movq %%mm1, %%mm4 \n\t"
1067 "pand %%mm7, %%mm0 \n\t"
1068 "pand %%mm6, %%mm2 \n\t"
1069 "pand %%mm7, %%mm1 \n\t"
1070 "pand %%mm6, %%mm4 \n\t"
1071 "movq %%mm2, %%mm3 \n\t"
1072 "movq %%mm4, %%mm5 \n\t"
1073 "pslld $16, %%mm2 \n\t"
1074 "psrld $16, %%mm3 \n\t"
1075 "pslld $16, %%mm4 \n\t"
1076 "psrld $16, %%mm5 \n\t"
1077 "por %%mm2, %%mm0 \n\t"
1078 "por %%mm4, %%mm1 \n\t"
1079 "por %%mm3, %%mm0 \n\t"
1080 "por %%mm5, %%mm1 \n\t"
1082 MOVNTQ" %%mm0, (%2, %0) \n\t"
1083 MOVNTQ" %%mm1, 8(%2, %0) \n\t"
1090 :
"r" (s),
"r" (d),
"m" (mask32b),
"m" (mask32r),
"m" (mmx_one)
1092 for (; idx<15; idx+=4) {
1093 register unsigned v = *(
const uint32_t *)&s[idx],
g = v & 0xff00ff00;
1095 *(uint32_t *)&d[idx] = (v>>16) +
g + (v<<16);
    x86_reg mmx_size= 23 - src_size;
        "test          %%"REG_a", %%"REG_a" \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        "movq    (%1, %%"REG_a"), %%mm0 \n\t"
        "movq    (%1, %%"REG_a"), %%mm1 \n\t"
        "movq   2(%1, %%"REG_a"), %%mm2 \n\t"
        "psllq  $16, %%mm0 \n\t"
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por  %%mm0, %%mm1 \n\t"
        "por  %%mm2, %%mm1 \n\t"
        "movq   6(%1, %%"REG_a"), %%mm0 \n\t"
        MOVNTQ" %%mm1,   (%2, %%"REG_a") \n\t"
        "movq   8(%1, %%"REG_a"), %%mm1 \n\t"
        "movq  10(%1, %%"REG_a"), %%mm2 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por  %%mm0, %%mm1 \n\t"
        "por  %%mm2, %%mm1 \n\t"
        "movq  14(%1, %%"REG_a"), %%mm0 \n\t"
        MOVNTQ" %%mm1,  8(%2, %%"REG_a") \n\t"
        "movq  16(%1, %%"REG_a"), %%mm1 \n\t"
        "movq  18(%1, %%"REG_a"), %%mm2 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por  %%mm0, %%mm1 \n\t"
        "por  %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add   $24, %%"REG_a" \n\t"
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    if (mmx_size==23) return;
    src_size= 23-mmx_size;
    for (i=0; i<src_size; i+=3) {
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
                                      int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    for (y=0; y<height; y++) {
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
        "movq    (%2, %%"REG_a"), %%mm0 \n\t"
        "movq  %%mm0, %%mm2 \n\t"
        "movq    (%3, %%"REG_a"), %%mm1 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"
        "movq   (%1, %%"REG_a",2), %%mm3 \n\t"
        "movq  8(%1, %%"REG_a",2), %%mm5 \n\t"
        "movq  %%mm3, %%mm4 \n\t"
        "movq  %%mm5, %%mm6 \n\t"
        "punpcklbw %%mm0, %%mm3 \n\t"
        "punpckhbw %%mm0, %%mm4 \n\t"
        "punpcklbw %%mm2, %%mm5 \n\t"
        "punpckhbw %%mm2, %%mm6 \n\t"
        MOVNTQ" %%mm3,   (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4,  8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(dst),"r"(ysrc),"r"(usrc),"r"(vsrc),"g" (chromWidth)
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
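/* Editor's sketch (illustrative, not from the original file): the scalar
 * meaning of the punpcklbw/punpckhbw ladder above — interleave one U/V pair
 * with two luma samples into a YUYV macropixel; the MMX loop emits eight
 * macropixels per iteration. The helper name is hypothetical. */
static inline void pack_yuyv_pair(uint8_t *dst, const uint8_t *y,
                                  uint8_t u, uint8_t v)
{
    dst[0] = y[0]; /* Y0 */
    dst[1] = u;    /* U  */
    dst[2] = y[1]; /* Y1 */
    dst[3] = v;    /* V  */
}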
                               int lumStride, int chromStride, int dstStride)
                                      int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
    for (y=0; y<height; y++) {
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
        "movq    (%2, %%"REG_a"), %%mm0 \n\t"
        "movq  %%mm0, %%mm2 \n\t"
        "movq    (%3, %%"REG_a"), %%mm1 \n\t"
        "punpcklbw %%mm1, %%mm0 \n\t"
        "punpckhbw %%mm1, %%mm2 \n\t"
        "movq   (%1, %%"REG_a",2), %%mm3 \n\t"
        "movq  8(%1, %%"REG_a",2), %%mm5 \n\t"
        "movq  %%mm0, %%mm4 \n\t"
        "movq  %%mm2, %%mm6 \n\t"
        "punpcklbw %%mm3, %%mm0 \n\t"
        "punpckhbw %%mm3, %%mm4 \n\t"
        "punpcklbw %%mm5, %%mm2 \n\t"
        "punpckhbw %%mm5, %%mm6 \n\t"
        MOVNTQ" %%mm0,   (%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm4,  8(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
        MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(dst),"r"(ysrc),"r"(usrc),"r"(vsrc),"g" (chromWidth)
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
                               int lumStride, int chromStride, int dstStride)
                               int lumStride, int chromStride, int dstStride)
                               int lumStride, int chromStride, int dstStride)
                               int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y+=2) {
        "xor   %%"REG_a", %%"REG_a" \n\t"
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw     $8, %%mm7 \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq    (%0, %%"REG_a", 4), %%mm0 \n\t"
        "movq   8(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq  %%mm0, %%mm2 \n\t"
        "movq  %%mm1, %%mm3 \n\t"
        "psrlw    $8, %%mm0 \n\t"
        "psrlw    $8, %%mm1 \n\t"
        "pand  %%mm7, %%mm2 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
        "movq  16(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq  24(%0, %%"REG_a", 4), %%mm2 \n\t"
        "movq  %%mm1, %%mm3 \n\t"
        "movq  %%mm2, %%mm4 \n\t"
        "psrlw    $8, %%mm1 \n\t"
        "psrlw    $8, %%mm2 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "pand  %%mm7, %%mm4 \n\t"
        "packuswb %%mm2, %%mm1 \n\t"
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
        "movq  %%mm0, %%mm2 \n\t"
        "movq  %%mm1, %%mm3 \n\t"
        "psrlw    $8, %%mm0 \n\t"
        "psrlw    $8, %%mm1 \n\t"
        "pand  %%mm7, %%mm2 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
        MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(src),"r"(ydst),"r"(udst),"r"(vdst),"g" (chromWidth)
        : "memory", "%"REG_a
        "xor %%"REG_a", %%"REG_a" \n\t"
        PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
        "movq    (%0, %%"REG_a", 4), %%mm0 \n\t"
        "movq   8(%0, %%"REG_a", 4), %%mm1 \n\t"
        "movq  16(%0, %%"REG_a", 4), %%mm2 \n\t"
        "movq  24(%0, %%"REG_a", 4), %%mm3 \n\t"
        "pand  %%mm7, %%mm0 \n\t"
        "pand  %%mm7, %%mm1 \n\t"
        "pand  %%mm7, %%mm2 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        MOVNTQ" %%mm0,  (%1, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
        "add $8, %%"REG_a" \n\t"
        "cmp %4, %%"REG_a" \n\t"
        ::"r"(src),"r"(ydst),"r"(udst),"r"(vdst),"g" (chromWidth)
        : "memory", "%"REG_a
        udst += chromStride;
        vdst += chromStride;
    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");
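/* Editor's sketch (illustrative, not from the original file): the scalar
 * meaning of the pand/psrlw/packuswb ladder above — split one YUYV
 * macropixel back into planar samples. The helper name is hypothetical. */
static inline void unpack_yuyv(const uint8_t *s, uint8_t *y,
                               uint8_t *u, uint8_t *v)
{
    y[0] = s[0]; /* even bytes carry luma          */
    u[0] = s[1]; /* odd bytes alternate U, V, U, V */
    y[1] = s[2];
    v[0] = s[3];
}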
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    dst[2*srcWidth-1]= src[srcWidth-1];
    for (y=1; y<srcHeight; y++) {
        x86_reg mmxSize= srcWidth&~15;
        "mov           %4, %%"REG_a" \n\t"
        "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
        "movq  (%0, %%"REG_a"), %%mm4 \n\t"
        "movq  %%mm4, %%mm2 \n\t"
        "psllq    $8, %%mm4 \n\t"
        "pand  %%mm0, %%mm2 \n\t"
        "por   %%mm2, %%mm4 \n\t"
        "movq  (%1, %%"REG_a"), %%mm5 \n\t"
        "movq  %%mm5, %%mm3 \n\t"
        "psllq    $8, %%mm5 \n\t"
        "pand  %%mm0, %%mm3 \n\t"
        "por   %%mm3, %%mm5 \n\t"
        "movq   (%0, %%"REG_a"), %%mm0 \n\t"
        "movq   (%1, %%"REG_a"), %%mm1 \n\t"
        "movq  1(%0, %%"REG_a"), %%mm2 \n\t"
        "movq  1(%1, %%"REG_a"), %%mm3 \n\t"
        PAVGB"   %%mm0, %%mm5 \n\t"
        PAVGB"   %%mm0, %%mm3 \n\t"
        PAVGB"   %%mm0, %%mm5 \n\t"
        PAVGB"   %%mm0, %%mm3 \n\t"
        PAVGB"   %%mm1, %%mm4 \n\t"
        PAVGB"   %%mm1, %%mm2 \n\t"
        PAVGB"   %%mm1, %%mm4 \n\t"
        PAVGB"   %%mm1, %%mm2 \n\t"
        "movq  %%mm5, %%mm7 \n\t"
        "movq  %%mm4, %%mm6 \n\t"
        "punpcklbw %%mm3, %%mm5 \n\t"
        "punpckhbw %%mm3, %%mm7 \n\t"
        "punpcklbw %%mm2, %%mm4 \n\t"
        "punpckhbw %%mm2, %%mm6 \n\t"
        MOVNTQ" %%mm5,  (%2, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm4,  (%3, %%"REG_a", 2) \n\t"
        MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
        "add $8, %%"REG_a" \n\t"
        "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
        "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
        :: "r" (src + mmxSize), "r" (src + srcStride + mmxSize),
           "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
           "g" (mmxSize)
        dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
        dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    dst[2*srcWidth-1]= src[srcWidth-1];
    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");
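/* Editor's note: planar2x doubles a plane with 2-tap bilinear filtering —
 * every output sample is (3*near + far) >> 2, as the scalar edge code shows.
 * The MMXEXT/3DNow! path approximates that weighting with two rounds of
 * PAVGB per register: pavg(a, pavg(a, b)) yields roughly (3a + b)/4 on
 * unsigned bytes, with rounding. */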
#if !COMPILE_TEMPLATE_AMD3DNOW
                               int lumStride, int chromStride, int srcStride)
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        "xor %%"REG_a", %%"REG_a" \n\t"
1536 "pcmpeqw %%mm7, %%mm7 \n\t"
1537 "psrlw $8, %%mm7 \n\t"
1540 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1541 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1542 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1543 "movq %%mm0, %%mm2 \n\t"
1544 "movq %%mm1, %%mm3 \n\t"
1545 "pand %%mm7, %%mm0 \n\t"
1546 "pand %%mm7, %%mm1 \n\t"
1547 "psrlw $8, %%mm2 \n\t"
1548 "psrlw $8, %%mm3 \n\t"
1549 "packuswb %%mm1, %%mm0 \n\t"
1550 "packuswb %%mm3, %%mm2 \n\t"
1552 MOVNTQ" %%mm2, (%1, %%"REG_a
", 2) \n\t"
1554 "movq 16(%0, %%"REG_a
", 4), %%mm1 \n\t"
1555 "movq 24(%0, %%"REG_a
", 4), %%mm2 \n\t"
1556 "movq %%mm1, %%mm3 \n\t"
1557 "movq %%mm2, %%mm4 \n\t"
1558 "pand %%mm7, %%mm1 \n\t"
1559 "pand %%mm7, %%mm2 \n\t"
1560 "psrlw $8, %%mm3 \n\t"
1561 "psrlw $8, %%mm4 \n\t"
1562 "packuswb %%mm2, %%mm1 \n\t"
1563 "packuswb %%mm4, %%mm3 \n\t"
1565 MOVNTQ" %%mm3, 8(%1, %%"REG_a
", 2) \n\t"
1567 "movq %%mm0, %%mm2 \n\t"
1568 "movq %%mm1, %%mm3 \n\t"
1569 "psrlw $8, %%mm0 \n\t"
1570 "psrlw $8, %%mm1 \n\t"
1571 "pand %%mm7, %%mm2 \n\t"
1572 "pand %%mm7, %%mm3 \n\t"
1573 "packuswb %%mm1, %%mm0 \n\t"
1574 "packuswb %%mm3, %%mm2 \n\t"
1576 MOVNTQ" %%mm0, (%3, %%"REG_a
") \n\t"
1577 MOVNTQ" %%mm2, (%2, %%"REG_a
") \n\t"
1579 "add $8, %%"REG_a
" \n\t"
1580 "cmp %4, %%"REG_a
" \n\t"
1582 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1583 :
"memory",
"%"REG_a
1590 "xor %%"REG_a
", %%"REG_a
" \n\t"
1593 PREFETCH" 64(%0, %%"REG_a
", 4) \n\t"
1594 "movq (%0, %%"REG_a
", 4), %%mm0 \n\t"
1595 "movq 8(%0, %%"REG_a
", 4), %%mm1 \n\t"
1596 "movq 16(%0, %%"REG_a
", 4), %%mm2 \n\t"
1597 "movq 24(%0, %%"REG_a
", 4), %%mm3 \n\t"
1598 "psrlw $8, %%mm0 \n\t"
1599 "psrlw $8, %%mm1 \n\t"
1600 "psrlw $8, %%mm2 \n\t"
1601 "psrlw $8, %%mm3 \n\t"
1602 "packuswb %%mm1, %%mm0 \n\t"
1603 "packuswb %%mm3, %%mm2 \n\t"
1605 MOVNTQ" %%mm0, (%1, %%"REG_a
", 2) \n\t"
1606 MOVNTQ" %%mm2, 8(%1, %%"REG_a
", 2) \n\t"
1608 "add $8, %%"REG_a
" \n\t"
1609 "cmp %4, %%"REG_a
" \n\t"
1612 ::
"r"(
src),
"r"(ydst),
"r"(udst),
"r"(vdst),
"g" (chromWidth)
1613 :
"memory",
"%"REG_a
1615 udst += chromStride;
1616 vdst += chromStride;
1620 __asm__
volatile(
EMMS" \n\t"
                                       int lumStride, int chromStride, int srcStride,
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
    const x86_reg chromWidth= width>>1;
        ydst += 2*lumStride;
        udst += chromStride;
        vdst += chromStride;
    for (y=0; y<height-2; y+=2) {
        for (i=0; i<2; i++) {
            "mov                        %2, %%"REG_a" \n\t"
            "movq          "BGR2Y_IDX"(%3), %%mm6 \n\t"
            "movq       "MANGLE(ff_w1111)", %%mm5 \n\t"
            "pxor                    %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "movd    (%0, %%"REG_d"), %%mm0 \n\t"
            "movd   3(%0, %%"REG_d"), %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "movd   6(%0, %%"REG_d"), %%mm2 \n\t"
            "movd   9(%0, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "pmaddwd   %%mm6, %%mm0 \n\t"
            "pmaddwd   %%mm6, %%mm1 \n\t"
            "pmaddwd   %%mm6, %%mm2 \n\t"
            "pmaddwd   %%mm6, %%mm3 \n\t"
            "psrad        $8, %%mm0 \n\t"
            "psrad        $8, %%mm1 \n\t"
            "psrad        $8, %%mm2 \n\t"
            "psrad        $8, %%mm3 \n\t"
            "packssdw  %%mm1, %%mm0 \n\t"
            "packssdw  %%mm3, %%mm2 \n\t"
            "pmaddwd   %%mm5, %%mm0 \n\t"
            "pmaddwd   %%mm5, %%mm2 \n\t"
            "packssdw  %%mm2, %%mm0 \n\t"
            "psraw        $7, %%mm0 \n\t"
            "movd  12(%0, %%"REG_d"), %%mm4 \n\t"
            "movd  15(%0, %%"REG_d"), %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "movd  18(%0, %%"REG_d"), %%mm2 \n\t"
            "movd  21(%0, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "pmaddwd   %%mm6, %%mm4 \n\t"
            "pmaddwd   %%mm6, %%mm1 \n\t"
            "pmaddwd   %%mm6, %%mm2 \n\t"
            "pmaddwd   %%mm6, %%mm3 \n\t"
            "psrad        $8, %%mm4 \n\t"
            "psrad        $8, %%mm1 \n\t"
            "psrad        $8, %%mm2 \n\t"
            "psrad        $8, %%mm3 \n\t"
            "packssdw  %%mm1, %%mm4 \n\t"
            "packssdw  %%mm3, %%mm2 \n\t"
            "pmaddwd   %%mm5, %%mm4 \n\t"
            "pmaddwd   %%mm5, %%mm2 \n\t"
            "add         $24, %%"REG_d" \n\t"
            "packssdw  %%mm2, %%mm4 \n\t"
            "psraw        $7, %%mm4 \n\t"
            "packuswb  %%mm4, %%mm0 \n\t"
            "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
            MOVNTQ"    %%mm0, (%1, %%"REG_a") \n\t"
            "add          $8, %%"REG_a" \n\t"
            : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
            : "%"REG_a, "%"REG_d
1728 "mov %4, %%"REG_a
" \n\t"
1729 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1730 "movq "BGR2U_IDX
"(%5), %%mm6 \n\t"
1731 "pxor %%mm7, %%mm7 \n\t"
1732 "lea (%%"REG_a
", %%"REG_a
", 2), %%"REG_d
" \n\t"
1733 "add %%"REG_d
", %%"REG_d
" \n\t"
1738 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1739 "movq (%0, %%"REG_d
"), %%mm0 \n\t"
1740 "movq (%1, %%"REG_d
"), %%mm1 \n\t"
1741 "movq 6(%0, %%"REG_d
"), %%mm2 \n\t"
1742 "movq 6(%1, %%"REG_d
"), %%mm3 \n\t"
1743 PAVGB" %%mm1, %%mm0 \n\t"
1744 PAVGB" %%mm3, %%mm2 \n\t"
1745 "movq %%mm0, %%mm1 \n\t"
1746 "movq %%mm2, %%mm3 \n\t"
1747 "psrlq $24, %%mm0 \n\t"
1748 "psrlq $24, %%mm2 \n\t"
1749 PAVGB" %%mm1, %%mm0 \n\t"
1750 PAVGB" %%mm3, %%mm2 \n\t"
1751 "punpcklbw %%mm7, %%mm0 \n\t"
1752 "punpcklbw %%mm7, %%mm2 \n\t"
1754 "movd (%0, %%"REG_d
"), %%mm0 \n\t"
1755 "movd (%1, %%"REG_d
"), %%mm1 \n\t"
1756 "movd 3(%0, %%"REG_d
"), %%mm2 \n\t"
1757 "movd 3(%1, %%"REG_d
"), %%mm3 \n\t"
1758 "punpcklbw %%mm7, %%mm0 \n\t"
1759 "punpcklbw %%mm7, %%mm1 \n\t"
1760 "punpcklbw %%mm7, %%mm2 \n\t"
1761 "punpcklbw %%mm7, %%mm3 \n\t"
1762 "paddw %%mm1, %%mm0 \n\t"
1763 "paddw %%mm3, %%mm2 \n\t"
1764 "paddw %%mm2, %%mm0 \n\t"
1765 "movd 6(%0, %%"REG_d
"), %%mm4 \n\t"
1766 "movd 6(%1, %%"REG_d
"), %%mm1 \n\t"
1767 "movd 9(%0, %%"REG_d
"), %%mm2 \n\t"
1768 "movd 9(%1, %%"REG_d
"), %%mm3 \n\t"
1769 "punpcklbw %%mm7, %%mm4 \n\t"
1770 "punpcklbw %%mm7, %%mm1 \n\t"
1771 "punpcklbw %%mm7, %%mm2 \n\t"
1772 "punpcklbw %%mm7, %%mm3 \n\t"
1773 "paddw %%mm1, %%mm4 \n\t"
1774 "paddw %%mm3, %%mm2 \n\t"
1775 "paddw %%mm4, %%mm2 \n\t"
1776 "psrlw $2, %%mm0 \n\t"
1777 "psrlw $2, %%mm2 \n\t"
1779 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t"
1780 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t"
1782 "pmaddwd %%mm0, %%mm1 \n\t"
1783 "pmaddwd %%mm2, %%mm3 \n\t"
1784 "pmaddwd %%mm6, %%mm0 \n\t"
1785 "pmaddwd %%mm6, %%mm2 \n\t"
1786 "psrad $8, %%mm0 \n\t"
1787 "psrad $8, %%mm1 \n\t"
1788 "psrad $8, %%mm2 \n\t"
1789 "psrad $8, %%mm3 \n\t"
1790 "packssdw %%mm2, %%mm0 \n\t"
1791 "packssdw %%mm3, %%mm1 \n\t"
1792 "pmaddwd %%mm5, %%mm0 \n\t"
1793 "pmaddwd %%mm5, %%mm1 \n\t"
1794 "packssdw %%mm1, %%mm0 \n\t"
1795 "psraw $7, %%mm0 \n\t"
1797 #if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
1798 "movq 12(%0, %%"REG_d
"), %%mm4 \n\t"
1799 "movq 12(%1, %%"REG_d
"), %%mm1 \n\t"
1800 "movq 18(%0, %%"REG_d
"), %%mm2 \n\t"
1801 "movq 18(%1, %%"REG_d
"), %%mm3 \n\t"
1802 PAVGB" %%mm1, %%mm4 \n\t"
1803 PAVGB" %%mm3, %%mm2 \n\t"
1804 "movq %%mm4, %%mm1 \n\t"
1805 "movq %%mm2, %%mm3 \n\t"
1806 "psrlq $24, %%mm4 \n\t"
1807 "psrlq $24, %%mm2 \n\t"
1808 PAVGB" %%mm1, %%mm4 \n\t"
1809 PAVGB" %%mm3, %%mm2 \n\t"
1810 "punpcklbw %%mm7, %%mm4 \n\t"
1811 "punpcklbw %%mm7, %%mm2 \n\t"
1813 "movd 12(%0, %%"REG_d
"), %%mm4 \n\t"
1814 "movd 12(%1, %%"REG_d
"), %%mm1 \n\t"
1815 "movd 15(%0, %%"REG_d
"), %%mm2 \n\t"
1816 "movd 15(%1, %%"REG_d
"), %%mm3 \n\t"
1817 "punpcklbw %%mm7, %%mm4 \n\t"
1818 "punpcklbw %%mm7, %%mm1 \n\t"
1819 "punpcklbw %%mm7, %%mm2 \n\t"
1820 "punpcklbw %%mm7, %%mm3 \n\t"
1821 "paddw %%mm1, %%mm4 \n\t"
1822 "paddw %%mm3, %%mm2 \n\t"
1823 "paddw %%mm2, %%mm4 \n\t"
1824 "movd 18(%0, %%"REG_d
"), %%mm5 \n\t"
1825 "movd 18(%1, %%"REG_d
"), %%mm1 \n\t"
1826 "movd 21(%0, %%"REG_d
"), %%mm2 \n\t"
1827 "movd 21(%1, %%"REG_d
"), %%mm3 \n\t"
1828 "punpcklbw %%mm7, %%mm5 \n\t"
1829 "punpcklbw %%mm7, %%mm1 \n\t"
1830 "punpcklbw %%mm7, %%mm2 \n\t"
1831 "punpcklbw %%mm7, %%mm3 \n\t"
1832 "paddw %%mm1, %%mm5 \n\t"
1833 "paddw %%mm3, %%mm2 \n\t"
1834 "paddw %%mm5, %%mm2 \n\t"
1835 "movq "MANGLE(ff_w1111)
", %%mm5 \n\t"
1836 "psrlw $2, %%mm4 \n\t"
1837 "psrlw $2, %%mm2 \n\t"
1839 "movq "BGR2V_IDX
"(%5), %%mm1 \n\t"
1840 "movq "BGR2V_IDX
"(%5), %%mm3 \n\t"
1842 "pmaddwd %%mm4, %%mm1 \n\t"
1843 "pmaddwd %%mm2, %%mm3 \n\t"
1844 "pmaddwd %%mm6, %%mm4 \n\t"
1845 "pmaddwd %%mm6, %%mm2 \n\t"
1846 "psrad $8, %%mm4 \n\t"
1847 "psrad $8, %%mm1 \n\t"
1848 "psrad $8, %%mm2 \n\t"
1849 "psrad $8, %%mm3 \n\t"
1850 "packssdw %%mm2, %%mm4 \n\t"
1851 "packssdw %%mm3, %%mm1 \n\t"
1852 "pmaddwd %%mm5, %%mm4 \n\t"
1853 "pmaddwd %%mm5, %%mm1 \n\t"
1854 "add $24, %%"REG_d
" \n\t"
1855 "packssdw %%mm1, %%mm4 \n\t"
1856 "psraw $7, %%mm4 \n\t"
1858 "movq %%mm0, %%mm1 \n\t"
1859 "punpckldq %%mm4, %%mm0 \n\t"
1860 "punpckhdq %%mm4, %%mm1 \n\t"
1861 "packsswb %%mm1, %%mm0 \n\t"
1862 "paddb "MANGLE(ff_bgr2UVOffset)
", %%mm0 \n\t"
1863 "movd %%mm0, (%2, %%"REG_a
") \n\t"
1864 "punpckhdq %%mm0, %%mm0 \n\t"
1865 "movd %%mm0, (%3, %%"REG_a
") \n\t"
1866 "add $4, %%"REG_a
" \n\t"
1868 : :
"r" (src+chromWidth*6),
"r" (src+srcStride+chromWidth*6),
"r" (udst+chromWidth),
"r" (vdst+chromWidth),
"g" (-chromWidth),
"r"(
rgb2yuv)
1870 :
"%"REG_a,
"%"REG_d
1873 udst += chromStride;
1874 vdst += chromStride;
1878 __asm__
volatile(
EMMS" \n\t"
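/* Editor's note: both loops above are fixed-point dot products — pmaddwd
 * multiplies each unpacked BGR sample against coefficients loaded from the
 * caller-supplied rgb2yuv table (BGR2Y_IDX/BGR2U_IDX/BGR2V_IDX are byte
 * offsets into it), the psrad/psraw pairs rescale the sums, and the
 * paddusb/paddb steps add the Y and UV offsets. Chroma is produced at half
 * resolution from an average over a 2x2 block of the two input rows. */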
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
                                    int src2Stride, int dstStride)
    for (h=0; h < height; h++) {
#if COMPILE_TEMPLATE_SSE2
        if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
            "xor %%"REG_a", %%"REG_a" \n\t"
            "movdqa  (%1, %%"REG_a"), %%xmm0 \n\t"
            "movdqa  (%1, %%"REG_a"), %%xmm1 \n\t"
            "movdqa  (%2, %%"REG_a"), %%xmm2 \n\t"
            "punpcklbw %%xmm2, %%xmm0 \n\t"
            "punpckhbw %%xmm2, %%xmm1 \n\t"
            "movntdq %%xmm0,   (%0, %%"REG_a", 2) \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp  %3, %%"REG_a" \n\t"
            ::"r"(dest),"r"(src1),"r"(src2),"r" ((x86_reg)width-15)
            : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a
            "xor %%"REG_a", %%"REG_a" \n\t"
            "movq    (%1, %%"REG_a"), %%mm0 \n\t"
            "movq   8(%1, %%"REG_a"), %%mm2 \n\t"
            "movq  %%mm0, %%mm1 \n\t"
            "movq  %%mm2, %%mm3 \n\t"
            "movq    (%2, %%"REG_a"), %%mm4 \n\t"
            "movq   8(%2, %%"REG_a"), %%mm5 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm1 \n\t"
            "punpcklbw %%mm5, %%mm2 \n\t"
            "punpckhbw %%mm5, %%mm3 \n\t"
            MOVNTQ" %%mm0,   (%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm1,  8(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
            "add $16, %%"REG_a" \n\t"
            "cmp  %3, %%"REG_a" \n\t"
            ::"r"(dest),"r"(src1),"r"(src2),"r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
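        /* Editor's note: the &15 test above gates the SSE2 path on 16-byte
         * alignment of both sources and the destination, since movdqa and
         * movntdq fault on unaligned addresses; unaligned inputs fall back
         * to the MMX loop, and the scalar tail finishes the odd width. */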
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
                                      int dst1Stride, int dst2Stride)
    for (h = 0; h < height; h++) {
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
                                 int srcStride1, int srcStride2,
                                 int dstStride1, int dstStride2)
    w=width/2; h=height/2;
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
        const uint8_t* s1=src1+srcStride1*(y>>1);
        for (;x<w-31;x+=32) {
            "movq    (%1,%2), %%mm0 \n\t"
            "movq   8(%1,%2), %%mm2 \n\t"
            "movq  16(%1,%2), %%mm4 \n\t"
            "movq  24(%1,%2), %%mm6 \n\t"
            "movq  %%mm0, %%mm1 \n\t"
            "movq  %%mm2, %%mm3 \n\t"
            "movq  %%mm4, %%mm5 \n\t"
            "movq  %%mm6, %%mm7 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpcklbw %%mm2, %%mm2 \n\t"
            "punpckhbw %%mm3, %%mm3 \n\t"
            "punpcklbw %%mm4, %%mm4 \n\t"
            "punpckhbw %%mm5, %%mm5 \n\t"
            "punpcklbw %%mm6, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm7 \n\t"
            MOVNTQ" %%mm0,   (%0,%2,2) \n\t"
            MOVNTQ" %%mm1,  8(%0,%2,2) \n\t"
            MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
            MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
            MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
            MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
            MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
            MOVNTQ" %%mm7, 56(%0,%2,2)"
            ::"r"(d),"r"(s1),"r"(x)
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
        const uint8_t* s2=src2+srcStride2*(y>>1);
        for (;x<w-31;x+=32) {
            "movq    (%1,%2), %%mm0 \n\t"
            "movq   8(%1,%2), %%mm2 \n\t"
            "movq  16(%1,%2), %%mm4 \n\t"
            "movq  24(%1,%2), %%mm6 \n\t"
            "movq  %%mm0, %%mm1 \n\t"
            "movq  %%mm2, %%mm3 \n\t"
            "movq  %%mm4, %%mm5 \n\t"
            "movq  %%mm6, %%mm7 \n\t"
            "punpcklbw %%mm0, %%mm0 \n\t"
            "punpckhbw %%mm1, %%mm1 \n\t"
            "punpcklbw %%mm2, %%mm2 \n\t"
            "punpckhbw %%mm3, %%mm3 \n\t"
            "punpcklbw %%mm4, %%mm4 \n\t"
            "punpckhbw %%mm5, %%mm5 \n\t"
            "punpcklbw %%mm6, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm7 \n\t"
            MOVNTQ" %%mm0,   (%0,%2,2) \n\t"
            MOVNTQ" %%mm1,  8(%0,%2,2) \n\t"
            MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
            MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
            MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
            MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
            MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
            MOVNTQ" %%mm7, 56(%0,%2,2)"
            ::"r"(d),"r"(s2),"r"(x)
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
                                 int srcStride1, int srcStride2,
                                 int srcStride3, int dstStride)
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
            "movq   (%1, %0, 4), %%mm0 \n\t"
            "movq      (%2, %0), %%mm1 \n\t"
            "movq      (%3, %0), %%mm2 \n\t"
            "movq  %%mm0, %%mm3 \n\t"
            "movq  %%mm1, %%mm4 \n\t"
            "movq  %%mm2, %%mm5 \n\t"
            "punpcklbw %%mm1, %%mm1 \n\t"
            "punpcklbw %%mm2, %%mm2 \n\t"
            "punpckhbw %%mm4, %%mm4 \n\t"
            "punpckhbw %%mm5, %%mm5 \n\t"

            "movq  %%mm1, %%mm6 \n\t"
            "punpcklbw %%mm2, %%mm1 \n\t"
            "punpcklbw %%mm1, %%mm0 \n\t"
            "punpckhbw %%mm1, %%mm3 \n\t"
            MOVNTQ" %%mm0,  (%4, %0, 8) \n\t"
            MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

            "punpckhbw %%mm2, %%mm6 \n\t"
            "movq  8(%1, %0, 4), %%mm0 \n\t"
            "movq  %%mm0, %%mm3 \n\t"
            "punpcklbw %%mm6, %%mm0 \n\t"
            "punpckhbw %%mm6, %%mm3 \n\t"
            MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
            MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

            "movq  %%mm4, %%mm6 \n\t"
            "movq  16(%1, %0, 4), %%mm0 \n\t"
            "movq  %%mm0, %%mm3 \n\t"
            "punpcklbw %%mm5, %%mm4 \n\t"
            "punpcklbw %%mm4, %%mm0 \n\t"
            "punpckhbw %%mm4, %%mm3 \n\t"
            MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
            MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

            "punpckhbw %%mm5, %%mm6 \n\t"
            "movq  24(%1, %0, 4), %%mm0 \n\t"
            "movq  %%mm0, %%mm3 \n\t"
            "punpcklbw %%mm6, %%mm0 \n\t"
            "punpckhbw %%mm6, %%mm3 \n\t"
            MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
            MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
            : "r"(yp), "r"(up), "r"(vp), "r"(d)
            const int x2 = x<<2;
            d[8*x+2] = yp[x2+1];
            d[8*x+4] = yp[x2+2];
            d[8*x+6] = yp[x2+3];
2182 "pcmpeqw %%mm7, %%mm7 \n\t"
2183 "psrlw $8, %%mm7 \n\t"
2185 "movq -30(%1, %0, 2), %%mm0 \n\t"
2186 "movq -22(%1, %0, 2), %%mm1 \n\t"
2187 "movq -14(%1, %0, 2), %%mm2 \n\t"
2188 "movq -6(%1, %0, 2), %%mm3 \n\t"
2189 "pand %%mm7, %%mm0 \n\t"
2190 "pand %%mm7, %%mm1 \n\t"
2191 "pand %%mm7, %%mm2 \n\t"
2192 "pand %%mm7, %%mm3 \n\t"
2193 "packuswb %%mm1, %%mm0 \n\t"
2194 "packuswb %%mm3, %%mm2 \n\t"
2195 MOVNTQ" %%mm0,-15(%2, %0) \n\t"
2196 MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
2200 :
"r"(src),
"r"(dst)
2220 "pcmpeqw %%mm7, %%mm7 \n\t"
2221 "psrlw $8, %%mm7 \n\t"
2223 "movq -32(%1, %0, 2), %%mm0 \n\t"
2224 "movq -24(%1, %0, 2), %%mm1 \n\t"
2225 "movq -16(%1, %0, 2), %%mm2 \n\t"
2226 "movq -8(%1, %0, 2), %%mm3 \n\t"
2227 "pand %%mm7, %%mm0 \n\t"
2228 "pand %%mm7, %%mm1 \n\t"
2229 "pand %%mm7, %%mm2 \n\t"
2230 "pand %%mm7, %%mm3 \n\t"
2231 "packuswb %%mm1, %%mm0 \n\t"
2232 "packuswb %%mm3, %%mm2 \n\t"
2233 MOVNTQ" %%mm0,-16(%2, %0) \n\t"
2234 MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
2238 :
"r"(src),
"r"(dst)
#if !COMPILE_TEMPLATE_AMD3DNOW
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw      $8, %%mm7 \n\t"
        "movq -28(%1, %0, 4), %%mm0 \n\t"
        "movq -20(%1, %0, 4), %%mm1 \n\t"
        "movq -12(%1, %0, 4), %%mm2 \n\t"
        "movq  -4(%1, %0, 4), %%mm3 \n\t"
        "pand  %%mm7, %%mm0 \n\t"
        "pand  %%mm7, %%mm1 \n\t"
        "pand  %%mm7, %%mm2 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq  %%mm0, %%mm1 \n\t"
        "movq  %%mm2, %%mm3 \n\t"
        "psrlw    $8, %%mm0 \n\t"
        "psrlw    $8, %%mm2 \n\t"
        "pand  %%mm7, %%mm1 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "packuswb %%mm2, %%mm0 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
        MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
        : "r"(src), "r"(dst0), "r"(dst1)
2307 "pcmpeqw %%mm7, %%mm7 \n\t"
2308 "psrlw $8, %%mm7 \n\t"
2310 "movq -28(%1, %0, 4), %%mm0 \n\t"
2311 "movq -20(%1, %0, 4), %%mm1 \n\t"
2312 "movq -12(%1, %0, 4), %%mm2 \n\t"
2313 "movq -4(%1, %0, 4), %%mm3 \n\t"
2314 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2315 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2316 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2317 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2318 "pand %%mm7, %%mm0 \n\t"
2319 "pand %%mm7, %%mm1 \n\t"
2320 "pand %%mm7, %%mm2 \n\t"
2321 "pand %%mm7, %%mm3 \n\t"
2322 "packuswb %%mm1, %%mm0 \n\t"
2323 "packuswb %%mm3, %%mm2 \n\t"
2324 "movq %%mm0, %%mm1 \n\t"
2325 "movq %%mm2, %%mm3 \n\t"
2326 "psrlw $8, %%mm0 \n\t"
2327 "psrlw $8, %%mm2 \n\t"
2328 "pand %%mm7, %%mm1 \n\t"
2329 "pand %%mm7, %%mm3 \n\t"
2330 "packuswb %%mm2, %%mm0 \n\t"
2331 "packuswb %%mm3, %%mm1 \n\t"
2332 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2333 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2337 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
#if !COMPILE_TEMPLATE_AMD3DNOW
        "pcmpeqw %%mm7, %%mm7 \n\t"
        "psrlw      $8, %%mm7 \n\t"
        "movq -28(%1, %0, 4), %%mm0 \n\t"
        "movq -20(%1, %0, 4), %%mm1 \n\t"
        "movq -12(%1, %0, 4), %%mm2 \n\t"
        "movq  -4(%1, %0, 4), %%mm3 \n\t"
        "psrlw    $8, %%mm0 \n\t"
        "psrlw    $8, %%mm1 \n\t"
        "psrlw    $8, %%mm2 \n\t"
        "psrlw    $8, %%mm3 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq  %%mm0, %%mm1 \n\t"
        "movq  %%mm2, %%mm3 \n\t"
        "psrlw    $8, %%mm0 \n\t"
        "psrlw    $8, %%mm2 \n\t"
        "pand  %%mm7, %%mm1 \n\t"
        "pand  %%mm7, %%mm3 \n\t"
        "packuswb %%mm2, %%mm0 \n\t"
        "packuswb %%mm3, %%mm1 \n\t"
        MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
        MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
        : "r"(src), "r"(dst0), "r"(dst1)
2409 "pcmpeqw %%mm7, %%mm7 \n\t"
2410 "psrlw $8, %%mm7 \n\t"
2412 "movq -28(%1, %0, 4), %%mm0 \n\t"
2413 "movq -20(%1, %0, 4), %%mm1 \n\t"
2414 "movq -12(%1, %0, 4), %%mm2 \n\t"
2415 "movq -4(%1, %0, 4), %%mm3 \n\t"
2416 PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
2417 PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
2418 PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
2419 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
2420 "psrlw $8, %%mm0 \n\t"
2421 "psrlw $8, %%mm1 \n\t"
2422 "psrlw $8, %%mm2 \n\t"
2423 "psrlw $8, %%mm3 \n\t"
2424 "packuswb %%mm1, %%mm0 \n\t"
2425 "packuswb %%mm3, %%mm2 \n\t"
2426 "movq %%mm0, %%mm1 \n\t"
2427 "movq %%mm2, %%mm3 \n\t"
2428 "psrlw $8, %%mm0 \n\t"
2429 "psrlw $8, %%mm2 \n\t"
2430 "pand %%mm7, %%mm1 \n\t"
2431 "pand %%mm7, %%mm3 \n\t"
2432 "packuswb %%mm2, %%mm0 \n\t"
2433 "packuswb %%mm3, %%mm1 \n\t"
2434 MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
2435 MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
2439 :
"r"(src0),
"r"(src1),
"r"(dst0),
"r"(dst1)
                                 int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW
                                 int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
                                 int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_AMD3DNOW
                                 int lumStride, int chromStride, int srcStride)
    for (y=0; y<height; y++) {
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM