/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/videodsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "diracdsp_mmx.h"

#if HAVE_INLINE_ASM

void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p   = block;
    uint8_t       *pix = pixels;

    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
           "r"(p)
        : "memory");
    pix += line_size * 4;
    p   += 32;

    // If this were an exact copy of the code above, the compiler
    // would generate some very strange code, so keep it a separate block.
    __asm__ volatile (
        "movq      (%3), %%mm0          \n\t"
        "movq     8(%3), %%mm1          \n\t"
        "movq    16(%3), %%mm2          \n\t"
        "movq    24(%3), %%mm3          \n\t"
        "movq    32(%3), %%mm4          \n\t"
        "movq    40(%3), %%mm5          \n\t"
        "movq    48(%3), %%mm6          \n\t"
        "movq    56(%3), %%mm7          \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "packuswb %%mm3, %%mm2          \n\t"
        "packuswb %%mm5, %%mm4          \n\t"
        "packuswb %%mm7, %%mm6          \n\t"
        "movq     %%mm0, (%0)           \n\t"
        "movq     %%mm2, (%0, %1)       \n\t"
        "movq     %%mm4, (%0, %1, 2)    \n\t"
        "movq     %%mm6, (%0, %2)       \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
        : "memory");
}
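
/*
 * Illustrative only: a minimal scalar sketch of what the routine above
 * computes. packuswb performs unsigned saturation, so each 16-bit
 * coefficient is clipped to 0..255 before being stored. The helper name is
 * hypothetical and not part of this file's build; av_clip_uint8() is from
 * libavutil/common.h.
 */
#if 0
static void put_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = av_clip_uint8(block[i * 8 + j]);
        pixels += line_size;
    }
}
#endif
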
#define put_signed_pixels_clamped_mmx_half(off)             \
    "movq          "#off"(%2), %%mm1        \n\t"           \
    "movq     16 + "#off"(%2), %%mm2        \n\t"           \
    "movq     32 + "#off"(%2), %%mm3        \n\t"           \
    "movq     48 + "#off"(%2), %%mm4        \n\t"           \
    "packsswb  8 + "#off"(%2), %%mm1        \n\t"           \
    "packsswb 24 + "#off"(%2), %%mm2        \n\t"           \
    "packsswb 40 + "#off"(%2), %%mm3        \n\t"           \
    "packsswb 56 + "#off"(%2), %%mm4        \n\t"           \
    "paddb              %%mm0, %%mm1        \n\t"           \
    "paddb              %%mm0, %%mm2        \n\t"           \
    "paddb              %%mm0, %%mm3        \n\t"           \
    "paddb              %%mm0, %%mm4        \n\t"           \
    "movq               %%mm1, (%0)         \n\t"           \
    "movq               %%mm2, (%0, %3)     \n\t"           \
    "movq               %%mm3, (%0, %3, 2)  \n\t"           \
    "movq               %%mm4, (%0, %1)     \n\t"

void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                                      int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile (
        "movq "MANGLE(ff_pb_80)", %%mm0     \n\t"
        "lea         (%3, %3, 2), %1        \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea         (%0, %3, 4), %0        \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
        : "memory");
}
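
/*
 * Illustrative only: the code above relies on an identity rather than an
 * unsigned clip. packsswb clips each coefficient to -128..127 and the paddb
 * with 0x80 bytes then flips the sign bit, which together equal clipping
 * (coefficient + 128) to 0..255. A scalar sketch, with a hypothetical helper
 * name and av_clip_uint8() from libavutil/common.h:
 */
#if 0
static void put_signed_pixels_clamped_ref(const int16_t *block,
                                          uint8_t *pixels, int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = av_clip_uint8(block[i * 8 + j] + 128);
        pixels += line_size;
    }
}
#endif
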
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
                               int line_size)
{
    const int16_t *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p   = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile (
            "movq        (%2), %%mm0    \n\t"
            "movq       8(%2), %%mm1    \n\t"
            "movq      16(%2), %%mm2    \n\t"
            "movq      24(%2), %%mm3    \n\t"
            "movq          %0, %%mm4    \n\t"
            "movq          %1, %%mm6    \n\t"
            "movq       %%mm4, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm4    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm4, %%mm0    \n\t"
            "paddsw     %%mm5, %%mm1    \n\t"
            "movq       %%mm6, %%mm5    \n\t"
            "punpcklbw  %%mm7, %%mm6    \n\t"
            "punpckhbw  %%mm7, %%mm5    \n\t"
            "paddsw     %%mm6, %%mm2    \n\t"
            "paddsw     %%mm5, %%mm3    \n\t"
            "packuswb   %%mm1, %%mm0    \n\t"
            "packuswb   %%mm3, %%mm2    \n\t"
            "movq       %%mm0, %0       \n\t"
            "movq       %%mm2, %1       \n\t"
            : "+m"(*pix), "+m"(*(pix + line_size))
            : "r"(p)
            : "memory");
        pix += line_size * 2;
        p   += 16;
    } while (--i);
}
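
/*
 * Illustrative only: scalar equivalent of the loop above. paddsw adds with
 * signed 16-bit saturation before packuswb clips to 0..255, so the effect is
 * the following (hypothetical helper name, av_clip_uint8() from
 * libavutil/common.h):
 */
#if 0
static void add_pixels_clamped_ref(const int16_t *block, uint8_t *pixels,
                                   int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++)
            pixels[j] = av_clip_uint8(pixels[j] + block[i * 8 + j]);
        pixels += line_size;
    }
}
#endif
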
#define CLEAR_BLOCKS(name, n)                           \
void name(int16_t *blocks)                              \
{                                                       \
    __asm__ volatile (                                  \
        "pxor %%mm7, %%mm7              \n\t"           \
        "mov     %1,        %%"REG_a"   \n\t"           \
        "1:                             \n\t"           \
        "movq %%mm7,   (%0, %%"REG_a")  \n\t"           \
        "movq %%mm7,  8(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"           \
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"           \
        "add    $32, %%"REG_a"          \n\t"           \
        "js      1b                     \n\t"           \
        :: "r"(((uint8_t *)blocks) + 128 * n),          \
           "i"(-128 * n)                                \
        : "%"REG_a);                                    \
}
CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
CLEAR_BLOCKS(ff_clear_block_mmx, 1)

void ff_clear_block_sse(int16_t *block)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0          \n"
        "movaps %%xmm0,    (%0)         \n"
        "movaps %%xmm0,  16(%0)         \n"
        "movaps %%xmm0,  32(%0)         \n"
        "movaps %%xmm0,  48(%0)         \n"
        "movaps %%xmm0,  64(%0)         \n"
        "movaps %%xmm0,  80(%0)         \n"
        "movaps %%xmm0,  96(%0)         \n"
        "movaps %%xmm0, 112(%0)         \n"
        :: "r"(block)
        : "memory");
}

void ff_clear_blocks_sse(int16_t *blocks)
{
    __asm__ volatile (
        "xorps  %%xmm0, %%xmm0              \n"
        "mov        %1,         %%"REG_a"   \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add      $128,         %%"REG_a"   \n"
        "js         1b                      \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
           "i"(-128 * 6)
        : "%"REG_a);
}
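
/*
 * Illustrative only: each DCT block is 64 int16_t coefficients (128 bytes),
 * which is why the loops above start at -128 * n and count up to zero. A
 * plain C sketch of the same operation (hypothetical helper name):
 */
#if 0
static void clear_blocks_ref(int16_t *blocks)
{
    int i;

    for (i = 0; i < 6 * 64; i++)  /* ff_clear_block_* clears a single 64-coefficient block */
        blocks[i] = 0;
}
#endif
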
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
    x86_reg i = 0;

    __asm__ volatile (
        "jmp          2f                \n\t"
        "1:                             \n\t"
        "movq   (%1, %0), %%mm0         \n\t"
        "movq   (%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, (%2, %0)      \n\t"
        "movq  8(%1, %0), %%mm0         \n\t"
        "movq  8(%2, %0), %%mm1         \n\t"
        "paddb     %%mm0, %%mm1         \n\t"
        "movq      %%mm1, 8(%2, %0)     \n\t"
        "add         $16, %0            \n\t"
        "2:                             \n\t"
        "cmp          %3, %0            \n\t"
        "js           1b                \n\t"
        : "+r"(i)
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15));

    for ( ; i < w; i++)
        dst[i + 0] += src[i + 0];
}

/* Draw the edges of width 'w' of an image of size width, height.
 * This MMX version can only handle w == 8 || w == 16. */
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                       int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if (w == 8) {
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "punpckldq      %%mm0, %%mm0    \n\t"
            "movq           %%mm0, -8(%0)   \n\t"
            "movq      -8(%0, %2), %%mm1    \n\t"
            "punpckhbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movq           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else if (w == 16) {
        __asm__ volatile (
            "1:                                 \n\t"
            "movd            (%0), %%mm0        \n\t"
            "punpcklbw      %%mm0, %%mm0        \n\t"
            "punpcklwd      %%mm0, %%mm0        \n\t"
            "punpckldq      %%mm0, %%mm0        \n\t"
            "movq           %%mm0, -8(%0)       \n\t"
            "movq           %%mm0, -16(%0)      \n\t"
            "movq      -8(%0, %2), %%mm1        \n\t"
            "punpckhbw      %%mm1, %%mm1        \n\t"
            "punpckhwd      %%mm1, %%mm1        \n\t"
            "punpckhdq      %%mm1, %%mm1        \n\t"
            "movq           %%mm1,  (%0, %2)    \n\t"
            "movq           %%mm1, 8(%0, %2)    \n\t"
            "add               %1, %0           \n\t"
            "cmp               %3, %0           \n\t"
            "jb                1b               \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    } else {
        av_assert1(w == 4);
        __asm__ volatile (
            "1:                             \n\t"
            "movd            (%0), %%mm0    \n\t"
            "punpcklbw      %%mm0, %%mm0    \n\t"
            "punpcklwd      %%mm0, %%mm0    \n\t"
            "movd           %%mm0, -4(%0)   \n\t"
            "movd      -4(%0, %2), %%mm1    \n\t"
            "punpcklbw      %%mm1, %%mm1    \n\t"
            "punpckhwd      %%mm1, %%mm1    \n\t"
            "punpckhdq      %%mm1, %%mm1    \n\t"
            "movd           %%mm1, (%0, %2) \n\t"
            "add               %1, %0       \n\t"
            "cmp               %3, %0       \n\t"
            "jb                1b           \n\t"
            : "+r"(ptr)
            : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
            );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            ptr = buf - (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
                );
        }
    }

    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            ptr = last_line + (i + 1) * wrap - w;
            __asm__ volatile (
                "1:                             \n\t"
                "movq (%1, %0), %%mm0           \n\t"
                "movq    %%mm0, (%0)            \n\t"
                "movq    %%mm0, (%0, %2)        \n\t"
                "movq    %%mm0, (%0, %2, 2)     \n\t"
                "movq    %%mm0, (%0, %3)        \n\t"
                "add        $8, %0              \n\t"
                "cmp        %4, %0              \n\t"
                "jb         1b                  \n\t"
                : "+r"(ptr)
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
                );
        }
    }
}
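
/*
 * Illustrative only: a scalar sketch of the border replication done above,
 * assuming <string.h>. Left/right columns are filled with the outermost
 * pixel of each row, then the (already extended) first and last rows are
 * copied upwards/downwards, which also fills the corners. The helper name is
 * hypothetical and not part of this file's build.
 */
#if 0
static void draw_edges_ref(uint8_t *buf, int wrap, int width, int height,
                           int w, int h, int sides)
{
    int y;

    for (y = 0; y < height; y++) {
        memset(buf + y * wrap - w,     buf[y * wrap],             w);
        memset(buf + y * wrap + width, buf[y * wrap + width - 1], w);
    }
    if (sides & EDGE_TOP)
        for (y = 1; y <= h; y++)
            memcpy(buf - y * wrap - w, buf - w, width + 2 * w);
    if (sides & EDGE_BOTTOM)
        for (y = 1; y <= h; y++)
            memcpy(buf + (height - 1 + y) * wrap - w,
                   buf + (height - 1) * wrap - w, width + 2 * w);
}
#endif
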
typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
                                   ptrdiff_t dst_stride,
                                   ptrdiff_t src_linesize,
                                   int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);

static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox  >> (16 + shift);
    const int iy   = oy  >> (16 + shift);
    const int oxs  = ox  >> 4;
    const int oys  = oy  >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    const uint16_t r4[4]   = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2  = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H      8U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
    int x, y;

    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    int need_emu  = (unsigned)ix >= width  - w ||
                    (unsigned)iy >= height - h;

    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
        return;
    }

    src += ix + iy * stride;
    if (need_emu) {
        emu_edge_fn(edge_buf, src, stride, stride, w + 1, h + 1, ix, iy, width, height);
        src = edge_buf;
    }

    __asm__ volatile (
        "movd         %0, %%mm6         \n\t"
        "pxor      %%mm7, %%mm7         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        "punpcklwd %%mm6, %%mm6         \n\t"
        :: "r"(1 << shift));

    for (x = 0; x < w; x += 4) {
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            __asm__ volatile (
                "movq   %0, %%mm4       \n\t"
                "movq   %1, %%mm5       \n\t"
                "paddw  %2, %%mm4       \n\t"
                "paddw  %3, %%mm5       \n\t"
                "movq   %%mm4, %0       \n\t"
                "movq   %%mm5, %1       \n\t"
                "psrlw  $12, %%mm4      \n\t"
                "psrlw  $12, %%mm5      \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4));

            __asm__ volatile (
                "movq      %%mm6, %%mm2 \n\t"
                "movq      %%mm6, %%mm1 \n\t"
                "psubw     %%mm4, %%mm2 \n\t"
                "psubw     %%mm5, %%mm1 \n\t"
                "movq      %%mm2, %%mm0 \n\t"
                "movq      %%mm4, %%mm3 \n\t"
                "pmullw    %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw    %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw    %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw    %%mm4, %%mm1 \n\t" // dx * (s - dy)

                "movd         %4, %%mm5 \n\t"
                "movd         %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw    %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

                "movd         %2, %%mm5 \n\t"
                "movd         %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw    %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw    %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                "paddw        %5, %%mm1 \n\t"
                "paddw     %%mm3, %%mm2 \n\t"
                "paddw     %%mm1, %%mm0 \n\t"
                "paddw     %%mm2, %%mm0 \n\t"

                "psrlw        %6, %%mm0 \n\t"
                "packuswb  %%mm0, %%mm0 \n\t"
                "movd      %%mm0, %0    \n\t"

                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2));
            src += stride;
        }
        src += 4 - h * stride;
    }
}
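
/*
 * For reference (informal sketch): each output pixel produced by the loop
 * above follows the same bilinear expression as the ff_gmc_c() fallback,
 * with s = 1 << shift and dx, dy the per-pixel fractional offsets kept in
 * mm4/mm5:
 *
 *   dst[x] = (src[0]          * (s - dx) * (s - dy) +
 *             src[1]          *  dx      * (s - dy) +
 *             src[stride]     * (s - dx) *  dy      +
 *             src[stride + 1] *  dx      *  dy      + r) >> (2 * shift)
 *
 * with the final packuswb clamping the result to 0..255.
 */
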
#if HAVE_YASM
#if ARCH_X86_32
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif

void ff_gmc_sse(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#else
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif

#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h & 3)\
        ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
    else\
        OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h & 3)\
        ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
    else\
        OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    if (h & 3)\
        ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
    else {\
        OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
        OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
    }\
}

PIXELS16(static, ff_avg, , , _mmxext)
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3)
        ff_put_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_put_pixels16_sse2(dst, src[0], stride, h);
}

void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3)
        ff_avg_dirac_pixels16_c(dst, src, stride, h);
    else
        ff_avg_pixels16_sse2(dst, src[0], stride, h);
}

void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3)
        ff_put_dirac_pixels32_c(dst, src, stride, h);
    else {
        ff_put_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}

void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    if (h & 3)
        ff_avg_dirac_pixels32_c(dst, src, stride, h);
    else {
        ff_avg_pixels16_sse2(dst   , src[0]   , stride, h);
        ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
    }
}
#endif

void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len)
{
    x86_reg i = (len - 16) * 4;

    __asm__ volatile (
        "movss          %3, %%xmm4      \n\t"
        "movss          %4, %%xmm5      \n\t"
        "shufps $0, %%xmm4, %%xmm4      \n\t"
        "shufps $0, %%xmm5, %%xmm5      \n\t"
        "1:                             \n\t"
        "movaps    (%2, %0), %%xmm0     \n\t" // 3/1 on intel
        "movaps  16(%2, %0), %%xmm1     \n\t"
        "movaps  32(%2, %0), %%xmm2     \n\t"
        "movaps  48(%2, %0), %%xmm3     \n\t"
        "maxps      %%xmm4, %%xmm0      \n\t"
        "maxps      %%xmm4, %%xmm1      \n\t"
        "maxps      %%xmm4, %%xmm2      \n\t"
        "maxps      %%xmm4, %%xmm3      \n\t"
        "minps      %%xmm5, %%xmm0      \n\t"
        "minps      %%xmm5, %%xmm1      \n\t"
        "minps      %%xmm5, %%xmm2      \n\t"
        "minps      %%xmm5, %%xmm3      \n\t"
        "movaps     %%xmm0,   (%1, %0)  \n\t"
        "movaps     %%xmm1, 16(%1, %0)  \n\t"
        "movaps     %%xmm2, 32(%1, %0)  \n\t"
        "movaps     %%xmm3, 48(%1, %0)  \n\t"
        "sub           $64, %0          \n\t"
        "jge            1b              \n\t"
        : "+&r"(i)
        : "r"(dst), "r"(src), "m"(min), "m"(max)
        : "memory");
}
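
/*
 * Illustrative only: element-wise, the SSE loop above (which walks the
 * arrays backwards, 16 floats per iteration, and assumes 16-byte alignment
 * and a multiple-of-16 length) computes the following. The helper name is
 * hypothetical and not part of this file's build.
 */
#if 0
static void vector_clipf_ref(float *dst, const float *src,
                             float min, float max, int len)
{
    int i;

    for (i = 0; i < len; i++)
        dst[i] = src[i] < min ? min : (src[i] > max ? max : src[i]);
}
#endif
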
#endif /* HAVE_INLINE_ASM */