2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
26 #include "libavutil/avassert.h"
27 #include "libavutil/cpu.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavcodec/videodsp.h"
30 #include "constants.h"
31 #include "dsputil_x86.h"
32 #include "diracdsp_mmx.h"
/* Store an 8x8 block of 16-bit coefficients as 8-bit pixels with unsigned
 * saturation: packuswb clamps each word to 0..255 while narrowing.
 * Each asm statement below handles four rows.
 * Asm operands: %0 = pix, %1 = line_size, %2 = 3 * line_size, %3 = source. */
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
        /* load four rows of eight 16-bit coefficients each */
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        /* narrow word pairs to bytes, clamping to [0, 255] */
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        /* store rows 0..3 via line_size-scaled addressing */
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
    // Deliberate duplication: if an exact copy of the code above were
    // placed here, the compiler would generate some very strange code.
        /* same pattern for rows 4..7 */
        "movq (%3), %%mm0 \n\t"
        "movq 8(%3), %%mm1 \n\t"
        "movq 16(%3), %%mm2 \n\t"
        "movq 24(%3), %%mm3 \n\t"
        "movq 32(%3), %%mm4 \n\t"
        "movq 40(%3), %%mm5 \n\t"
        "movq 48(%3), %%mm6 \n\t"
        "movq 56(%3), %%mm7 \n\t"
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "packuswb %%mm5, %%mm4 \n\t"
        "packuswb %%mm7, %%mm6 \n\t"
        "movq %%mm0, (%0) \n\t"
        "movq %%mm2, (%0, %1) \n\t"
        "movq %%mm4, (%0, %1, 2) \n\t"
        "movq %%mm6, (%0, %2) \n\t"
        :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Pack one 8x4 half-block of 16-bit coefficients to bytes with SIGNED
 * saturation (packsswb), then paddb the byte bias held in %%mm0
 * (loaded from ff_pb_80 by the caller) to map -128..127 onto 0..255.
 * Asm operands: %0 = pixels, %1 = 3 * line_skip, %2 = block, %3 = line_skip.
 * No interior comments: every line must keep its trailing continuation. */
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq "#off"(%2), %%mm1 \n\t" \
    "movq 16 + "#off"(%2), %%mm2 \n\t" \
    "movq 32 + "#off"(%2), %%mm3 \n\t" \
    "movq 48 + "#off"(%2), %%mm4 \n\t" \
    "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
    "paddb %%mm0, %%mm1 \n\t" \
    "paddb %%mm0, %%mm2 \n\t" \
    "paddb %%mm0, %%mm3 \n\t" \
    "paddb %%mm0, %%mm4 \n\t" \
    "movq %%mm1, (%0) \n\t" \
    "movq %%mm2, (%0, %3) \n\t" \
    "movq %%mm3, (%0, %3, 2) \n\t" \
    "movq %%mm4, (%0, %1) \n\t"
/* Store an 8x8 block of signed 16-bit coefficients as 8-bit pixels:
 * packsswb clamps to -128..127, then a 0x80 byte bias (ff_pb_80 in mm0)
 * shifts the range to 0..255.  Each macro invocation writes four rows;
 * the lea computes line_skip * 3 into %1, the second lea (not visible
 * in this view) advances the pixel pointer by four rows. */
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
    x86_reg line_skip = line_size;  /* stride widened to register width */
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1 \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0 \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        : "+&r"(pixels), "=&r"(line_skip3)
        : "r"(block), "r"(line_skip)
/* Add 16-bit coefficients to existing 8-bit pixels with clamping:
 * two pixel rows per pass are zero-extended to words (mm7 is assumed
 * zero -- cleared in code not visible in this view), added with signed
 * saturation (paddsw), then re-packed with unsigned saturation.
 * Asm operands: %0/%1 = the two pixel rows (read-write memory),
 * %2 = coefficient pointer. */
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
        /* read four coefficient quads (two rows of 8 words) */
        "movq (%2), %%mm0 \n\t"
        "movq 8(%2), %%mm1 \n\t"
        "movq 16(%2), %%mm2 \n\t"
        "movq 24(%2), %%mm3 \n\t"
        /* read the two destination pixel rows */
        "movq %0, %%mm4 \n\t"
        "movq %1, %%mm6 \n\t"
        /* widen row 0 bytes to words and add with signed saturation */
        "movq %%mm4, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm4 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm4, %%mm0 \n\t"
        "paddsw %%mm5, %%mm1 \n\t"
        /* same for row 1 */
        "movq %%mm6, %%mm5 \n\t"
        "punpcklbw %%mm7, %%mm6 \n\t"
        "punpckhbw %%mm7, %%mm5 \n\t"
        "paddsw %%mm6, %%mm2 \n\t"
        "paddsw %%mm5, %%mm3 \n\t"
        /* clamp to 0..255 while narrowing back to bytes, then store */
        "packuswb %%mm1, %%mm0 \n\t"
        "packuswb %%mm3, %%mm2 \n\t"
        "movq %%mm0, %0 \n\t"
        "movq %%mm2, %1 \n\t"
        : "+m"(*pix), "+m"(*(pix + line_size))
    pix += line_size * 2;  /* advance to the next pair of rows */
/* Define a function that zeroes n consecutive 64-coefficient (128-byte)
 * int16_t blocks, 32 bytes per iteration.  REG_a counts upward from
 * -128*n to 0 while %0 points one-past-the-end, so the loop needs no
 * separate compare.  No interior comments: the macro lines must keep
 * their trailing continuations intact. */
#define CLEAR_BLOCKS(name, n) \
void name(int16_t *blocks) \
        "pxor %%mm7, %%mm7 \n\t" \
        "mov %1, %%"REG_a" \n\t" \
        "movq %%mm7, (%0, %%"REG_a") \n\t" \
        "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
        "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
        "add $32, %%"REG_a" \n\t" \
        :: "r"(((uint8_t *)blocks) + 128 * n), \
CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
CLEAR_BLOCKS(ff_clear_block_mmx, 1)
/* Zero one 64-coefficient block (128 bytes) with eight aligned 16-byte
 * SSE stores; movaps requires the block to be 16-byte aligned. */
void ff_clear_block_sse(int16_t *block)
        "xorps %%xmm0, %%xmm0 \n"
        "movaps %%xmm0, (%0) \n"
        "movaps %%xmm0, 16(%0) \n"
        "movaps %%xmm0, 32(%0) \n"
        "movaps %%xmm0, 48(%0) \n"
        "movaps %%xmm0, 64(%0) \n"
        "movaps %%xmm0, 80(%0) \n"
        "movaps %%xmm0, 96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 128-byte blocks (768 bytes total), one block per
 * iteration: REG_a runs from -128*6 up to 0 against a one-past-the-end
 * base pointer.  Requires 16-byte alignment (movaps). */
void ff_clear_blocks_sse(int16_t *blocks)
        "xorps %%xmm0, %%xmm0 \n"
        "mov %1, %%"REG_a" \n"
        /* eight 16-byte stores clear one full block */
        "movaps %%xmm0, (%0, %%"REG_a") \n"
        "movaps %%xmm0, 16(%0, %%"REG_a") \n"
        "movaps %%xmm0, 32(%0, %%"REG_a") \n"
        "movaps %%xmm0, 48(%0, %%"REG_a") \n"
        "movaps %%xmm0, 64(%0, %%"REG_a") \n"
        "movaps %%xmm0, 80(%0, %%"REG_a") \n"
        "movaps %%xmm0, 96(%0, %%"REG_a") \n"
        "movaps %%xmm0, 112(%0, %%"REG_a") \n"
        "add $128, %%"REG_a" \n"
        :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] for w bytes using modular (wrapping) byte adds:
 * the MMX loop handles 16 bytes per iteration (%1 = src, %2 = dst,
 * %0 = index), the scalar tail below covers the remainder. */
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
        /* first 8 bytes: paddb wraps, it does not saturate */
        "movq (%1, %0), %%mm0 \n\t"
        "movq (%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%2, %0) \n\t"
        /* second 8 bytes */
        "movq 8(%1, %0), %%mm0 \n\t"
        "movq 8(%2, %0), %%mm1 \n\t"
        "paddb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%2, %0) \n\t"
        : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
        /* scalar tail for the last (w % 16) bytes */
        dst[i + 0] += src[i + 0];
/* Draw the edges of width 'w' around an image of size width x height.
 * This MMX version can only handle w == 8 or w == 16 (a 4-byte movd
 * variant also appears below -- presumably a w == 4 branch whose guard
 * is not visible in this view). */
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
                       int w, int h, int sides)
    uint8_t *ptr, *last_line;
    last_line = buf + (height - 1) * wrap;  /* first byte of the bottom row */
        /* left/right edges, 8-byte variant: broadcast the first byte of
         * the row to the 8 bytes left of it, and the last byte to the
         * 8 bytes right of it */
        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "punpckldq %%mm0, %%mm0 \n\t"
        "movq %%mm0, -8(%0) \n\t"
        "movq -8(%0, %2), %%mm1 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movq %%mm1, (%0, %2) \n\t"
        : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
        /* 16-byte variant: same broadcast, stored twice on each side */
        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "punpckldq %%mm0, %%mm0 \n\t"
        "movq %%mm0, -8(%0) \n\t"
        "movq %%mm0, -16(%0) \n\t"
        "movq -8(%0, %2), %%mm1 \n\t"
        "punpckhbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movq %%mm1, (%0, %2) \n\t"
        "movq %%mm1, 8(%0, %2) \n\t"
        : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
        /* 4-byte variant (movd stores only 4 bytes per side) */
        "movd (%0), %%mm0 \n\t"
        "punpcklbw %%mm0, %%mm0 \n\t"
        "punpcklwd %%mm0, %%mm0 \n\t"
        "movd %%mm0, -4(%0) \n\t"
        "movd -4(%0, %2), %%mm1 \n\t"
        "punpcklbw %%mm1, %%mm1 \n\t"
        "punpckhwd %%mm1, %%mm1 \n\t"
        "punpckhdq %%mm1, %%mm1 \n\t"
        "movd %%mm1, (%0, %2) \n\t"
        : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
    /* top and bottom (and hopefully also the corners) */
    if (sides & EDGE_TOP) {
        for (i = 0; i < h; i += 4) {
            /* replicate the top row into 4 edge rows above it per pass */
            ptr = buf - (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
                  "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
    if (sides & EDGE_BOTTOM) {
        for (i = 0; i < h; i += 4) {
            /* replicate the bottom row into 4 edge rows below it per pass */
            ptr = last_line + (i + 1) * wrap - w;
                "movq (%1, %0), %%mm0 \n\t"
                "movq %%mm0, (%0) \n\t"
                "movq %%mm0, (%0, %2) \n\t"
                "movq %%mm0, (%0, %2, 2) \n\t"
                "movq %%mm0, (%0, %3) \n\t"
                : "r"((x86_reg)last_line - (x86_reg)ptr - w),
                  "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
                  "r"(ptr + width + 2 * w)
/* Callback type for edge emulation; matches the signature of
 * ff_emulated_edge_mc_8() used by gmc() below. */
typedef void emulated_edge_mc_func(uint8_t *dst, ptrdiff_t dst_stride,
                                   const uint8_t *src, ptrdiff_t src_linesize,
                                   int block_w, int block_h,
                                   int src_x, int src_y, int w, int h);
/* Global motion compensation: for each destination pixel, sample the
 * source with an affine motion field (dxx/dxy/dyx/dyy in 16.16-ish fixed
 * point scaled by 'shift') using 4-bit-fraction bilinear interpolation,
 * processed four columns at a time with MMX.  Falls back to the C
 * implementation for cases the SIMD path cannot handle, and to
 * emu_edge_fn when the block reads outside the picture. */
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int ox, int oy,
                                 int dxx, int dxy, int dyx, int dyy,
                                 int shift, int r, int width, int height,
                                 emulated_edge_mc_func *emu_edge_fn)
    /* integer (full-pel) source offset of the block's origin */
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    /* subpel position/step terms reduced to 12 fractional bits */
    const int oxs = ox >> 4;
    const int oys = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    /* vector constants for the inner loops: rounding term and per-row steps */
    const uint16_t r4[4] = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    const uint64_t shift2 = 2 * shift;  /* final psrlw amount (as m64) */
#define MAX_STRIDE 4096U
    uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];  /* padded copy for edge emulation */
    /* total subpel drift across the block, used to detect a varying
     * full-pel offset */
    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    /* unsigned compare folds the <0 and >=limit checks into one */
    int need_emu = (unsigned)ix >= width - w ||
                   (unsigned)iy >= height - h;
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15
        || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);
    src += ix + iy * stride;  /* move to the block's full-pel origin */
    /* +1 in each dimension: bilinear needs one extra row/column */
    emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height);
        /* broadcast a 16-bit scale word into all four lanes of mm6 and
         * zero mm7 for byte unpacking (the input operand is in a line
         * not visible here -- presumably 1 << shift; confirm) */
        "movd %0, %%mm6 \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
        "punpcklwd %%mm6, %%mm6 \n\t"
    for (x = 0; x < w; x += 4) {
        /* initial x/y subpel positions for this group of four columns,
         * pre-decremented by one row step (re-added on the first pass) */
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };
        for (y = 0; y < h; y++) {
                /* advance the four positions by one row step and keep
                 * the top 4 fractional bits for the bilinear weights */
                "movq %0, %%mm4 \n\t"
                "movq %1, %%mm5 \n\t"
                "paddw %2, %%mm4 \n\t"
                "paddw %3, %%mm5 \n\t"
                "movq %%mm4, %0 \n\t"
                "movq %%mm5, %1 \n\t"
                "psrlw $12, %%mm4 \n\t"
                "psrlw $12, %%mm5 \n\t"
                : "+m"(*dx4), "+m"(*dy4)
                : "m"(*dxy4), "m"(*dyy4)
                /* compute the four bilinear weights from dx (mm4), dy (mm5)
                 * and the scale s in mm6 */
                "movq %%mm6, %%mm2 \n\t"
                "movq %%mm6, %%mm1 \n\t"
                "psubw %%mm4, %%mm2 \n\t"
                "psubw %%mm5, %%mm1 \n\t"
                "movq %%mm2, %%mm0 \n\t"
                "movq %%mm4, %%mm3 \n\t"
                "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
                "pmullw %%mm5, %%mm3 \n\t" // dx * dy
                "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
                "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
                /* load the 2x2 source neighbourhood and apply the weights */
                "movd %4, %%mm5 \n\t"
                "movd %3, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
                "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
                "movd %2, %%mm5 \n\t"
                "movd %1, %%mm4 \n\t"
                "punpcklbw %%mm7, %%mm5 \n\t"
                "punpcklbw %%mm7, %%mm4 \n\t"
                "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
                "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
                /* sum the four terms plus the rounding constant r4 */
                "paddw %5, %%mm1 \n\t"
                "paddw %%mm3, %%mm2 \n\t"
                "paddw %%mm1, %%mm0 \n\t"
                "paddw %%mm2, %%mm0 \n\t"
                /* normalize by 2*shift and pack back to four bytes */
                "psrlw %6, %%mm0 \n\t"
                "packuswb %%mm0, %%mm0 \n\t"
                "movd %%mm0, %0 \n\t"
                : "=m"(dst[x + y * stride])
                : "m"(src[0]), "m"(src[1]),
                  "m"(src[stride]), "m"(src[stride + 1]),
                  "m"(*r4), "m"(shift2)
        /* rewind to the top of the next 4-column group */
        src += 4 - h * stride;
/* Public GMC entry points, each binding gmc() to ff_emulated_edge_mc_8.
 * NOTE(review): ff_gmc_mmx is defined twice below -- presumably the two
 * definitions sit in opposite branches of a preprocessor conditional
 * that is not visible in this view; confirm against the full file. */
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

void ff_gmc_sse(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);

void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
                int stride, int h, int ox, int oy,
                int dxx, int dxy, int dyx, int dyy,
                int shift, int r, int width, int height)
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
#if CONFIG_DIRAC_DECODER
/* Dirac put/avg wrappers for 8/16/32-wide blocks: each either falls back
 * to the C implementation (for the cases handled in lines not visible in
 * this view) or forwards src[0] to the SIMD pixel primitive; 32-wide ops
 * are built from two 16-wide calls.  No interior comments: the macro
 * lines must keep their trailing continuations intact. */
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
PIXELS16(static, ff_avg, , , _mmxext)
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
DIRAC_PIXOP(avg, ff_avg, mmxext)

/* SSE2 variants of the 16- and 32-wide Dirac wrappers */
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_put_dirac_pixels16_c(dst, src, stride, h);
    ff_put_pixels16_sse2(dst, src[0], stride, h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_avg_dirac_pixels16_c(dst, src, stride, h);
    ff_avg_pixels16_sse2(dst, src[0], stride, h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_put_dirac_pixels32_c(dst, src, stride, h);
    /* 32-wide = two adjacent 16-wide operations */
    ff_put_pixels16_sse2(dst , src[0] , stride, h);
    ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
    ff_avg_dirac_pixels32_c(dst, src, stride, h);
    ff_avg_pixels16_sse2(dst , src[0] , stride, h);
    ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
/* Clamp each float in src[0..len) to [min, max] and write the result to
 * dst, 16 floats (64 bytes) per iteration.
 * NOTE(review): movaps and the 64-byte stride imply src/dst are 16-byte
 * aligned and len is a multiple of 16 -- confirm at the call sites. */
void ff_vector_clipf_sse(float *dst, const float *src,
                         float min, float max, int len)
    x86_reg i = (len - 16) * 4;  /* byte offset of the last 16-float group */
        /* broadcast min into xmm4 and max into xmm5 */
        "movss %3, %%xmm4 \n\t"
        "movss %4, %%xmm5 \n\t"
        "shufps $0, %%xmm4, %%xmm4 \n\t"
        "shufps $0, %%xmm5, %%xmm5 \n\t"
        "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2, %0), %%xmm1 \n\t"
        "movaps 32(%2, %0), %%xmm2 \n\t"
        "movaps 48(%2, %0), %%xmm3 \n\t"
        /* lower bound: x = max(x, min) */
        "maxps %%xmm4, %%xmm0 \n\t"
        "maxps %%xmm4, %%xmm1 \n\t"
        "maxps %%xmm4, %%xmm2 \n\t"
        "maxps %%xmm4, %%xmm3 \n\t"
        /* upper bound: x = min(x, max) */
        "minps %%xmm5, %%xmm0 \n\t"
        "minps %%xmm5, %%xmm1 \n\t"
        "minps %%xmm5, %%xmm2 \n\t"
        "minps %%xmm5, %%xmm3 \n\t"
        "movaps %%xmm0, (%1, %0) \n\t"
        "movaps %%xmm1, 16(%1, %0) \n\t"
        "movaps %%xmm2, 32(%1, %0) \n\t"
        "movaps %%xmm3, 48(%1, %0) \n\t"
        : "r"(dst), "r"(src), "m"(min), "m"(max)
637 #endif /* HAVE_INLINE_ASM */