Index: apps/codecs/Tremor/asm_arm.h =================================================================== RCS file: /cvsroot/rockbox/apps/codecs/Tremor/asm_arm.h,v retrieving revision 1.2 diff -u -r1.2 asm_arm.h --- apps/codecs/Tremor/asm_arm.h 28 Jan 2006 21:21:21 -0000 1.2 +++ apps/codecs/Tremor/asm_arm.h 10 Apr 2006 18:11:53 -0000 @@ -95,6 +95,112 @@ *y = y1 << 1; } +#ifndef _V_VECT_OPS +#define _V_VECT_OPS + +/* asm versions of vector operations for block.c, window.c */ +static inline +void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +{ + while (n>=4) { + asm volatile ("ldmia %[x], {r0, r1, r2, r3};" + "ldmia %[y]!, {r4, r5, r6, r7};" + "add r0, r0, r4;" + "add r1, r1, r5;" + "add r2, r2, r6;" + "add r3, r3, r7;" + "stmia %[x]!, {r0, r1, r2, r3};" + : [x] "+r" (x), [y] "+r" (y) + : : "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", + "memory"); + n -= 4; + } + /* add final elements */ + while (n>0) { + *x++ += *y++; + n--; + } +} + +static inline +void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +{ + while (n>=4) { + asm volatile ("ldmia %[y]!, {r0, r1, r2, r3};" + "stmia %[x]!, {r0, r1, r2, r3};" + : [x] "+r" (x), [y] "+r" (y) + : : "r0", "r1", "r2", "r3", + "memory"); + n -= 4; + } + /* copy final elements */ + while (n>0) { + *x++ = *y++; + n--; + } +} + +static inline +void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) +{ + while (n>=4) { + asm volatile ("ldmia %[d], {r0, r1, r2, r3};" + "ldmia %[w]!, {r4, r5, r6, r7};" + "smull r8, r9, r0, r4;" + "mov r0, r9, lsl #1;" + "smull r8, r9, r1, r5;" + "mov r1, r9, lsl #1;" + "smull r8, r9, r2, r6;" + "mov r2, r9, lsl #1;" + "smull r8, r9, r3, r7;" + "mov r3, r9, lsl #1;" + "stmia %[d]!, {r0, r1, r2, r3};" + : [d] "+r" (data), [w] "+r" (window) + : : "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", "r8", "r9", + "memory", "cc"); + n -= 4; + } + while(n>0) { + *data = MULT31(*data, *window); + data++; + window++; + n--; + } +} + +static inline +void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) +{ + while (n>=4) { + asm volatile ("ldmia %[d], {r0, r1, r2, r3};" + "ldmda %[w]!, {r4, r5, r6, r7};" + "smull r8, r9, r0, r7;" + "mov r0, r9, lsl #1;" + "smull r8, r9, r1, r6;" + "mov r1, r9, lsl #1;" + "smull r8, r9, r2, r5;" + "mov r2, r9, lsl #1;" + "smull r8, r9, r3, r4;" + "mov r3, r9, lsl #1;" + "stmia %[d]!, {r0, r1, r2, r3};" + : [d] "+r" (data), [w] "+r" (window) + : : "r0", "r1", "r2", "r3", + "r4", "r5", "r6", "r7", "r8", "r9", + "memory", "cc"); + n -= 4; + } + while(n>0) { + *data = MULT31(*data, *window); + data++; + window--; + n--; + } +} + +#endif + #endif #ifndef _V_CLIP_MATH Index: apps/codecs/Tremor/asm_mcf5249.h =================================================================== RCS file: /cvsroot/rockbox/apps/codecs/Tremor/asm_mcf5249.h,v retrieving revision 1.15 diff -u -r1.15 asm_mcf5249.h --- apps/codecs/Tremor/asm_mcf5249.h 21 Sep 2005 23:09:19 -0000 1.15 +++ apps/codecs/Tremor/asm_mcf5249.h 10 Apr 2006 18:11:53 -0000 @@ -132,10 +132,13 @@ [t] "r" (_t), [v] "r" (_v) \ : "cc"); +#ifndef _V_VECT_OPS +#define _V_VECT_OPS + /* asm versions of vector operations for block.c, window.c */ /* assumes MAC is initialized & accumulators cleared */ static inline -void mcf5249_vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) { /* align to 16 bytes */ while(n>0 && (int)x&16) { @@ -169,7 +172,7 @@ } static inline -void mcf5249_vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) { /* align to 16 bytes */ while(n>0 && (int)x&16) { @@ -196,7 +199,7 @@ static inline -void mcf5249_vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) +void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) { /* ensure data is aligned to 16-bytes */ while(n>0 && (int)data%16) { @@ -250,7 +253,7 @@ } static inline -void mcf5249_vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) +void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) { /* ensure at least data is aligned to 16-bytes */ while(n>0 && (int)data%16) { @@ -338,6 +341,8 @@ #endif +#endif + #ifndef _V_CLIP_MATH #define _V_CLIP_MATH Index: apps/codecs/Tremor/block.c =================================================================== RCS file: /cvsroot/rockbox/apps/codecs/Tremor/block.c,v retrieving revision 1.6 diff -u -r1.6 block.c --- apps/codecs/Tremor/block.c 28 Dec 2005 20:42:24 -0000 1.6 +++ apps/codecs/Tremor/block.c 10 Apr 2006 18:11:54 -0000 @@ -261,11 +261,7 @@ vorbis_info *vi=v->vi; codec_setup_info *ci=(codec_setup_info *)vi->codec_setup; private_state *b=v->backend_state; -#ifdef CPU_COLDFIRE int j; -#else - int i,j; -#endif if(v->pcm_current>v->pcm_returned && v->pcm_returned!=-1)return(OV_EINVAL); @@ -311,47 +307,25 @@ /* large/large */ ogg_int32_t *pcm=v->pcm[j]+prevCenter; ogg_int32_t *p=vb->pcm[j]; -#ifdef CPU_COLDFIRE - mcf5249_vect_add(pcm, p, n1); -#else - for(i=0;ipcm[j]+prevCenter+n1/2-n0/2; ogg_int32_t *p=vb->pcm[j]; -#ifdef CPU_COLDFIRE - mcf5249_vect_add(pcm, p, n0); -#else - for(i=0;iW){ /* small/large */ ogg_int32_t *pcm=v->pcm[j]+prevCenter; ogg_int32_t *p=vb->pcm[j]+n1/2-n0/2; -#ifdef CPU_COLDFIRE - mcf5249_vect_add(pcm, p, n0); - mcf5249_vect_copy(&pcm[n0], &p[n0], n1/2-n0/2); -#else - for(i=0;ipcm[j]+prevCenter; ogg_int32_t *p=vb->pcm[j]; -#ifdef CPU_COLDFIRE - mcf5249_vect_add(pcm, p, n0); -#else - for(i=0;ipcm[j]+thisCenter; ogg_int32_t *p=vb->pcm[j]+n; -#ifdef CPU_COLDFIRE - mcf5249_vect_copy(pcm, p, n); -#else - for(i=0;i>16)&0x0000ffff) | ((x<<16)&0xffff0000); - x= ((x>> 8)&0x00ff00ff) | ((x<< 8)&0xff00ff00); - x= ((x>> 4)&0x0f0f0f0f) | ((x<< 4)&0xf0f0f0f0); - x= ((x>> 2)&0x33333333) | ((x<< 2)&0xcccccccc); - return((x>> 1)&0x55555555) | ((x<< 1)&0xaaaaaaaa); + ogg_uint32_t m = 0x00ff00ff; + + asm("mov %0, %0, ror #16\n\t" + "and r2, %1, %0, lsr #8\n\t" + "and %0, %0, %1\n\t" + "orr %0, r2, %0, lsl #8\n\t" + "eor %1, %1, %1, lsl #4\n\t" + "and r2, %1, %0, lsr #4\n\t" + "and %0, %0, %1\n\t" + "orr %0, r2, %0, lsl #4\n\t" + "eor %1, %1, %1, lsl #2\n\t" + "and r2, %1, %0, lsr #2\n\t" + "and %0, %0, %1\n\t" + "orr %0, r2, %0, lsl #2\n\t" + "eor %1, %1, %1, lsl #1\n\t" + "and r2, %1, %0, lsr #1\n\t" + "and %0, %0, %1\n\t" + "orr %0, r2, %0, lsl #1" + : "+r" (x), "+r" (m) + : : "r2"); + return x; } static inline long decode_packed_entry_number(codebook *book, @@ -180,9 +196,10 @@ while(hi-lo>1){ long p=(hi-lo)>>1; - long test=book->codelist[lo+p]>testword; - lo+=p&(test-1); - hi-=p&(-test); + if (book->codelist[lo+p]>testword) + hi-=p; + else + lo+=p; } if(book->dec_codelengths[lo]<=read){ @@ -195,6 +212,144 @@ return(-1); } +static long decode_packed_block(codebook *book, oggpack_buffer *b, + long *buf, int n){ + long *bufptr = buf; + long *bufend = buf + n; + + while (bufptrheadend > 8) { + ogg_uint32_t *ptr; + unsigned long bit, bitend; + unsigned long adr; + ogg_uint32_t word = 0; + int wordbits = 0; + + adr = (unsigned long)b->headptr; + bit = (adr&3)*8+b->headbit; + ptr = (ogg_uint32_t *)(adr&~3); + bitend = ((adr&3)+b->headend)*8; + while (bufptrdec_maxlength, 0)) { + if (bit-wordbits+32>=bitend) + break; + bit-=wordbits; + word=ptr[bit>>5] >> (bit&31); + if (bit&31) + word|=ptr[(bit>>5)+1] << (32-(bit&31)); + wordbits=32; + bit+=32; + } + + entry=book->dec_firsttable[word&((1<dec_firsttablen)-1)]; + if(__builtin_expect(entry&0x80000000UL, 0)){ + lo=(entry>>15)&0x7fff; + hi=book->used_entries-(entry&0x7fff); + { + ogg_uint32_t testword=bitreverse((ogg_uint32_t)word); + + while(__builtin_expect(hi-lo>1, 1)){ + long p=(hi-lo)>>1; + if (book->codelist[lo+p]>testword) + hi-=p; + else + lo+=p; + } + entry=lo; + } + }else + entry--; + + *bufptr++=entry; + { + int l=book->dec_codelengths[entry]; + wordbits-=l; + word>>=l; + } + } + + adr=(unsigned long)b->headptr; + bit-=(adr&3)*8+wordbits; + b->headend-=(bit/8); + b->headptr+=bit/8; + b->headbit=bit%8; + } else { + long r = decode_packed_entry_number(book, b); + if (r == -1) return bufptr-buf; + *bufptr++ = r; + } + } + return n; +} + +#if 0 +static long decode_packed_block(codebook *book, oggpack_buffer *b, + long *buf, int n){ + long *bufptr = buf; + long *bufend = buf + n; + + while (bufptrheadend > 12) { + ogg_uint32_t *ptr, *end; + int bit; + unsigned long adr; + + adr = (unsigned long)b->headptr; + bit = (adr&3)*8+b->headbit; + ptr = (ogg_uint32_t *)(adr&~3); + end = (ogg_uint32_t *)(b->headptr+b->headend-7); +/* + asm volatile ( + : "+r" (ptr), "+r" (bit), "+r" (bufptr) + : "r" (32-book->dec_maxlength), "r" ((1<dec_firsttablen)-1), + "r" (book->dec_firsttable), "r" (bufend), "r" (book) +*/ + while (ptr> bit; + if (bit>32-book->dec_maxlength) + word|=ptr[1]<<(32-bit); + entry=book->dec_firsttable[word&((1<dec_firsttablen)-1)]; + if(__builtin_expect(entry&0x80000000UL, 0)){ + lo=(entry>>15)&0x7fff; + hi=book->used_entries-(entry&0x7fff); + { + ogg_uint32_t testword=bitreverse((ogg_uint32_t)word); + + while(__builtin_expect(hi-lo>1, 1)){ + long p=(hi-lo)>>1; + if (book->codelist[lo+p]>testword) + hi-=p; + else + lo+=p; + } + entry=lo; + } + }else + entry--; + + *bufptr++=entry; + bit+=book->dec_codelengths[entry]; + ptr+=bit>>5; + bit&=31; + } + + adr=(unsigned long)ptr; + adr+=bit/8; + b->headend-=((unsigned char *)adr)-b->headptr; + b->headptr=(unsigned char *)adr; + b->headbit=bit%8; + } else { + long r = decode_packed_entry_number(book, b); + if (r == -1) return bufptr-buf; + *bufptr++ = r; + } + } + return n; +} +#endif + /* Decode side is specced and easier, because we don't need to find matches using different criteria; we simply read and map. There are two things we need to do 'depending': @@ -311,17 +466,20 @@ long vorbis_book_decodevv_add(codebook *book,ogg_int32_t **a, long offset,int ch, oggpack_buffer *b,int n,int point){ - long i,j,entry; + long i,j,k; int chptr=0; int shift=point-book->binarypoint; - - if(shift>=0){ - - for(i=offset;ivaluelist+entry*book->dim; + long entries[32]; + + for(i=offset;i=0){ + if (chunk*book->dim>(offset+n-i)*ch) + chunk=((offset+n-i)*ch+book->dim-1)/book->dim; + read = decode_packed_block(book,b,entries,chunk); + for(k=0;kvaluelist+entries[k]*book->dim; for (j=0;jdim;j++){ a[chptr++][i]+=t[j]>>shift; if(chptr==ch){ @@ -330,16 +488,14 @@ } } } - } - }else{ - shift = -shift; - for(i=offset;ivaluelist+entry*book->dim; + } else { + if (chunk*book->dim>(offset+n-i)*ch) + chunk=((offset+n-i)*ch+book->dim-1)/book->dim; + read = decode_packed_block(book,b,entries,chunk); + for(k=0;kvaluelist+entries[k]*book->dim; for (j=0;jdim;j++){ - a[chptr++][i]+=t[j]<pcm[info->coupling_mag[i]]; ogg_int32_t *pcmA=vb->pcm[info->coupling_ang[i]]; +#if 0 +#define COUPLE_STEP(m, a, a1, t) \ + "movs " a1 ", " m "\n\t" \ + "movpl " t ", " a "\n\t" \ + "rsbmi " t ", " a ", #0\n\t" \ + "cmp " a ", #0\n\t" \ + "subgt " a1 ", " a1 ", " t "\n\t" \ + "addle " m ", " m ", " a1 "\n\t" + + for (j=0;jchannels;j++) //_analysis_output("residue",seq+j,vb->pcm[j],-8,n/2,0,0); Index: apps/codecs/Tremor/mdct.c =================================================================== RCS file: /cvsroot/rockbox/apps/codecs/Tremor/mdct.c,v retrieving revision 1.8 diff -u -r1.8 mdct.c --- apps/codecs/Tremor/mdct.c 28 Dec 2005 20:42:24 -0000 1.8 +++ apps/codecs/Tremor/mdct.c 10 Apr 2006 18:11:54 -0000 @@ -41,6 +41,33 @@ /* 8 point butterfly (in place) */ STIN void mdct_butterfly_8(DATA_TYPE *x){ +#ifdef CPU_ARM + /* GCC sometimes uses r11 as input even when it is + marked as clobbered (GCC bug?), so I use r12. */ + asm volatile ("ldmia %0, {r0, r1, r2, r3, r4, r5, r6, r7}\n\t" + "add r8, r4, r0\n\t" + "sub r4, r4, r0\n\t" + "add r9, r5, r1\n\t" + "sub r5, r5, r1\n\t" + "add r10, r6, r2\n\t" + "sub r6, r6, r2\n\t" + "add r12, r7, r3\n\t" + "sub r7, r7, r3\n\t" + "add r0, r6, r5\n\t" + "sub r1, r7, r4\n\t" + "sub r2, r6, r5\n\t" + "add r3, r7, r4\n\t" + "sub r4, r10, r8\n\t" + "sub r5, r12, r9\n\t" + "add r6, r10, r8\n\t" + "add r7, r12, r9\n\t" + "stmia %0, {r0, r1, r2, r3, r4, r5, r6, r7}" + : : "r" (x) : + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r12", "memory"); + +#else + REG_TYPE r0 = x[4] + x[0]; REG_TYPE r1 = x[4] - x[0]; REG_TYPE r2 = x[5] + x[1]; @@ -59,6 +86,7 @@ x[6] = r4 + r0; x[7] = r6 + r2; MB(); +#endif } /* 16 point butterfly (in place, 4 register) */ @@ -143,35 +171,108 @@ mdct_butterfly_16(x+16); } +#define ASM_XPROD31(a, b, t, v, x, y, tmp) \ + "smull " tmp ", " x ", " a ", " t "\n\t" \ + "smlal " tmp ", " x ", " b ", " v "\n\t" \ + "rsb " a ", " a ", #0\n\t" \ + "smull " tmp ", " y ", " b ", " t "\n\t" \ + "smlal " tmp ", " y ", " a ", " v "\n\t" \ + "mov " x ", " x ", lsl #1\n\t" \ + "mov " y ", " y ", lsl #1\n\t" + +#define ASM_XNPROD31(a, b, t, v, x, y, tmp) \ + "rsb " y ", " b ", #0\n\t" \ + "smull " tmp ", " x ", " a ", " t "\n\t" \ + "smlal " tmp ", " x ", " y ", " v "\n\t" \ + "smull " tmp ", " y ", " b ", " t "\n\t" \ + "smlal " tmp ", " y ", " a ", " v "\n\t" \ + "mov " x ", " x ", lsl #1\n\t" \ + "mov " y ", " y ", lsl #1\n\t" + /* N/stage point generic N stage butterfly (in place, 4 register) */ -void mdct_butterfly_generic(DATA_TYPE *x,int points, int step) ICODE_ATTR; +void mdct_butterfly_generic(DATA_TYPE *x,int points, int step) ICODE_ATTR_TREMOR_MDCT; void mdct_butterfly_generic(DATA_TYPE *x,int points, int step){ LOOKUP_T *T = sincos_lookup0; - DATA_TYPE *x1 = x + points - 8; - DATA_TYPE *x2 = x + (points>>1) - 8; - REG_TYPE r0; - REG_TYPE r1; - REG_TYPE r2; - REG_TYPE r3; - + DATA_TYPE *x1 = x + points; + DATA_TYPE *x2 = x + (points>>1); +#if 0 + REG_TYPE r0 asm("r8"); + REG_TYPE r1 asm("r6"); + REG_TYPE r2 asm("r7"); + REG_TYPE r3 asm("r4"); +#endif + do{ + asm ("ldmdb %0, {r0, r1, r2, r3}\n\t" + "ldmdb %1, {r5, r6, r7, r8}\n\t" + "sub r4, r0, r5\n\t" + "add r0, r0, r5\n\t" + "sub r5, r6, r1\n\t" + "add r1, r1, r6\n\t" + "sub r6, r2, r7\n\t" + "add r2, r2, r7\n\t" + "sub r7, r8, r3\n\t" + "add r3, r3, r8\n\t" + "stmdb %0!, {r0, r1, r2, r3}\n\t" + "ldmia %2, {r0, r1}\n\t" + "add %2, %2, %3, lsl #2\n\t" + ASM_XPROD31("r5", "r4", "r0", "r1", "r2", "r3", "r8") + "ldmia %2, {r0, r1}\n\t" + "add %2, %2, %3, lsl #2\n\t" + ASM_XPROD31("r7", "r6", "r0", "r1", "r4", "r5", "r8") + "stmdb %1!, {r2, r3, r4, r5}" + : "+r" (x1), "+r" (x2), "+r" (T) + : "r" (step) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", + "memory"); + +#if 0 +#if 0 r0 = x1[6] - x2[6]; x1[6] += x2[6]; r1 = x2[7] - x1[7]; x1[7] += x2[7]; r2 = x1[4] - x2[4]; x1[4] += x2[4]; r3 = x2[5] - x1[5]; x1[5] += x2[5]; +#endif XPROD31( r1, r0, T[0], T[1], &x2[6], &x2[7] ); T+=step; XPROD31( r3, r2, T[0], T[1], &x2[4], &x2[5] ); T+=step; +#if 0 r0 = x1[2] - x2[2]; x1[2] += x2[2]; r1 = x2[3] - x1[3]; x1[3] += x2[3]; r2 = x1[0] - x2[0]; x1[0] += x2[0]; r3 = x2[1] - x1[1]; x1[1] += x2[1]; +#endif XPROD31( r1, r0, T[0], T[1], &x2[2], &x2[3] ); T+=step; XPROD31( r3, r2, T[0], T[1], &x2[0], &x2[1] ); T+=step; x1-=8; x2-=8; +#endif }while(Tsincos_lookup0); do{ + asm ("ldmdb %0, {r0, r1, r2, r3}\n\t" + "ldmdb %1, {r5, r6, r7, r8}\n\t" + "sub r4, r5, r0\n\t" + "add r0, r0, r5\n\t" + "sub r5, r6, r1\n\t" + "add r1, r1, r6\n\t" + "sub r6, r7, r2\n\t" + "add r2, r2, r7\n\t" + "sub r7, r8, r3\n\t" + "add r3, r3, r8\n\t" + "stmdb %0!, {r0, r1, r2, r3}\n\t" + "ldmia %2, {r0, r1}\n\t" + "add %2, %2, %3, lsl #2\n\t" + ASM_XPROD31("r4", "r5", "r0", "r1", "r2", "r3", "r8") + "ldmia %2, {r0, r1}\n\t" + "add %2, %2, %3, lsl #2\n\t" + ASM_XPROD31("r6", "r7", "r0", "r1", "r4", "r5", "r8") + "stmdb %1!, {r2, r3, r4, r5}" + : "+r" (x1), "+r" (x2), "+r" (T) + : "r" (step) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8", + "memory"); + +#if 0 r0 = x2[6] - x1[6]; x1[6] += x2[6]; r1 = x2[7] - x1[7]; x1[7] += x2[7]; r2 = x2[4] - x1[4]; x1[4] += x2[4]; @@ -196,16 +324,43 @@ XPROD31( r0, r1, T[0], T[1], &x2[6], &x2[7] ); T+=step; XPROD31( r2, r3, T[0], T[1], &x2[4], &x2[5] ); T+=step; +#if 0 r0 = x2[2] - x1[2]; x1[2] += x2[2]; r1 = x2[3] - x1[3]; x1[3] += x2[3]; r2 = x2[0] - x1[0]; x1[0] += x2[0]; r3 = x2[1] - x1[1]; x1[1] += x2[1]; +#endif XPROD31( r0, r1, T[0], T[1], &x2[2], &x2[3] ); T+=step; XPROD31( r2, r3, T[0], T[1], &x2[0], &x2[1] ); T+=step; x1-=8; x2-=8; +#endif }while(Tsincos_lookup0); } +void mdct_double_butterfly_generic(DATA_TYPE *x,int points, int step) ICODE_ATTR_TREMOR_MDCT; +void mdct_double_butterfly_generic(DATA_TYPE *x,int points,int step){ + + LOOKUP_T *T1 = sincos_lookup0; + LOOKUP_T *T2 = sincos_lookup0; + DATA_TYPE *x1 = x + points - 2; + DATA_TYPE *x2 = x + (points>>1) - 2; + //REG_TYPE r0; + //REG_TYPE r1; + //DATA_TYPE r3, r4, r5, r6, r7, r8, r9, r10; + //DATA_TYPE *x3, *x4; + + do{ + asm volatile ("ldmia %[x1], {r0, r1}\n\t" + "ldmia %[x2], {r2, r3}\n\t" + "add r2, r0, r2\n\t" + "add r3, r1, r3\n\t" + "rsb r0, r2, r0, lsl #1\n\t" + "sub r1, r3, r1, lsl #1\n\t" + "ldmia %[t1], {r4, r5}\n\t" + "smull r8, r6, r1, r4\n\t" + "smlal r8, r6, r0, r5\n\t" + "rsb r8, r1, #0\n\t" + "smull r8, r7, r5, r8\n\t" + "smlal r8, r7, r0, r4\n\t" + "mov r6, r6, lsl #1\n\t" + "mov r7, r7, lsl #1\n\t" + "stmia %[x2], {r6, r7}\n\t" + "sub %[x1], %[x1], %[points]\n\t" + "sub %[x2], %[x2], %[points]\n\t" + "ldmia %[x1], {r0, r1}\n\t" + "ldmia %[x2], {r6, r7}\n\t" + "add r6, r0, r6\n\t" + "add r7, r1, r7\n\t" + "sub r0, r6, r0, lsl #1\n\t" + "sub r1, r7, r1, lsl #1\n\t" + "add r6, r2, r6\n\t" + "add r7, r3, r7\n\t" + "rsb r2, r6, r2, lsl #1\n\t" + "sub r3, r7, r3, lsl #1\n\t" + "add %[x1], %[x1], %[points]\n\t" + "stmia %[x1], {r6, r7}\n\t" + "smull r8, r6, r0, r4\n\t" + "smlal r8, r6, r1, r5\n\t" + "rsb r8, r0, #0\n\t" + "smull r8, r7, r5, r8\n\t" + "smlal r8, r7, r1, r4\n\t" + "mov r6, r6, lsl #1\n\t" + "mov r7, r7, lsl #1\n\t" + "ldmia %[t2], {r4, r5}\n\t" + "smull r8, r0, r3, r4\n\t" + "smlal r8, r0, r2, r5\n\t" + "rsb r8, r3, #0\n\t" + "smull r8, r1, r5, r8\n\t" + "smlal r8, r1, r2, r4\n\t" + "mov r0, r0, lsl #1\n\t" + "mov r1, r1, lsl #1\n\t" + "sub %[x1], %[x1], %[points]\n\t" + "stmia %[x1], {r0, r1}\n\t" + "add %[x2], %[x2], %[points]\n\t" + "ldmia %[x2], {r0, r1}\n\t" + "add r6, r0, r6\n\t" + "add r7, r1, r7\n\t" + "rsb r0, r6, r0, lsl #1\n\t" + "sub r1, r7, r1, lsl #1\n\t" + "stmia %[x2], {r6, r7}\n\t" + "smull r8, r2, r1, r4\n\t" + "smlal r8, r2, r0, r5\n\t" + "rsb r8, r1, #0\n\t" + "smull r8, r3, r5, r8\n\t" + "smlal r8, r3, r0, r4\n\t" + "mov r2, r2, lsl #1\n\t" + "mov r3, r3, lsl #1\n\t" + "sub %[x2], %[x2], %[points]\n\t" + "stmia %[x2], {r2, r3}\n\t" + "add %[x1], %[x1], %[points]\n\t" + "add %[x2], %[x2], %[points]\n\t" + : : [x1] "r" (x1), [x2] "r" (x2), + [t1] "r" (T1), [t2] "r" (T2), [points] "r" (points) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", + "r7", "r8", "memory"); + T1+=step; + T2+=(step << 1); + x1-=2; x2-=2; + }while(T2sincos_lookup0); + do{ + asm volatile ("ldmia %[x1], {r0, r1}\n\t" + "ldmia %[x2], {r2, r3}\n\t" + "add r2, r0, r2\n\t" + "add r3, r1, r3\n\t" + "rsb r0, r2, r0, lsl #1\n\t" + "rsb r1, r3, r1, lsl #1\n\t" + "ldmia %[t1], {r4, r5}\n\t" + "rsb r7, r1, #0\n\t" + "smull r8, r6, r0, r4\n\t" + "smlal r8, r6, r7, r5\n\t" + "smull r8, r7, r1, r4\n\t" + "smlal r8, r7, r0, r5\n\t" + "mov r6, r6, lsl #1\n\t" + "mov r7, r7, lsl #1\n\t" + "stmia %[x2], {r6, r7}\n\t" + "sub %[x1], %[x1], %[points]\n\t" + "sub %[x2], %[x2], %[points]\n\t" + "ldmia %[x1], {r0, r1}\n\t" + "ldmia %[x2], {r6, r7}\n\t" + "add r6, r0, r6\n\t" + "add r7, r1, r7\n\t" + "rsb r0, r6, r0, lsl #1\n\t" + "sub r1, r7, r1, lsl #1\n\t" + "add r6, r2, r6\n\t" + "add r7, r3, r7\n\t" + "sub r2, r6, r2, lsl #1\n\t" + "sub r3, r7, r3, lsl #1\n\t" + "add %[x1], %[x1], %[points]\n\t" + "stmia %[x1], {r6, r7}\n\t" + "rsb r7, r0, #0\n\t" + "smull r8, r6, r1, r4\n\t" + "smlal r8, r6, r7, r5\n\t" + "smull r8, r7, r0, r4\n\t" + "smlal r8, r7, r1, r5\n\t" + "mov r6, r6, lsl #1\n\t" + "mov r7, r7, lsl #1\n\t" + "ldmia %[t2], {r4, r5}\n\t" + "smull r8, r0, r2, r4\n\t" + "smlal r8, r0, r3, r5\n\t" + "rsb r8, r2, #0\n\t" + "smull r8, r1, r5, r8\n\t" + "smlal r8, r1, r3, r4\n\t" + "mov r0, r0, lsl #1\n\t" + "mov r1, r1, lsl #1\n\t" + "sub %[x1], %[x1], %[points]\n\t" + "stmia %[x1], {r0, r1}\n\t" + "add %[x2], %[x2], %[points]\n\t" + "ldmia %[x2], {r0, r1}\n\t" + "add r6, r0, r6\n\t" + "add r7, r1, r7\n\t" + "sub r0, r6, r0, lsl #1\n\t" + "sub r1, r7, r1, lsl #1\n\t" + "stmia %[x2], {r6, r7}\n\t" + "smull r8, r2, r0, r4\n\t" + "smlal r8, r2, r1, r5\n\t" + "rsb r8, r0, #0\n\t" + "smull r8, r3, r5, r8\n\t" + "smlal r8, r3, r1, r4\n\t" + "mov r2, r2, lsl #1\n\t" + "mov r3, r3, lsl #1\n\t" + "sub %[x2], %[x2], %[points]\n\t" + "stmia %[x2], {r2, r3}\n\t" + "add %[x1], %[x1], %[points]\n\t" + "add %[x2], %[x2], %[points]\n\t" + : : [x1] "r" (x1), [x2] "r" (x2), + [t1] "r" (T1), [t2] "r" (T2), [points] "r" (points) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", + "r7", "r8", "memory"); + T1-=step; + T2+=(step<<1); + x1-=2; x2-=2; + }while(T2sincos_lookup0); +} + +static void mdct_butterflies(DATA_TYPE *x,int points,int shift) { + if (points > 64) { + mdct_double_butterfly_generic(x, points, 4 << shift); + mdct_butterflies(x, points >> 2, shift + 2); + mdct_butterflies(x + (points >> 2), points >> 2, shift + 2); + mdct_butterflies(x + (points >> 1), points >> 2, shift + 2); + mdct_butterflies(x + (points >> 2) * 3, points >> 2, shift + 2); + } else if (points > 32) { + mdct_butterfly_generic(x, points, 4 << shift); + mdct_butterflies(x, points >> 1, shift + 1); + mdct_butterflies(x + (points >> 1), points >> 1, shift + 1); + } else + mdct_butterfly_32(x); +} + +#if 0 STIN void mdct_butterflies(DATA_TYPE *x,int points,int shift) { int stages=8-shift; @@ -237,7 +718,7 @@ for(j=0;j>1; int n4=n>>2; @@ -477,6 +958,17 @@ oX2=oX1; do{ + asm volatile("ldmdb %0!,{r0,r1,r2,r3}\n\t" + "stmdb %1!,{r0,r1,r2,r3}\n\t" + "rsb r3,r3,#0\n\t" + "rsb r4,r2,#0\n\t" + "rsb r5,r1,#0\n\t" + "rsb r6,r0,#0\n\t" + "stmia %2!,{r3,r4,r5,r6}" + : "+r" (iX), "+r" (oX1), "+r" (oX2) : + : "r0", "r1", "r2", "r3", "r4" ,"r5", "r6", + "memory"); +#if 0 oX1-=4; iX-=4; @@ -486,6 +978,7 @@ oX2[3] = -(oX1[0] = iX[0]); oX2+=4; +#endif }while(oX2oX2); } } Index: apps/codecs/Tremor/misc.h =================================================================== RCS file: /cvsroot/rockbox/apps/codecs/Tremor/misc.h,v retrieving revision 1.7 diff -u -r1.7 misc.h --- apps/codecs/Tremor/misc.h 17 Oct 2005 18:46:46 -0000 1.7 +++ apps/codecs/Tremor/misc.h 10 Apr 2006 18:11:54 -0000 @@ -151,6 +151,51 @@ *y = MULT31(b, t) + MULT31(a, v); } #endif + +#ifndef _V_VECT_OPS +#define _V_VECT_OPS + +static inline +void vect_add(ogg_int32_t *x, ogg_int32_t *y, int n) +{ + while (n>0) { + *x++ += *y++; + n--; + } +} + +static inline +void vect_copy(ogg_int32_t *x, ogg_int32_t *y, int n) +{ + while (n>0) { + *x++ = *y++; + n--; + } +} + +static inline +void vect_mult_fw(ogg_int32_t *data, LOOKUP_T *window, int n) +{ + while(n>0) { + *data = MULT31(*data, *window); + data++; + window++; + n--; + } +} + +static inline +void vect_mult_bw(ogg_int32_t *data, LOOKUP_T *window, int n) +{ + while(n>0) { + *data = MULT31(*data, *window); + data++; + window--; + n--; + } +} +#endif + #endif #ifndef _V_CLIP_MATH Index: apps/codecs/Tremor/window.c =================================================================== RCS file: /cvsroot/rockbox/apps/codecs/Tremor/window.c,v retrieving revision 1.6 diff -u -r1.6 window.c --- apps/codecs/Tremor/window.c 18 Jul 2005 12:40:28 -0000 1.6 +++ apps/codecs/Tremor/window.c 10 Apr 2006 18:11:54 -0000 @@ -68,27 +68,11 @@ long rightbegin=n/2+n/4-rn/4; long rightend=rightbegin+rn/2; -#ifdef CPU_COLDFIRE memset((void *)&d[0], 0, sizeof(ogg_int32_t)*leftbegin); /* mcf5249_vect_zero(&d[0], leftbegin); */ - mcf5249_vect_mult_fw(&d[leftbegin], &window[lW][0], leftend-leftbegin); - mcf5249_vect_mult_bw(&d[rightbegin], &window[nW][rn/2-1], rightend-rightbegin); + vect_mult_fw(&d[leftbegin], &window[lW][0], leftend-leftbegin); + vect_mult_bw(&d[rightbegin], &window[nW][rn/2-1], rightend-rightbegin); memset((void *)&d[rightend], 0, sizeof(ogg_int32_t)*(n-rightend)); /* mcf5249_vect_zero(&d[rightend], n-rightend); */ -#else - int i,p; - - for(i=0;i