## ffmpeg / libavcodec / armv4l / simple_idct_armv6.S @ 782fc0c3

History | View | Annotate | Download (13.3 KB)

/*
 * Simple IDCT
 *
 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

23 | |

24 |
#include "asm.S" |

25 | |

/*
 * Fixed-point cosine table.  The original source documents every Wi as
 * cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5.
 */
#define W1 22725   /* i = 1 */
#define W2 21407   /* i = 2 */
#define W3 19266   /* i = 3 */
#define W4 16383   /* i = 4; NOTE(review): the formula above gives 16384 here,
                      presumably the lower value is deliberate -- confirm
                      before changing */
#define W5 12873   /* i = 5 */
#define W6 8867    /* i = 6 */
#define W7 4520    /* i = 7 */

/* Rounding/shift amounts passed to the row pass and the column pass. */
#define ROW_SHIFT 11
#define COL_SHIFT 20

/*
 * Pairs of coefficients packed into one 32-bit word for the dual 16-bit
 * multiply instructions: first name in the low halfword, second name in
 * the high halfword.  W42n holds the negated W4/W2 pair; the low half is
 * masked to 16 bits so its sign extension does not spill into the high half.
 */
#define W13  ((W1) | ((W3) << 16))
#define W26  ((W2) | ((W6) << 16))
#define W42  ((W4) | ((W2) << 16))
#define W42n (((-W4) & 0xffff) | ((-W2) << 16))
#define W46  ((W4) | ((W6) << 16))
#define W57  ((W5) | ((W7) << 16))

42 | |

/*
 * Literal pool with the packed coefficient pairs.  It lives in .text so the
 * code in this file can fetch each word with a pc-relative ldr.
 * NOTE(review): w26 is not referenced by any code visible in this file.
 */
        .text
        .align
w13:    .word  W13                     /* W1 | (W3 << 16) */
w26:    .word  W26                     /* W2 | (W6 << 16) */
w42:    .word  W42                     /* W4 | (W2 << 16) */
w42n:   .word  W42n                    /* -W4 | (-W2 << 16) */
w46:    .word  W46                     /* W4 | (W6 << 16) */
w57:    .word  W57                     /* W5 | (W7 << 16) */

51 | |

/*
  idct_row: accumulate the even-part sums A0..A3 and the odd-part sums
  B0..B3 for one row of eight coefficients.

  Macro argument
    shift = right-shift amount the caller applies to the results later;
            here it only selects the rounding bias 1 << (shift - 1) that
            seeds A0..A3.

  Expected on entry
    a1 = address of the row; row[7,5] is read from a1 + 12 and
         row[6,4] from a1 + 4
    a3 = row[2,0]                (original scheduling note: "<= 2 cycles")
    a4 = row[3,1]
    ip = W4 | (W2 << 16)         (original scheduling note: "<= 2 cycles")

  Left on exit
    v1 = A0   v2 = A1   v3 = A2   v4 = A3
    v5 = B0   v6 = -B1  v7 = B2   fp = B3
  a2, a3, a4, ip and lr are overwritten.

  The instruction order is kept exactly as hand-scheduled in the original.
*/
        .macro idct_row shift
        ldr    lr, w46                 /* lr = W4 | (W6 << 16) */
        mov    a2, #(1<<(\shift-1))    /* a2 = rounding bias */
        smlad  v1, a3, ip, a2          /* A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  v4, a3, ip, a2          /* A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, w13                 /* ip = W1 | (W3 << 16) */
        ldr    v7, w57                 /* v7 = W5 | (W7 << 16) */
        smlad  v2, a3, lr, a2          /* A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  v3, a3, lr, a2          /* A2 = W4*row[0] - W6*row[2] + bias */

        smuad  v5, a4, ip              /* v5 = B0 = W1*row[1] + W3*row[3] */
        smusdx fp, a4, v7              /* fp = B3 = W7*row[1] - W5*row[3] */
        ldr    lr, [a1, #12]           /* lr = row[7,5] */
        pkhtb  a3, ip, v7, asr #16     /* a3 = W7 | (W3 << 16) */
        pkhbt  a2, ip, v7, lsl #16     /* a2 = W1 | (W5 << 16) */
        smusdx v6, a3, a4              /* v6 = -B1 = W7*row[3] - W3*row[1] */
        smlad  v5, lr, v7, v5          /* B0 += W5*row[5] + W7*row[7] */
        smusdx v7, a4, a2              /* v7 = B2 = W5*row[1] - W1*row[3] */

        ldr    a4, w42n                /* a4 = -W4 | (-W2 << 16) */
        smlad  v7, lr, a3, v7          /* B2 += W7*row[5] + W3*row[7] */
        ldr    a3, [a1, #4]            /* a3 = row[6,4] */
        smlsdx fp, lr, ip, fp          /* B3 += W3*row[5] - W1*row[7] */
        ldr    ip, w46                 /* ip = W4 | (W6 << 16) */
        smlad  v6, lr, a2, v6          /* -B1 += W1*row[5] + W5*row[7] */

        smlad  v2, a3, a4, v2          /* A1 += -W4*row[4] - W2*row[6] */
        smlsd  v3, a3, a4, v3          /* A2 += -W4*row[4] + W2*row[6] */
        smlad  v1, a3, ip, v1          /* A0 += W4*row[4] + W6*row[6] */
        smlsd  v4, a3, ip, v4          /* A3 += W4*row[4] - W6*row[6] */
        .endm

93 | |

/*
  idct_row4: short form of idct_row that uses only row[0..3]; no term for
  row[4..7] is added and the row is not read from memory here.

  Macro argument
    shift = right-shift amount the caller applies to the results later;
            here it only selects the rounding bias 1 << (shift - 1).

  Expected on entry
    a3 = row[2,0]
    a4 = row[3,1]
    ip = W4 | (W2 << 16)

  Left on exit
    v1 = A0   v2 = A1   v3 = A2   v4 = A3
    v5 = B0   v6 = -B1  v7 = B2   fp = B3
  a2, a3, ip and lr are overwritten.
*/
        .macro idct_row4 shift
        ldr    lr, w46                 /* lr = W4 | (W6 << 16) */
        ldr    v7, w57                 /* v7 = W5 | (W7 << 16) */
        mov    a2, #(1<<(\shift-1))    /* a2 = rounding bias */
        smlad  v1, a3, ip, a2          /* A0 = W4*row[0] + W2*row[2] + bias */
        smlsd  v4, a3, ip, a2          /* A3 = W4*row[0] - W2*row[2] + bias */
        ldr    ip, w13                 /* ip = W1 | (W3 << 16) */
        smlad  v2, a3, lr, a2          /* A1 = W4*row[0] + W6*row[2] + bias */
        smlsd  v3, a3, lr, a2          /* A2 = W4*row[0] - W6*row[2] + bias */
        smusdx fp, a4, v7              /* fp = B3 = W7*row[1] - W5*row[3] */
        smuad  v5, a4, ip              /* v5 = B0 = W1*row[1] + W3*row[3] */
        pkhtb  a3, ip, v7, asr #16     /* a3 = W7 | (W3 << 16) */
        pkhbt  a2, ip, v7, lsl #16     /* a2 = W1 | (W5 << 16) */
        smusdx v6, a3, a4              /* v6 = -B1 = W7*row[3] - W3*row[1] */
        smusdx v7, a4, a2              /* v7 = B2 = W5*row[1] - W1*row[3] */
        .endm

119 | |

/*
  idct_finish: combine the partial sums into the eight results without
  shifting them.

  In:   v1..v4 = A0..A3,  v5 = B0,  v6 = -B1,  v7 = B2,  fp = B3
  Out:  ip = A0 + B0      lr = A0 - B0
        v1 = A1 + B1      v5 = A1 - B1
        v2 = A2 + B2      v6 = A2 - B2
        v3 = A3 + B3      v7 = A3 - B3

  Each input register is overwritten only after its last read, so the
  statement order below must be kept.
*/
        .macro idct_finish
        add    ip, v1, v5              /* ip = A0 + B0 */
        sub    lr, v1, v5              /* lr = A0 - B0 */
        sub    v1, v2, v6              /* v1 = A1 + B1 (v6 holds -B1) */
        add    v5, v2, v6              /* v5 = A1 - B1 */
        add    v2, v3, v7              /* v2 = A2 + B2 */
        sub    v6, v3, v7              /* v6 = A2 - B2 */
        add    v3, v4, fp              /* v3 = A3 + B3 */
        sub    v7, v4, fp              /* v7 = A3 - B3 */
        .endm

135 | |

/*
  idct_finish_shift: combine the partial sums into the eight results and
  scale each one with an arithmetic right shift.

  shift = right-shift amount

  In:   v1..v4 = A0..A3,  v5 = B0,  v6 = -B1,  v7 = B2,  fp = B3
  Out:  v1 = (A0 + B0) >> shift      v5 = (A0 - B0) >> shift
        v2 = (A1 + B1) >> shift      v6 = (A1 - B1) >> shift
        v3 = (A2 + B2) >> shift      v7 = (A2 - B2) >> shift
        v4 = (A3 + B3) >> shift      fp = (A3 - B3) >> shift
  a3 and a4 are used as scratch.
*/

/*
  Helper for idct_finish_shift: one butterfly on the register pair x, y.
  Both results are formed from the incoming values (via a4 and a3) before
  either register is overwritten:
      x = (x op1 y) >> shift
      y = (x op2 y) >> shift
*/
        .macro idct_bfly_asr op1, op2, x, y, shift
        \op1   a4, \x, \y
        \op2   a3, \x, \y
        mov    \x, a4, asr #\shift
        mov    \y, a3, asr #\shift
        .endm

        .macro idct_finish_shift shift
        /* v1 = A0 + B0, v5 = A0 - B0 */
        idct_bfly_asr add, sub, v1, v5, \shift

        /* v6 holds -B1, so the subtraction yields A1 + B1 (v2) and the
           addition yields A1 - B1 (v6) */
        idct_bfly_asr sub, add, v2, v6, \shift

        /* v3 = A2 + B2, v7 = A2 - B2 */
        idct_bfly_asr add, sub, v3, v7, \shift

        /* v4 = A3 + B3, fp = A3 - B3 */
        idct_bfly_asr add, sub, v4, fp, \shift
        .endm

162 | |

/*
  idct_finish_shift_sat: combine the partial sums into the eight results,
  shift each one right arithmetically and saturate it to an unsigned
  8-bit value (usat #8).

  shift = right-shift amount

  In:   v1..v4 = A0..A3,  v5 = B0,  v6 = -B1,  v7 = B2,  fp = B3
  Out:  v1 = sat8((A0 + B0) >> shift)    v5 = sat8((A0 - B0) >> shift)
        v2 = sat8((A1 + B1) >> shift)    v6 = sat8((A1 - B1) >> shift)
        v3 = sat8((A2 + B2) >> shift)    v7 = sat8((A2 - B2) >> shift)
        v4 = sat8((A3 + B3) >> shift)    fp = sat8((A3 - B3) >> shift)
  a4 and ip are used as scratch.
*/

/*
  Helper for idct_finish_shift_sat: one saturating butterfly on the
  register pair x, y.  Both results are formed from the incoming values
  (via a4 and ip) before either register is overwritten:
      x = usat8((x op1 y) >> shift)
      y = usat8((x op2 y) >> shift)
*/
        .macro idct_bfly_usat op1, op2, x, y, shift
        \op1   a4, \x, \y
        \op2   ip, \x, \y
        usat   \x, #8, a4, asr #\shift
        usat   \y, #8, ip, asr #\shift
        .endm

        .macro idct_finish_shift_sat shift
        /* v1 = A0 + B0, v5 = A0 - B0 */
        idct_bfly_usat add, sub, v1, v5, \shift

        /* v6 holds -B1, so the subtraction yields A1 + B1 (v2) and the
           addition yields A1 - B1 (v6) */
        idct_bfly_usat sub, add, v2, v6, \shift

        /* v3 = A2 + B2, v7 = A2 - B2 */
        idct_bfly_usat add, sub, v3, v7, \shift

        /* v4 = A3 + B3, fp = A3 - B3 */
        idct_bfly_usat add, sub, v4, fp, \shift
        .endm

189 | |

/*
  idct_row_armv6: transform one row and store the eight 16-bit results
  as a column of the destination.

  a1 = source row (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = destination; consecutive column slots are 16 bytes apart

  Results 0..3 are written to slots 0, 2, 4, 6 and results 4..7 to
  slots 1, 3, 5, 7.

  Three paths:
    - everything except row[0] is zero: all eight slots get row[0] << 3
    - row[4..7] are zero:               idct_row4
    - otherwise:                        idct_row

  a1 and a2 have their entry values on return.  a3, a4, ip, v1-v7 and fp
  are overwritten (only a3 on the row[0]-only path).
*/
function idct_row_armv6
        push   {lr}

        ldr    lr, [a1, #12]           /* lr = row[7,5] */
        ldr    ip, [a1, #4]            /* ip = row[6,4] */
        ldr    a4, [a1, #8]            /* a4 = row[3,1] */
        ldr    a3, [a1]                /* a3 = row[2,0] */
        orrs   lr, lr, ip              /* lr = 0 iff row[4..7] are all zero */
        cmpeq  lr, a4                  /* ... and row[1], row[3] are zero */
        cmpeq  lr, a3, lsr #16         /* ... and row[2] is zero */
        beq    1f                      /* only row[0] may be non-zero */
        push   {a2}                    /* the row macros overwrite a2 */
        ldr    ip, w42                 /* ip = W4 | (W2 << 16) */
        cmp    lr, #0                  /* row[4..7] all zero? */
        beq    2f

        idct_row   ROW_SHIFT
        b      3f

2:      idct_row4  ROW_SHIFT

3:      pop    {a2}
        idct_finish_shift ROW_SHIFT

        strh   v1, [a2]                /* (A0 + B0) >> ROW_SHIFT */
        strh   v2, [a2, #(16*2)]       /* (A1 + B1) >> ROW_SHIFT */
        strh   v3, [a2, #(16*4)]       /* (A2 + B2) >> ROW_SHIFT */
        strh   v4, [a2, #(16*6)]       /* (A3 + B3) >> ROW_SHIFT */
        strh   fp, [a2, #(16*1)]       /* (A3 - B3) >> ROW_SHIFT */
        strh   v7, [a2, #(16*3)]       /* (A2 - B2) >> ROW_SHIFT */
        strh   v6, [a2, #(16*5)]       /* (A1 - B1) >> ROW_SHIFT */
        strh   v5, [a2, #(16*7)]       /* (A0 - B0) >> ROW_SHIFT */

        pop    {pc}

        /* Row with only row[0] set: every result is row[0] << 3.  The high
           halfword of a3 (row[2]) is known to be zero here and strh stores
           only the low 16 bits. */
1:      mov    a3, a3, lsl #3
        strh   a3, [a2]
        strh   a3, [a2, #(16*2)]
        strh   a3, [a2, #(16*4)]
        strh   a3, [a2, #(16*6)]
        strh   a3, [a2, #(16*1)]
        strh   a3, [a2, #(16*3)]
        strh   a3, [a2, #(16*5)]
        strh   a3, [a2, #(16*7)]
        pop    {pc}
        .endfunc

241 | |

/*
  idct_col_armv6: transform one column (read from memory as a row) and
  store the eight 16-bit results, scaled by COL_SHIFT, as a column of the
  destination.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = destination; results 0..7 go to slots 0..7, 16 bytes apart

  a1 and a2 have their entry values on return.  a3, a4, ip, v1-v7 and fp
  are overwritten.
*/
function idct_col_armv6
        push   {a2, lr}                /* idct_row overwrites a2 */

        ldr    a3, [a1]                /* a3 = row[2,0] */
        ldr    ip, w42                 /* ip = W4 | (W2 << 16) */
        ldr    a4, [a1, #8]            /* a4 = row[3,1] */
        idct_row COL_SHIFT
        pop    {a2}
        idct_finish_shift COL_SHIFT

        strh   v1, [a2]                /* (A0 + B0) >> COL_SHIFT */
        strh   v2, [a2, #(16*1)]       /* (A1 + B1) >> COL_SHIFT */
        strh   v3, [a2, #(16*2)]       /* (A2 + B2) >> COL_SHIFT */
        strh   v4, [a2, #(16*3)]       /* (A3 + B3) >> COL_SHIFT */
        strh   fp, [a2, #(16*4)]       /* (A3 - B3) >> COL_SHIFT */
        strh   v7, [a2, #(16*5)]       /* (A2 - B2) >> COL_SHIFT */
        strh   v6, [a2, #(16*6)]       /* (A1 - B1) >> COL_SHIFT */
        strh   v5, [a2, #(16*7)]       /* (A0 - B0) >> COL_SHIFT */

        pop    {pc}
        .endfunc

268 | |

/*
  idct_col_put_armv6: transform one column (read from memory as a row),
  scale by COL_SHIFT, saturate to unsigned 8 bits and store one byte per
  destination line.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = destination (byte for the first line)
  a3 = line size (byte distance between destination lines)

  a1, a2 and a3 have their entry values on return.  a4, ip, v1-v7 and fp
  are overwritten.
*/
function idct_col_put_armv6
        push   {a2, a3, lr}            /* idct_row overwrites a2 and a3 */

        ldr    a3, [a1]                /* a3 = row[2,0] */
        ldr    ip, w42                 /* ip = W4 | (W2 << 16) */
        ldr    a4, [a1, #8]            /* a4 = row[3,1] */
        idct_row COL_SHIFT
        pop    {a2, a3}
        idct_finish_shift_sat COL_SHIFT

        /* each store post-increments a2 by one line */
        strb   v1, [a2], a3            /* A0 + B0 */
        strb   v2, [a2], a3            /* A1 + B1 */
        strb   v3, [a2], a3            /* A2 + B2 */
        strb   v4, [a2], a3            /* A3 + B3 */
        strb   fp, [a2], a3            /* A3 - B3 */
        strb   v7, [a2], a3            /* A2 - B2 */
        strb   v6, [a2], a3            /* A1 - B1 */
        strb   v5, [a2], a3            /* A0 - B0 */

        sub    a2, a2, a3, lsl #3      /* undo the eight increments */

        pop    {pc}
        .endfunc

298 | |

/*
  idct_col_add_armv6: transform one column (read from memory as a row),
  add each result (shifted right by COL_SHIFT) to the byte already stored
  in the matching destination line, saturate to unsigned 8 bits and write
  it back.

  a1 = source (four words: row[2,0], row[6,4], row[3,1], row[7,5])
  a2 = destination (byte for the first line)
  a3 = line size (byte distance between destination lines)

  a1, a2 and a3 have their entry values on return.  a4, ip, v1-v7 and fp
  are overwritten.

  After idct_finish the results sit in
      ip = line 0, v1 = line 1, v2 = line 2, v3 = line 3,
      v7 = line 4, v6 = line 5, v5 = line 6, lr = line 7.
  Loads of the destination bytes are interleaved with the arithmetic and
  stores; the "line N" notes give the line each access touches.

  Change from the original: a load of line 4 into fp that was overwritten
  by the load of line 5 before being read has been dropped.  Line 4 is
  still loaded (into v4) before it is needed.
*/
function idct_col_add_armv6
        push   {a2, a3, lr}            /* idct_row overwrites a2 and a3 */

        ldr    a3, [a1]                /* a3 = row[2,0] */
        ldr    ip, w42                 /* ip = W4 | (W2 << 16) */
        ldr    a4, [a1, #8]            /* a4 = row[3,1] */
        idct_row COL_SHIFT
        pop    {a2, a3}
        idct_finish

        ldrb   a4, [a2]                /* a4 = line 0 */
        ldrb   v4, [a2, a3]            /* v4 = line 1 */
        add    ip, a4, ip, asr #COL_SHIFT
        usat   ip, #8, ip
        add    v1, v4, v1, asr #COL_SHIFT
        strb   ip, [a2], a3            /* line 0 done; a2 -> line 1 */
        ldrb   ip, [a2, a3]            /* ip = line 2 */
        usat   v1, #8, v1
        ldrb   fp, [a2, a3, lsl #2]    /* fp = line 5 */
        add    v2, ip, v2, asr #COL_SHIFT
        usat   v2, #8, v2
        strb   v1, [a2], a3            /* line 1 done; a2 -> line 2 */
        ldrb   a4, [a2, a3]            /* a4 = line 3 */
        ldrb   ip, [a2, a3, lsl #2]    /* ip = line 6 */
        strb   v2, [a2], a3            /* line 2 done; a2 -> line 3 */
        ldrb   v4, [a2, a3]            /* v4 = line 4 */
        ldrb   v1, [a2, a3, lsl #2]    /* v1 = line 7 */
        add    v3, a4, v3, asr #COL_SHIFT
        usat   v3, #8, v3
        add    v7, v4, v7, asr #COL_SHIFT
        usat   v7, #8, v7
        add    v6, fp, v6, asr #COL_SHIFT
        usat   v6, #8, v6
        add    v5, ip, v5, asr #COL_SHIFT
        usat   v5, #8, v5
        add    lr, v1, lr, asr #COL_SHIFT
        usat   lr, #8, lr
        strb   v3, [a2], a3            /* line 3 */
        strb   v7, [a2], a3            /* line 4 */
        strb   v6, [a2], a3            /* line 5 */
        strb   v5, [a2], a3            /* line 6 */
        strb   lr, [a2], a3            /* line 7 */

        sub    a2, a2, a3, lsl #3      /* undo the eight increments */

        pop    {pc}
        .endfunc

353 | |

/*
  idct_rows: call a row->column transform eight times.

  func  = routine to call; it receives the source row in a1 and the
          destination in a2
  width = number of bytes a2 advances between calls

  The source rows are 16 bytes long and are visited in the order
  0, 2, 4, 6, 1, 3, 5, 7.  On exit a1 is back at its entry value; a2 is
  left 7 * width bytes past its entry value.  lr is overwritten by bl.
  func has to return with a1 and a2 unchanged for the stepping to work.
*/
        .macro idct_rows func width
        bl     \func                   /* row 0 */
        .rept  3                       /* rows 2, 4, 6 */
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        .endr
        sub    a1, a1, #(16*5)         /* from row 6 back to row 1 */
        add    a2, a2, #\width
        bl     \func                   /* row 1 */
        .rept  3                       /* rows 3, 5, 7 */
        add    a1, a1, #(16*2)
        add    a2, a2, #\width
        bl     \func
        .endr

        sub    a1, a1, #(16*7)         /* from row 7 back to row 0 */
        .endm

385 | |

/*
  void ff_simple_idct_armv6(DCTELEM *data);

  In-place transform of the block at a1.  The row pass writes into a
  128-byte scratch area on the stack; the column pass reads that area and
  writes the results back over the input block.
*/
function ff_simple_idct_armv6, export=1
        push   {v1-v7, fp, lr}
        sub    sp, sp, #128            /* scratch block for the row pass */

        mov    a2, sp                  /* rows: data -> scratch */
        idct_rows idct_row_armv6, 2
        mov    a2, a1                  /* columns: scratch -> data */
        mov    a1, sp
        idct_rows idct_col_armv6, 2

        add    sp, sp, #128
        pop    {v1-v7, fp, pc}
        .endfunc

400 | |

/*
  ff_simple_idct_add_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  Transform the block at data (a3) and add the result to the bytes at
  dest (a1), with line_size (a2) bytes between lines, saturating each sum
  to unsigned 8 bits.  The row pass writes into a 128-byte scratch area on
  the stack; dest and line_size are parked on the stack just above it.
*/
function ff_simple_idct_add_armv6, export=1
        push   {a1, a2, v1-v7, fp, lr}
        sub    sp, sp, #128            /* scratch block for the row pass */

        mov    a1, a3                  /* rows: data -> scratch */
        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a1, sp                  /* columns: scratch -> dest */
        ldr    a2, [sp, #128]          /* a2 = dest (saved a1) */
        ldr    a3, [sp, #(128+4)]      /* a3 = line_size (saved a2) */
        idct_rows idct_col_add_armv6, 1

        add    sp, sp, #(128+8)        /* drop scratch and the saved a1/a2 */
        pop    {v1-v7, fp, pc}
        .endfunc

417 | |

/*
  ff_simple_idct_put_armv6(uint8_t *dest, int line_size, DCTELEM *data);

  Transform the block at data (a3) and store the result, saturated to
  unsigned 8 bits, at dest (a1) with line_size (a2) bytes between lines.
  The row pass writes into a 128-byte scratch area on the stack; dest and
  line_size are parked on the stack just above it.
*/
function ff_simple_idct_put_armv6, export=1
        push   {a1, a2, v1-v7, fp, lr}
        sub    sp, sp, #128            /* scratch block for the row pass */

        mov    a1, a3                  /* rows: data -> scratch */
        mov    a2, sp
        idct_rows idct_row_armv6, 2
        mov    a1, sp                  /* columns: scratch -> dest */
        ldr    a2, [sp, #128]          /* a2 = dest (saved a1) */
        ldr    a3, [sp, #(128+4)]      /* a3 = line_size (saved a2) */
        idct_rows idct_col_put_armv6, 1

        add    sp, sp, #(128+8)        /* drop scratch and the saved a1/a2 */
        pop    {v1-v7, fp, pc}
        .endfunc