//
|
// Created by YY on 2018/10/30.
|
//
|
|
#include "libyuv.h"
|
#include <jni.h>
|
#include <android/log.h>
|
#include <arm_neon.h>
|
|
#define y(r,g,b) (((66 * r + 129 * g + 25 * b + 128) >> 8) + 16)
|
#define u(r,g,b) (((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128)
|
#define v(r,g,b) (((112 * r - 94 * g - 18 * b + 128) >> 8) + 128)
|
#define color(x) ((unsigned char)((x < 0) ? 0 : ((x > 255) ? 255 : x)))
|
|
/**
|
* type 0-3位表示b的偏移量
|
* 4-7位表示g的偏移量
|
* 8-11位表示r的偏移量
|
* 12-15位表示rgba一个像素所占的byte
|
* 16-19位表示yuv的类型,0为420sp,1为420p
|
*/
|
|
void rgbaToYuv(int width,int height,unsigned char * rgb,unsigned char * yuv,int type) {
|
const int frameSize = width * height;
|
const int yuvType=(type&0x10000)>>16;
|
const int byteRgba=(type&0x0F000)>>12;
|
const int rShift=(type&0x00F00)>>8;
|
const int gShift=(type&0x000F0)>>4;
|
const int bShift= (type&0x0000F);
|
const int uIndex=0;
|
const int vIndex=yuvType; //yuvType为1表示YUV420p,为0表示420sp
|
|
int yIndex = 0;
|
int uvIndex[2]={frameSize,frameSize+frameSize/4};
|
|
unsigned char R, G, B, Y, U, V;
|
unsigned int index = 0;
|
for (int j = 0; j < height; j++) {
|
for (int i = 0; i < width; i++) {
|
index = j * width + i;
|
|
R = rgb[index*byteRgba+rShift]&0xFF;
|
G = rgb[index*byteRgba+gShift]&0xFF;
|
B = rgb[index*byteRgba+bShift]&0xFF;
|
|
Y = y(R,G,B);
|
U = u(R,G,B);
|
V = v(R,G,B);
|
|
yuv[yIndex++] = color(Y);
|
if (j % 2 == 0 && index % 2 == 0) {
|
yuv[uvIndex[uIndex]++] =color(U);
|
yuv[uvIndex[vIndex]++] =color(V);
|
}
|
}
|
}
|
}
|
|
void rgbaToYuvSIMD(int width,int height,unsigned char * rgb,unsigned char * yuv,int type) {
|
const int frameSize = width * height;
|
const int yuvType=(type&0x30000)>>16;
|
// const int byteRgba=(type&0x0F000)>>12;
|
const int rShift=(type&0x00F00)>>8;
|
const int gShift=(type&0x000F0)>>4;
|
const int bShift= (type&0x0000F);
|
//const int uIndex=0;
|
//const int vIndex=yuvType; //yuvType为1表示YUV420p,为0表示420sp
|
|
int yIndex = 0;
|
// int uvIndex[2]={frameSize,frameSize+frameSize/4};
|
int n = 0;
|
|
int uIndex, vIndex;
|
int uvIndexStep;
|
|
if (yuvType == 0) {
|
uIndex = frameSize;
|
vIndex = frameSize + 1;
|
uvIndexStep = 2;
|
} else if (yuvType == 1) {
|
uIndex = frameSize;
|
vIndex = frameSize+frameSize/4;
|
uvIndexStep = 1;
|
} else if (yuvType == 2) {
|
vIndex = frameSize;
|
uIndex = frameSize + 1;
|
uvIndexStep = 2;
|
} else if (yuvType == 3) {
|
vIndex = frameSize;
|
uIndex = frameSize+frameSize/4;
|
uvIndexStep = 1;
|
}
|
|
|
uint8x8_t Y_RFac = vdup_n_u8(66);
|
uint8x8_t Y_GFac = vdup_n_u8(129);
|
uint8x8_t Y_BFac = vdup_n_u8(25);
|
uint8x8_t Y_Add1Fac = vdup_n_u8(128);
|
uint8x8_t Y_Add2Fac = vdup_n_u8(16);
|
|
int16x8_t U_RFac = vdupq_n_s16(-38);
|
int16x8_t U_GFac = vdupq_n_s16(-74);
|
int16x8_t U_BFac = vdupq_n_s16(112);
|
int16x8_t U_AddFac = vdupq_n_s16(128);
|
|
int16x8_t V_RFac = vdupq_n_s16(112);
|
int16x8_t V_GFac = vdupq_n_s16(-94);
|
int16x8_t V_BFac = vdupq_n_s16(-18);
|
|
for (int j = 0; j < height; j++) {
|
for (int i = 0; i < width; i += 8) { // 每次处理32字节(8像素)
|
uint8x8x4_t a = vld4_u8(rgb + n); // 步长为4交叉存入4个向量寄存器
|
|
n += 32;
|
|
// Y分量(((66 * r + 129 * g + 25 * b + 128) >> 8) + 16)
|
uint8x8_t result;
|
uint16x8_t temp = vmull_u8(a.val[rShift], Y_RFac);
|
temp = vmlal_u8 (temp, a.val[gShift], Y_GFac);
|
temp = vmlal_u8(temp, a.val[bShift], Y_BFac);
|
temp = vaddw_u8(temp, Y_Add1Fac);
|
result = vshrn_n_u16(temp, 8);
|
result = vqadd_u8(result, Y_Add2Fac);
|
|
vst1_u8(yuv + yIndex, result); // 从向量寄存器回存8字节Y分量(8像素)
|
yIndex += 8;
|
|
if ((j % 2 == 0) && (i % 2 == 0)) {
|
int16_t uBuffer[8];
|
int16_t vBuffer[8];
|
|
// U分量(((-38 * r - 74 * g + 112 * b + 128) >> 8) + 128)
|
int16x8_t U;
|
int16x8_t ur = {vget_lane_u8(a.val[rShift], 0),
|
vget_lane_u8(a.val[rShift], 1),
|
vget_lane_u8(a.val[rShift], 2),
|
vget_lane_u8(a.val[rShift], 3),
|
vget_lane_u8(a.val[rShift], 4),
|
vget_lane_u8(a.val[rShift], 5),
|
vget_lane_u8(a.val[rShift], 6),
|
vget_lane_u8(a.val[rShift], 7)};
|
|
int16x8_t ug = {vget_lane_u8(a.val[gShift], 0),
|
vget_lane_u8(a.val[gShift], 1),
|
vget_lane_u8(a.val[gShift], 2),
|
vget_lane_u8(a.val[gShift], 3),
|
vget_lane_u8(a.val[gShift], 4),
|
vget_lane_u8(a.val[gShift], 5),
|
vget_lane_u8(a.val[gShift], 6),
|
vget_lane_u8(a.val[gShift], 7)};
|
|
int16x8_t ub = {vget_lane_u8(a.val[bShift], 0),
|
vget_lane_u8(a.val[bShift], 1),
|
vget_lane_u8(a.val[bShift], 2),
|
vget_lane_u8(a.val[bShift], 3),
|
vget_lane_u8(a.val[bShift], 4),
|
vget_lane_u8(a.val[bShift], 5),
|
vget_lane_u8(a.val[bShift], 6),
|
vget_lane_u8(a.val[bShift], 7)};
|
|
U = vmulq_s16(ur, U_RFac);
|
U = vmlaq_s16(U, ug, U_GFac);
|
U = vmlaq_s16(U, ub, U_BFac);
|
U = vaddq_s16(U, U_AddFac);
|
|
U = vshrq_n_s16(U, 8);
|
U = vaddq_s16(U, U_AddFac); // 范围[16 - 240]
|
|
vst1q_s16(uBuffer, U);
|
|
// V分量(((112 * r - 94 * g - 18 * b + 128) >> 8) + 128)
|
int16x8_t V;
|
|
V = vmulq_s16(ur, V_RFac);
|
V = vmlaq_s16(V, ug, V_GFac);
|
V = vmlaq_s16(V, ub, V_BFac);
|
V = vaddq_s16(V, U_AddFac);
|
|
V = vshrq_n_s16(V, 8);
|
V = vaddq_s16(V, U_AddFac); // 范围[16 - 240]
|
|
vst1q_s16(vBuffer, V);
|
|
yuv[uIndex] = uBuffer[0];
|
uIndex += uvIndexStep;
|
yuv[uIndex] = uBuffer[2];
|
uIndex += uvIndexStep;
|
yuv[uIndex] = uBuffer[4];
|
uIndex += uvIndexStep;
|
yuv[uIndex] = uBuffer[6];
|
uIndex += uvIndexStep;
|
|
yuv[vIndex] = vBuffer[0];
|
vIndex += uvIndexStep;
|
yuv[vIndex] = vBuffer[2];
|
vIndex += uvIndexStep;
|
yuv[vIndex] = vBuffer[4];
|
vIndex += uvIndexStep;
|
yuv[vIndex] = vBuffer[6];
|
vIndex += uvIndexStep;
|
}
|
}
|
}
|
}
|