generated from hertog/godot-module-template
Initial commit
This commit is contained in:
commit
65227bf3a5
12416 changed files with 6001067 additions and 0 deletions
28
engine/thirdparty/libtheora/COPYING
vendored
Normal file
28
engine/thirdparty/libtheora/COPYING
vendored
Normal file
|
|
@ -0,0 +1,28 @@
|
|||
Copyright (C) 2002-2009 Xiph.org Foundation
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
- Neither the name of the Xiph.org Foundation nor the names of its
|
||||
contributors may be used to endorse or promote products derived from
|
||||
this software without specific prior written permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||||
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||||
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||||
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||||
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
18
engine/thirdparty/libtheora/LICENSE
vendored
Normal file
18
engine/thirdparty/libtheora/LICENSE
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
|||
Please see the file COPYING for the copyright license for this software.
|
||||
|
||||
In addition to and irrespective of the copyright license associated
|
||||
with this software, On2 Technologies, Inc. makes the following statement
|
||||
regarding technology used in this software:
|
||||
|
||||
On2 represents and warrants that it shall not assert any rights
|
||||
relating to infringement of On2's registered patents, nor initiate
|
||||
any litigation asserting such rights, against any person who, or
|
||||
entity which utilizes the On2 VP3 Codec Software, including any
|
||||
use, distribution, and sale of said Software; which make changes,
|
||||
modifications, and improvements in said Software; and to use,
|
||||
distribute, and sell said changes as well as applications for other
|
||||
fields of use.
|
||||
|
||||
This reference implementation is originally derived from the On2 VP3
|
||||
Codec Software, and the Theora video format is essentially compatible
|
||||
with the VP3 video format, consisting of a backward-compatible superset.
|
||||
2712
engine/thirdparty/libtheora/analyze.c
vendored
Normal file
2712
engine/thirdparty/libtheora/analyze.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
166
engine/thirdparty/libtheora/apiwrapper.c
vendored
Normal file
166
engine/thirdparty/libtheora/apiwrapper.c
vendored
Normal file
|
|
@ -0,0 +1,166 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "apiwrapper.h"
|
||||
|
||||
|
||||
|
||||
/*Legacy-API entry point for the library version string.
  Return: Whatever th_version_string() returns.*/
const char *theora_version_string(void){
  const char *version;
  version=th_version_string();
  return version;
}
|
||||
|
||||
/*Legacy-API entry point for the numeric library version.
  Return: Whatever th_version_number() returns.*/
ogg_uint32_t theora_version_number(void){
  ogg_uint32_t version;
  version=th_version_number();
  return version;
}
|
||||
|
||||
/*Resets a theora_info to an all-zero state.
  _ci: The structure to initialize; every byte of it is set to zero.*/
void theora_info_init(theora_info *_ci){
  memset(_ci,0,sizeof(theora_info));
}
|
||||
|
||||
/*Zeroes a theora_info and releases the API wrapper hanging off of its
   codec_setup pointer, if there is one.
  _ci: The structure to clear.*/
void theora_info_clear(theora_info *_ci){
  th_api_wrapper *wrapper;
  /*Fetch the wrapper first: the memset below erases the only pointer to it.*/
  wrapper=(th_api_wrapper *)_ci->codec_setup;
  memset(_ci,0,sizeof(*_ci));
  if(wrapper==NULL)return;
  /*Let the wrapper tear down its own contents before it is freed.*/
  if(wrapper->clear!=NULL)wrapper->clear(wrapper);
  _ogg_free(wrapper);
}
|
||||
|
||||
/*Tears down a legacy theora_state.
  The clear() callback of whichever dispatch tables are installed (decoder
   first, then encoder) is invoked, then the attached theora_info (if any) is
   cleared, and finally the state itself is zeroed.
  _th: The state to clear.*/
void theora_clear(theora_state *_th){
  oc_state_dispatch_vtable *dispatch;
  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
  if(_th->internal_decode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_decode;
    dispatch->clear(_th);
  }
  /*The encoder pointer is deliberately re-read only after the decoder's
     clear() has run.*/
  if(_th->internal_encode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_encode;
    dispatch->clear(_th);
  }
  if(_th->i!=NULL){
    theora_info_clear(_th->i);
  }
  memset(_th,0,sizeof(theora_state));
}
|
||||
|
||||
/*Forwards a control request to the decoder or encoder behind a legacy state.
  The decoder's dispatch table takes priority when both are present.
  _th:     The state to operate on.
  _req:    The request code, passed through unchanged.
  _buf:    The request's parameter buffer, passed through unchanged.
  _buf_sz: The size of _buf, passed through unchanged.
  Return: The result of the selected control() callback, or TH_EINVAL if the
   state has neither a decoder nor an encoder attached.*/
int theora_control(theora_state *_th,int _req,void *_buf,size_t _buf_sz){
  oc_state_dispatch_vtable *dispatch;
  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
  if(_th->internal_decode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_decode;
  }
  else if(_th->internal_encode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_encode;
  }
  else return TH_EINVAL;
  return (*dispatch->control)(_th,_req,_buf,_buf_sz);
}
|
||||
|
||||
/*Converts a granule position to a frame index via the decoder or encoder
   behind a legacy state.
  The decoder's dispatch table takes priority when both are present.
  _th: The state to operate on.
  _gp: The granule position, passed through unchanged.
  Return: The result of the selected granule_frame() callback, or -1 if the
   state has neither a decoder nor an encoder attached.*/
ogg_int64_t theora_granule_frame(theora_state *_th,ogg_int64_t _gp){
  oc_state_dispatch_vtable *dispatch;
  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
  if(_th->internal_decode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_decode;
  }
  else if(_th->internal_encode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_encode;
  }
  else return -1;
  return (*dispatch->granule_frame)(_th,_gp);
}
|
||||
|
||||
/*Converts a granule position to a time via the decoder or encoder behind a
   legacy state.
  The decoder's dispatch table takes priority when both are present.
  _th: The state to operate on.
  _gp: The granule position, passed through unchanged.
  Return: The result of the selected granule_time() callback, or -1 if the
   state has neither a decoder nor an encoder attached.*/
double theora_granule_time(theora_state *_th, ogg_int64_t _gp){
  oc_state_dispatch_vtable *dispatch;
  /*Provide compatibility with mixed encoder and decoder shared lib versions.*/
  if(_th->internal_decode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_decode;
  }
  else if(_th->internal_encode!=NULL){
    dispatch=(oc_state_dispatch_vtable *)_th->internal_encode;
  }
  else return -1;
  return (*dispatch->granule_time)(_th,_gp);
}
|
||||
|
||||
/*Translates a legacy theora_info into a th_info.
  Note the field renaming between the two structures: the legacy width/height
   become frame_width/frame_height, while the legacy frame_width/frame_height
   and offset_x/offset_y become pic_width/pic_height and pic_x/pic_y.
  _info: The destination; every field assigned below is overwritten.
  _ci:   The legacy source structure (not modified).*/
void oc_theora_info2th_info(th_info *_info,const theora_info *_ci){
  /*Bitstream version.*/
  _info->version_major=_ci->version_major;
  _info->version_minor=_ci->version_minor;
  _info->version_subminor=_ci->version_subminor;
  /*Frame and picture geometry.*/
  _info->frame_width=_ci->width;
  _info->frame_height=_ci->height;
  _info->pic_width=_ci->frame_width;
  _info->pic_height=_ci->frame_height;
  _info->pic_x=_ci->offset_x;
  _info->pic_y=_ci->offset_y;
  /*Frame rate and aspect ratio.*/
  _info->fps_numerator=_ci->fps_numerator;
  _info->fps_denominator=_ci->fps_denominator;
  _info->aspect_numerator=_ci->aspect_numerator;
  _info->aspect_denominator=_ci->aspect_denominator;
  /*Any colorspace other than the two listed maps to "unspecified".*/
  if(_ci->colorspace==OC_CS_ITU_REC_470M){
    _info->colorspace=TH_CS_ITU_REC_470M;
  }
  else if(_ci->colorspace==OC_CS_ITU_REC_470BG){
    _info->colorspace=TH_CS_ITU_REC_470BG;
  }
  else _info->colorspace=TH_CS_UNSPECIFIED;
  /*Any pixel format other than the three listed maps to the reserved value.*/
  if(_ci->pixelformat==OC_PF_420)_info->pixel_fmt=TH_PF_420;
  else if(_ci->pixelformat==OC_PF_422)_info->pixel_fmt=TH_PF_422;
  else if(_ci->pixelformat==OC_PF_444)_info->pixel_fmt=TH_PF_444;
  else _info->pixel_fmt=TH_PF_RSVD;
  _info->target_bitrate=_ci->target_bitrate;
  _info->quality=_ci->quality;
  /*The granule shift is derived from the forced keyframe frequency, capped at
     31, and is 0 when that frequency is not positive.*/
  if(_ci->keyframe_frequency_force>0){
    _info->keyframe_granule_shift=
     OC_MINI(31,oc_ilog(_ci->keyframe_frequency_force-1));
  }
  else _info->keyframe_granule_shift=0;
}
|
||||
|
||||
/*Legacy-API wrapper around th_packet_isheader().
  _op: The packet to inspect.
  Return: Whatever th_packet_isheader() returns for _op.*/
int theora_packet_isheader(ogg_packet *_op){
  int ret;
  ret=th_packet_isheader(_op);
  return ret;
}
|
||||
|
||||
/*Legacy-API wrapper around th_packet_iskeyframe().
  _op: The packet to inspect.
  Return: Whatever th_packet_iskeyframe() returns for _op.*/
int theora_packet_iskeyframe(ogg_packet *_op){
  int ret;
  ret=th_packet_iskeyframe(_op);
  return ret;
}
|
||||
|
||||
/*Computes the granule shift implied by a legacy theora_info.
  _ci: The structure whose keyframe_frequency_force is used.
  Return: oc_ilog(keyframe_frequency_force-1), with no range checking.*/
int theora_granule_shift(theora_info *_ci){
  int shift;
  /*This breaks when keyframe_frequency_force is not positive or is larger than
     2**31 (if your int is more than 32 bits), but that's what the original
     function does.*/
  shift=oc_ilog(_ci->keyframe_frequency_force-1);
  return shift;
}
|
||||
|
||||
/*Legacy-API wrapper around th_comment_init().
  The pointer is simply cast to th_comment; this presumably relies on the two
   comment structures sharing a layout (not visible from here).
  _tc: The comment structure to initialize.*/
void theora_comment_init(theora_comment *_tc){
  th_comment *tc;
  tc=(th_comment *)_tc;
  th_comment_init(tc);
}
|
||||
|
||||
/*Legacy-API wrapper around th_comment_query().
  _tc:    The comment structure to search (cast to th_comment).
  _tag:   The tag to look up, passed through unchanged.
  _count: The instance index, passed through unchanged.
  Return: Whatever th_comment_query() returns.*/
char *theora_comment_query(theora_comment *_tc,char *_tag,int _count){
  th_comment *tc;
  tc=(th_comment *)_tc;
  return th_comment_query(tc,_tag,_count);
}
|
||||
|
||||
/*Legacy-API wrapper around th_comment_query_count().
  _tc:  The comment structure to search (cast to th_comment).
  _tag: The tag to count, passed through unchanged.
  Return: Whatever th_comment_query_count() returns.*/
int theora_comment_query_count(theora_comment *_tc,char *_tag){
  th_comment *tc;
  tc=(th_comment *)_tc;
  return th_comment_query_count(tc,_tag);
}
|
||||
|
||||
/*Legacy-API wrapper around th_comment_clear().
  _tc: The comment structure to clear (cast to th_comment).*/
void theora_comment_clear(theora_comment *_tc){
  th_comment *tc;
  tc=(th_comment *)_tc;
  th_comment_clear(tc);
}
|
||||
|
||||
/*Legacy-API wrapper around th_comment_add().
  _tc:      The comment structure to extend (cast to th_comment).
  _comment: The comment string, passed through unchanged.*/
void theora_comment_add(theora_comment *_tc,char *_comment){
  th_comment *tc;
  tc=(th_comment *)_tc;
  th_comment_add(tc,_comment);
}
|
||||
|
||||
/*Legacy-API wrapper around th_comment_add_tag().
  _tc:    The comment structure to extend (cast to th_comment).
  _tag:   The tag name, passed through unchanged.
  _value: The tag value, passed through unchanged.*/
void theora_comment_add_tag(theora_comment *_tc, char *_tag, char *_value){
  th_comment *tc;
  tc=(th_comment *)_tc;
  th_comment_add_tag(tc,_tag,_value);
}
|
||||
54
engine/thirdparty/libtheora/apiwrapper.h
vendored
Normal file
54
engine/thirdparty/libtheora/apiwrapper.h
vendored
Normal file
|
|
@ -0,0 +1,54 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: apiwrapper.h 13596 2007-08-23 20:05:38Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_apiwrapper_H)
|
||||
# define _apiwrapper_H (1)
|
||||
# include <ogg/ogg.h>
|
||||
# include <theora/theora.h>
|
||||
# include "theora/theoradec.h"
|
||||
# include "theora/theoraenc.h"
|
||||
# include "state.h"
|
||||
|
||||
/*Short names for the wrapper structures declared below.*/
typedef struct th_api_wrapper th_api_wrapper;
typedef struct th_api_info    th_api_info;

/*Callback through which a codec setup clears itself.
  Having this entry point leaves room to break pieces off into a common base
   library shared by encoder and decoder, and it also makes several other
   pieces of the API wrapper cleaner.
  _ts: The object to clear, passed as an untyped pointer.*/
typedef void (*oc_setup_clear_func)(void *_ts);
|
||||
|
||||
/*Generally only one of these pointers will be non-NULL in any given instance.
|
||||
Technically we do not even really need this struct, since we should be able
|
||||
to figure out which one from "context", but doing it this way makes sure we
|
||||
don't flub it up.*/
|
||||
struct th_api_wrapper{
|
||||
oc_setup_clear_func clear;
|
||||
th_setup_info *setup;
|
||||
th_dec_ctx *decode;
|
||||
th_enc_ctx *encode;
|
||||
};
|
||||
|
||||
struct th_api_info{
|
||||
th_api_wrapper api;
|
||||
theora_info info;
|
||||
};
|
||||
|
||||
|
||||
/*Fills in a th_info from the contents of a legacy theora_info.
  _info: The destination structure.
  _ci:   The legacy source structure (read only).*/
void oc_theora_info2th_info(th_info *_info,const theora_info *_ci);
|
||||
|
||||
#endif
|
||||
114
engine/thirdparty/libtheora/bitpack.c
vendored
Normal file
114
engine/thirdparty/libtheora/bitpack.c
vendored
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function: packing variable sized words into an octet stream
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include <string.h>
|
||||
#include <stdlib.h>
|
||||
#include "bitpack.h"
|
||||
|
||||
/*We're 'MSb' endian; if we write a word but read individual bits,
|
||||
then we'll read the MSb first.*/
|
||||
|
||||
/*Prepares a pack buffer for reading from a block of memory.
  All state is zeroed first, so the bit window starts out empty and eof starts
   out clear.
  _b:     The pack buffer to initialize.
  _buf:   The first byte to read.
  _bytes: The number of bytes available starting at _buf.*/
void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes){
  memset(_b,0,sizeof(oc_pack_buf));
  _b->stop=_buf+_bytes;
  _b->ptr=_buf;
}
|
||||
|
||||
/*Tops up the bit window from the underlying byte buffer.
  Whole bytes are loaded, most significant bits first, for as long as at
   least 8 bits of the window are free and input remains.
  _b->ptr and _b->bits are updated; the caller is responsible for storing the
   returned window back into _b->window if it wants to keep it.
  _b:    The pack buffer to refill.
  _bits: The number of bits the caller is about to consume.
  Return: The refilled window.*/
static oc_pb_window oc_pack_refill(oc_pack_buf *_b,int _bits){
  const unsigned char *end=_b->stop;
  const unsigned char *cur=_b->ptr;
  oc_pb_window         win=_b->window;
  int                  nbits=_b->bits;
  unsigned             nfree=OC_PB_WINDOW_SIZE-nbits;
  while(nfree>=8&&cur<end){
    nfree-=8;
    win|=(oc_pb_window)*cur<<nfree;
    cur++;
  }
  _b->ptr=cur;
  nbits=OC_PB_WINDOW_SIZE-nfree;
  if(_bits>nbits){
    if(cur<end){
      /*Less than a byte of space is left: OR in the top bits of the next
         byte without consuming it.*/
      win|=*cur>>(nbits&7);
    }
    else{
      /*Out of input: flag it and report a huge bit count so that subsequent
         reads do not immediately come back here.*/
      _b->eof=1;
      nbits=OC_LOTS_OF_BITS;
    }
  }
  _b->bits=nbits;
  return win;
}
|
||||
|
||||
/*Peeks at the next bit without consuming it.
  The window is refilled (and stored back) first if it holds no bits.
  _b: The pack buffer to read from.
  Return: The most significant bit of the window.*/
int oc_pack_look1(oc_pack_buf *_b){
  oc_pb_window win=_b->window;
  if(_b->bits<1){
    win=oc_pack_refill(_b,1);
    _b->window=win;
  }
  return win>>(OC_PB_WINDOW_SIZE-1);
}
|
||||
|
||||
/*Discards the bit at the top of the window.
  No refill is performed here.
  _b: The pack buffer to advance.*/
void oc_pack_adv1(oc_pack_buf *_b){
  _b->bits-=1;
  _b->window=_b->window<<1;
}
|
||||
|
||||
/*Here we assume that 0<=_bits&&_bits<=32.*/
|
||||
long oc_pack_read_c(oc_pack_buf *_b,int _bits){
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
long result;
|
||||
window=_b->window;
|
||||
available=_b->bits;
|
||||
if(_bits==0)return 0;
|
||||
if(available<_bits){
|
||||
window=oc_pack_refill(_b,_bits);
|
||||
available=_b->bits;
|
||||
}
|
||||
result=window>>OC_PB_WINDOW_SIZE-_bits;
|
||||
available-=_bits;
|
||||
window<<=1;
|
||||
window<<=_bits-1;
|
||||
_b->window=window;
|
||||
_b->bits=available;
|
||||
return result;
|
||||
}
|
||||
|
||||
/*Reads and consumes a single bit.
  _b: The pack buffer to read from.
  Return: The most significant bit of the (refilled, if necessary) window.*/
int oc_pack_read1_c(oc_pack_buf *_b){
  oc_pb_window win=_b->window;
  int          nbits=_b->bits;
  int          bit;
  if(nbits<1){
    win=oc_pack_refill(_b,1);
    nbits=_b->bits;
  }
  bit=win>>(OC_PB_WINDOW_SIZE-1);
  _b->window=win<<1;
  _b->bits=nbits-1;
  return bit;
}
|
||||
|
||||
/*Reports how much input remains.
  _b: The pack buffer to query.
  Return: -1 once a read has run past the end of the buffer; otherwise the
   bytes not yet loaded plus the whole bytes still held in the window.*/
long oc_pack_bytes_left(oc_pack_buf *_b){
  long nbytes;
  if(_b->eof)return -1;
  nbytes=_b->stop-_b->ptr+(_b->bits>>3);
  return nbytes;
}
|
||||
76
engine/thirdparty/libtheora/bitpack.h
vendored
Normal file
76
engine/thirdparty/libtheora/bitpack.h
vendored
Normal file
|
|
@ -0,0 +1,76 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE OggTheora SOURCE CODE IS (C) COPYRIGHT 1994-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function: packing variable sized words into an octet stream
|
||||
last mod: $Id: bitwise.c 7675 2004-09-01 00:34:39Z xiphmont $
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_bitpack_H)
|
||||
# define _bitpack_H (1)
|
||||
# include <stddef.h>
|
||||
# include <limits.h>
|
||||
# include "internal.h"
|
||||
|
||||
|
||||
|
||||
/*The integer type used as the bit window of the reader.*/
typedef size_t             oc_pb_window;
/*Short name for the bit reader state declared below.*/
typedef struct oc_pack_buf oc_pack_buf;
|
||||
|
||||
|
||||
|
||||
/*Custom bitpacker implementations.*/
|
||||
# if defined(OC_ARM_ASM)
|
||||
# include "arm/armbits.h"
|
||||
# endif
|
||||
|
||||
/*Fall back to the portable C implementation for any routine that a custom
   bitpacker header did not already provide.*/
# ifndef oc_pack_read
#  define oc_pack_read oc_pack_read_c
# endif
# ifndef oc_pack_read1
#  define oc_pack_read1 oc_pack_read1_c
# endif
# ifndef oc_huff_token_decode
#  define oc_huff_token_decode oc_huff_token_decode_c
# endif
|
||||
|
||||
/*The width of the bit window, in bits.*/
# define OC_PB_WINDOW_SIZE ((int)sizeof(oc_pb_window)*CHAR_BIT)
/*A large, positive constant that can still be loaded efficiently as an
   immediate (on platforms like ARM, for example).
  The exact value is not critical: even relatively modest values like 100
   would work fine.*/
# define OC_LOTS_OF_BITS (0x40000000)
|
||||
|
||||
|
||||
|
||||
struct oc_pack_buf{
|
||||
const unsigned char *stop;
|
||||
const unsigned char *ptr;
|
||||
oc_pb_window window;
|
||||
int bits;
|
||||
int eof;
|
||||
};
|
||||
|
||||
/*Start reading _bytes bytes beginning at _buf.*/
void oc_pack_readinit(oc_pack_buf *_b,unsigned char *_buf,long _bytes);
/*Return the next bit without consuming it.*/
int oc_pack_look1(oc_pack_buf *_b);
/*Consume one bit.*/
void oc_pack_adv1(oc_pack_buf *_b);
/*Read and consume _bits bits.
  Here we assume 0<=_bits&&_bits<=32.*/
long oc_pack_read_c(oc_pack_buf *_b,int _bits);
/*Read and consume one bit.*/
int oc_pack_read1_c(oc_pack_buf *_b);
/*Returns -1 for a read beyond EOF, or the number of whole bytes available.*/
long oc_pack_bytes_left(oc_pack_buf *_b);
|
||||
|
||||
/*These two functions are implemented locally in huffdec.c*/
|
||||
/*Read in bits without advancing the bitptr.
|
||||
Here we assume 0<=_bits&&_bits<=32.*/
|
||||
/*static int oc_pack_look(oc_pack_buf *_b,int _bits);*/
|
||||
/*static void oc_pack_adv(oc_pack_buf *_b,int _bits);*/
|
||||
|
||||
#endif
|
||||
974
engine/thirdparty/libtheora/collect.c
vendored
Normal file
974
engine/thirdparty/libtheora/collect.c
vendored
Normal file
|
|
@ -0,0 +1,974 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2011 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function: mode selection code
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include <stdio.h>
|
||||
#include <limits.h>
|
||||
#include <math.h>
|
||||
#include <string.h>
|
||||
#include "collect.h"
|
||||
|
||||
#if defined(OC_COLLECT_METRICS)
|
||||
|
||||
int OC_HAS_MODE_METRICS;
|
||||
double OC_MODE_RD_WEIGHT_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||
double OC_MODE_RD_WEIGHT_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||
oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||
oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||
const char *OC_MODE_METRICS_FILENAME="modedec.stats";
|
||||
|
||||
/*Folds one weighted sample into a set of running metrics.
  The weight and the weighted sums of s, q, r, and d are always accumulated.
  When the set already carries positive weight, the second- and higher-order
   accumulators (s2, sq, q2, sr, qr, r2, sd, qd, d2, s2q, sq2, sqr, sqd, s2q2)
   are first updated from the new sample's offsets relative to the current
   means, so the individual samples never need to be stored.
  _metrics: The accumulator to update.
  _w:       The weight of the new sample.
  _s:       The sample's s value.
  _q:       The sample's q value.
  _r:       The sample's r value.
  _d:       The sample's d value.*/
void oc_mode_metrics_add(oc_mode_metrics *_metrics,
 double _w,int _s,int _q,int _r,double _d){
  if(_metrics->w>0){
    /*Offsets of the new sample from the current (pre-update) means.*/
    const double w0=_metrics->w;
    const double ds=_s-_metrics->s/w0;
    const double dq=_q-_metrics->q/w0;
    const double dr=_r-_metrics->r/w0;
    const double dd=_d-_metrics->d/w0;
    const double ds2=ds*ds;
    const double dq2=dq*dq;
    /*Snapshot of the accumulators the updates below read.*/
    const double s2=_metrics->s2;
    const double sq=_metrics->sq;
    const double q2=_metrics->q2;
    const double sr=_metrics->sr;
    const double qr=_metrics->qr;
    const double sd=_metrics->sd;
    const double qd=_metrics->qd;
    const double s2q=_metrics->s2q;
    const double sq2=_metrics->sq2;
    /*Relative weights of the existing data (f0) and of the new sample (f1),
       and the scale factors for the second-, third-, and fourth-order
       correction terms.*/
    const double wsum=w0+_w;
    const double f0=w0/wsum;
    const double f1=_w/wsum;
    const double f0sq=f0*f0;
    const double f1sq=f1*f1;
    const double k2=w0*f1;
    const double k3=k2*(f0sq-f1sq);
    const double k4=_w*f0sq*f0sq+w0*f1sq*f1sq;
    /*Highest orders first.*/
    _metrics->s2q2+=-2*(ds*sq2+dq*s2q)*f1
     +(ds2*q2+4*ds*dq*sq+dq2*s2)*f1sq+ds2*dq2*k4;
    _metrics->s2q+=(-2*ds*sq-dq*s2)*f1+ds2*dq*k3;
    _metrics->sq2+=(-ds*q2-2*dq*sq)*f1+ds*dq2*k3;
    _metrics->sqr+=(-ds*qr-dq*sr-dr*sq)*f1+ds*dq*dr*k3;
    _metrics->sqd+=(-ds*qd-dq*sd-dd*sq)*f1+ds*dq*dd*k3;
    _metrics->s2+=ds2*k2;
    _metrics->sq+=ds*dq*k2;
    _metrics->q2+=dq2*k2;
    _metrics->sr+=ds*dr*k2;
    _metrics->qr+=dq*dr*k2;
    _metrics->r2+=dr*dr*k2;
    _metrics->sd+=ds*dd*k2;
    _metrics->qd+=dq*dd*k2;
    _metrics->d2+=dd*dd*k2;
  }
  /*First-order sums and the total weight.*/
  _metrics->w+=_w;
  _metrics->s+=_s*_w;
  _metrics->q+=_q*_w;
  _metrics->r+=_r*_w;
  _metrics->d+=_d*_w;
}
|
||||
|
||||
/*Combines several sets of metrics into one.
  Sets with zero weight are ignored; if every set is empty, the destination is
   zeroed.
  _dst: The destination; its previous contents are discarded.
  _src: The array of sets to combine.
  _n:   The number of sets in _src.*/
void oc_mode_metrics_merge(oc_mode_metrics *_dst,
 const oc_mode_metrics *_src,int _n){
  int i;
  /*Find a non-empty set of metrics.*/
  i=0;
  while(i<_n&&_src[i].w==0)i++;
  if(i>=_n){
    memset(_dst,0,sizeof(*_dst));
    return;
  }
  memcpy(_dst,_src+i,sizeof(*_dst));
  /*And iterate over the remaining non-empty sets of metrics.*/
  for(i++;i<_n;i++){
    if(_src[i].w!=0){
      const oc_mode_metrics *b=_src+i;
      /*Weights and the offset between the two sets' means.*/
      const double wa=_dst->w;
      const double wb=b->w;
      const double ds=b->s/wb-_dst->s/wa;
      const double dq=b->q/wb-_dst->q/wa;
      const double dr=b->r/wb-_dst->r/wa;
      const double dd=b->d/wb-_dst->d/wa;
      const double ds2=ds*ds;
      const double dq2=dq*dq;
      /*Snapshot of the destination's accumulators (A)...*/
      const double s2a=_dst->s2;
      const double sqa=_dst->sq;
      const double q2a=_dst->q2;
      const double sra=_dst->sr;
      const double qra=_dst->qr;
      const double sda=_dst->sd;
      const double qda=_dst->qd;
      const double s2qa=_dst->s2q;
      const double sq2a=_dst->sq2;
      /*...and of the set being merged in (B).*/
      const double s2b=b->s2;
      const double sqb=b->sq;
      const double q2b=b->q2;
      const double srb=b->sr;
      const double qrb=b->qr;
      const double sdb=b->sd;
      const double qdb=b->qd;
      const double s2qb=b->s2q;
      const double sq2b=b->sq2;
      /*Relative weights; both are taken to be 0 if the total weight is 0.*/
      const double w=wa+wb;
      const double rwa=w==0?0:wa/w;
      const double rwb=w==0?0:wb/w;
      const double rwa2=rwa*rwa;
      const double rwb2=rwb*rwb;
      const double rw2=wa*rwb;
      const double rw3=rw2*(rwa2-rwb2);
      const double rw4=wb*rwa2*rwa2+wa*rwb2*rwb2;
      /*
      (1,1,1) ->
       (0,0,0)#
       (1,0,0) C(1,1)*C(1,0)*C(1,0)->  d^{1,0,0}*(rwa*B_{0,1,1}-rwb*A_{0,1,1})
       (0,1,0) C(1,0)*C(1,1)*C(1,0)->  d^{0,1,0}*(rwa*B_{1,0,1}-rwb*A_{1,0,1})
       (0,0,1) C(1,0)*C(1,0)*C(1,1)->  d^{0,0,1}*(rwa*B_{1,1,0}-rwb*A_{1,1,0})
       (1,1,0)*
       (1,0,1)*
       (0,1,1)*
       (1,1,1) C(1,1)*C(1,1)*C(1,1)->  d^{1,1,1}*(rwa^3*wb-rwb^3*wa)
      (2,1) ->
       (0,0)#
       (1,0) C(2,1)*C(1,1)->2*d^{1,0}*(rwa*B_{1,1}-rwb*A_{1,1})
       (0,1) C(2,0)*C(1,1)->  d^{0,1}*(rwa*B_{2,0}-rwb*A_{2,0})
       (2,0)*
       (1,1)*
       (2,1) C(2,2)*C(1,1)->  d^{2,1}*(rwa^3*wb-rwb^3*wa)
      (2,2) ->
       (0,0)#
       (1,0) C(2,1)*C(2,0)->2*d^{1,0}*(rwa*B_{1,2}-rwb*A_{1,2})
       (0,1) C(2,0)*C(2,1)->2*d^{0,1}*(rwa*B_{2,1}-rwb*A_{2,1})
       (2,0) C(2,2)*C(2,0)->  d^{2,0}*(rwa^2*B_{0,2}+rwb^2*A_{0,2})
       (1,1) C(2,1)*C(2,1)->4*d^{1,1}*(rwa^2*B_{1,1}+rwb^2*A_{1,1})
       (0,2) C(2,0)*C(2,2)->  d^{0,2}*(rwa^2*B_{2,0}+rwb^2*A_{2,0})
       (1,2)*
       (2,1)*
       (2,2) C(2,2)*C(2,2)*d^{2,2}*(rwa^4*wb+rwb^4*wa)
      */
      _dst->s2q2+=b->s2q2+2*(ds*(rwa*sq2b-rwb*sq2a)+dq*(rwa*s2qb-rwb*s2qa))
       +ds2*(rwa2*q2b+rwb2*q2a)+4*ds*dq*(rwa2*sqb+rwb2*sqa)
       +dq2*(rwa2*s2b+rwb2*s2a)+ds2*dq2*rw4;
      _dst->s2q+=b->s2q+2*ds*(rwa*sqb-rwb*sqa)
       +dq*(rwa*s2b-rwb*s2a)+ds2*dq*rw3;
      _dst->sq2+=b->sq2+ds*(rwa*q2b-rwb*q2a)
       +2*dq*(rwa*sqb-rwb*sqa)+ds*dq2*rw3;
      _dst->sqr+=b->sqr+ds*(rwa*qrb-rwb*qra)+dq*(rwa*srb-rwb*sra)
       +dr*(rwa*sqb-rwb*sqa)+ds*dq*dr*rw3;
      _dst->sqd+=b->sqd+ds*(rwa*qdb-rwb*qda)+dq*(rwa*sdb-rwb*sda)
       +dd*(rwa*sqb-rwb*sqa)+ds*dq*dd*rw3;
      _dst->s2+=b->s2+ds2*rw2;
      _dst->sq+=b->sq+ds*dq*rw2;
      _dst->q2+=b->q2+dq2*rw2;
      _dst->sr+=b->sr+ds*dr*rw2;
      _dst->qr+=b->qr+dq*dr*rw2;
      _dst->r2+=b->r2+dr*dr*rw2;
      _dst->sd+=b->sd+ds*dd*rw2;
      _dst->qd+=b->qd+dq*dd*rw2;
      _dst->d2+=b->d2+dd*dd*rw2;
      /*First-order sums and the total weight simply add.*/
      _dst->w+=b->w;
      _dst->s+=b->s;
      _dst->q+=b->q;
      _dst->r+=b->r;
      _dst->d+=b->d;
    }
  }
}
|
||||
|
||||
/*Adjust a single corner of a set of metric bins to minimize the squared
|
||||
prediction error of R and D.
|
||||
Each bin is assumed to cover a quad like so:
|
||||
(s0,q0) (s1,q0)
|
||||
A----------B
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
| |
|
||||
C----------Z
|
||||
(s0,q1) (s1,q1)
|
||||
The values A, B, and C are fixed, and Z is the free parameter.
|
||||
Then, for example, R_i is predicted via bilinear interpolation as
|
||||
x_i=(s_i-s0)/(s1-s0)
|
||||
y_i=(q_i-q0)/(q1-q0)
|
||||
dRds1_i=A+(B-A)*x_i
|
||||
dRds2_i=C+(Z-C)*x_i
|
||||
R_i=dRds1_i+(dRds2_i-dRds1_i)*y_i
|
||||
To find the Z that minimizes the squared prediction error over i, this can
|
||||
be rewritten as
|
||||
R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i)=x_i*y_i*Z
|
||||
Letting X={...,x_i*y_i,...}^T and
|
||||
Y={...,R_i-(A+(B-A)*x_i+(C-A)*y_i+(A-B-C)*x_i*y_i),...}^T,
|
||||
the optimal Z is given by Z=(X^T.Y)/(X^T.X).
|
||||
Now, we need to compute these dot products without actually storing data for
|
||||
each sample.
|
||||
Starting with X^T.X, we have
|
||||
X^T.X = sum(x_i^2*y_i^2) = sum((s_i-s0)^2*(q_i-q0)^2)/((s1-s0)^2*(q1-q0)^2).
|
||||
Expanding the interior of the sum in a monomial basis of s_i and q_i gives
|
||||
s0^2*q0^2 *(1)
|
||||
-2*s0*q0^2*(s_i)
|
||||
-2*s0^2*q0*(q_i)
|
||||
+q0^2 *(s_i^2)
|
||||
+4*s0*q0 *(s_i*q_i)
|
||||
+s0^2 *(q_i^2)
|
||||
-2*q0 *(s_i^2*q_i)
|
||||
-2*s0 *(s_i*q_i^2)
|
||||
+1 *(s_i^2*q_i^2).
|
||||
However, computing things directly in this basis leads to gross numerical
|
||||
errors, as most of the terms will have similar size and destructive
|
||||
cancellation results.
|
||||
A much better basis is the central (co-)moment basis:
|
||||
{1,s_i-sbar,q_i-qbar,(s_i-sbar)^2,(s_i-sbar)*(q_i-qbar),(q_i-qbar)^2,
|
||||
(s_i-sbar)^2*(q_i-qbar),(s_i-sbar)*(q_i-qbar)^2,(s_i-sbar)^2*(q_i-qbar)^2},
|
||||
where sbar and qbar are the average s and q values over the bin,
|
||||
respectively.
|
||||
In that basis, letting ds=sbar-s0 and dq=qbar-q0, (s_i-s0)^2*(q_i-q0)^2 is
|
||||
ds^2*dq^2*(1)
|
||||
+dq^2 *((s_i-sbar)^2)
|
||||
+4*ds*dq*((s_i-sbar)*(q_i-qbar))
|
||||
+ds^2 *((q_i-qbar)^2)
|
||||
+2*dq *((s_i-sbar)^2*(q_i-qbar))
|
||||
+2*ds *((s_i-sbar)*(q_i-qbar)^2)
|
||||
+1 *((s_i-sbar)^2*(q_i-qbar)^2).
|
||||
With these expressions in the central (co-)moment bases, all we need to do
|
||||
is compute sums over the (co-)moment terms, which can be done
|
||||
incrementally (see oc_mode_metrics_add() and oc_mode_metrics_merge()),
|
||||
with no need to store the individual samples.
|
||||
Now, for X^T.Y, we have
|
||||
X^T.Y = sum((R_i-A-((B-A)/(s1-s0))*(s_i-s0)-((C-A)/(q1-q0))*(q_i-q0)
|
||||
-((A-B-C)/((s1-s0)*(q1-q0)))*(s_i-s0)*(q_i-q0))*(s_i-s0)*(q_i-q0))/
|
||||
((s1-s0)*(q1-q0)),
|
||||
or, rewriting the constants to simplify notation,
|
||||
X^T.Y = sum((C0+C1*(s_i-s0)+C2*(q_i-q0)
|
||||
+C3*(s_i-s0)*(q_i-q0)+R_i)*(s_i-s0)*(q_i-q0))/((s1-s0)*(q1-q0)).
|
||||
Again, converting to the central (co-)moment basis, the interior of the
|
||||
above sum is
|
||||
ds*dq*(rbar+C0+C1*ds+C2*dq+C3*ds*dq) *(1)
|
||||
+(C1*dq+C3*dq^2) *((s_i-sbar)^2)
|
||||
+(rbar+C0+2*C1*ds+2*C2*dq+4*C3*ds*dq)*((s_i-sbar)*(q_i-qbar))
|
||||
+(C2*ds+C3*ds^2) *((q_i-qbar)^2)
|
||||
+dq *((s_i-sbar)*(r_i-rbar))
|
||||
+ds *((q_i-qbar)*(r_i-rbar))
|
||||
+(C1+2*C3*dq) *((s_i-sbar)^2*(q_i-qbar))
|
||||
+(C2+2*C3*ds) *((s_i-sbar)*(q_i-qbar)^2)
|
||||
+1 *((s_i-sbar)*(q_i-qbar)*(r_i-rbar))
|
||||
+C3 *((s_i-sbar)^2*(q_i-qbar)^2).
|
||||
You might think it would be easier (if perhaps slightly less robust) to
|
||||
accumulate terms directly around s0 and q0.
|
||||
However, we update each corner of the bins in turn, so we would have to
|
||||
change basis to move the sums from corner to corner anyway.*/
|
||||
double oc_mode_metrics_solve(double *_r,double *_d,
|
||||
const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
|
||||
const int *_q0,const int *_q1,
|
||||
const double *_ra,const double *_rb,const double *_rc,
|
||||
const double *_da,const double *_db,const double *_dc,int _n){
|
||||
double xx;
|
||||
double rxy;
|
||||
double dxy;
|
||||
double wt;
|
||||
int i;
|
||||
xx=rxy=dxy=wt=0;
|
||||
for(i=0;i<_n;i++)if(_metrics[i].w>0){
|
||||
double s10;
|
||||
double q10;
|
||||
double sq10;
|
||||
double ds;
|
||||
double dq;
|
||||
double ds2;
|
||||
double dq2;
|
||||
double r;
|
||||
double d;
|
||||
double s2;
|
||||
double sq;
|
||||
double q2;
|
||||
double sr;
|
||||
double qr;
|
||||
double sd;
|
||||
double qd;
|
||||
double s2q;
|
||||
double sq2;
|
||||
double sqr;
|
||||
double sqd;
|
||||
double s2q2;
|
||||
double c0;
|
||||
double c1;
|
||||
double c2;
|
||||
double c3;
|
||||
double w;
|
||||
w=_metrics[i].w;
|
||||
wt+=w;
|
||||
s10=_s1[i]-_s0[i];
|
||||
q10=_q1[i]-_q0[i];
|
||||
sq10=s10*q10;
|
||||
ds=_metrics[i].s/w-_s0[i];
|
||||
dq=_metrics[i].q/w-_q0[i];
|
||||
ds2=ds*ds;
|
||||
dq2=dq*dq;
|
||||
s2=_metrics[i].s2;
|
||||
sq=_metrics[i].sq;
|
||||
q2=_metrics[i].q2;
|
||||
s2q=_metrics[i].s2q;
|
||||
sq2=_metrics[i].sq2;
|
||||
s2q2=_metrics[i].s2q2;
|
||||
xx+=(dq2*(ds2*w+s2)+4*ds*dq*sq+ds2*q2+2*(dq*s2q+ds*sq2)+s2q2)/(sq10*sq10);
|
||||
r=_metrics[i].r/w;
|
||||
sr=_metrics[i].sr;
|
||||
qr=_metrics[i].qr;
|
||||
sqr=_metrics[i].sqr;
|
||||
c0=-_ra[i];
|
||||
c1=-(_rb[i]-_ra[i])/s10;
|
||||
c2=-(_rc[i]-_ra[i])/q10;
|
||||
c3=-(_ra[i]-_rb[i]-_rc[i])/sq10;
|
||||
rxy+=(ds*dq*(r+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
|
||||
+(r+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sr+ds*qr
|
||||
+(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqr+c3*s2q2)/sq10;
|
||||
d=_metrics[i].d/w;
|
||||
sd=_metrics[i].sd;
|
||||
qd=_metrics[i].qd;
|
||||
sqd=_metrics[i].sqd;
|
||||
c0=-_da[i];
|
||||
c1=-(_db[i]-_da[i])/s10;
|
||||
c2=-(_dc[i]-_da[i])/q10;
|
||||
c3=-(_da[i]-_db[i]-_dc[i])/sq10;
|
||||
dxy+=(ds*dq*(d+c0+c1*ds+c2*dq+c3*ds*dq)*w+(c1*dq+c3*dq2)*s2
|
||||
+(d+c0+2*(c1*ds+(c2+2*c3*ds)*dq))*sq+(c2*ds+c3*ds2)*q2+dq*sd+ds*qd
|
||||
+(c1+2*c3*dq)*s2q+(c2+2*c3*ds)*sq2+sqd+c3*s2q2)/sq10;
|
||||
}
|
||||
if(xx>1E-3){
|
||||
*_r=rxy/xx;
|
||||
*_d=dxy/xx;
|
||||
}
|
||||
else{
|
||||
*_r=0;
|
||||
*_d=0;
|
||||
}
|
||||
return wt;
|
||||
}
|
||||
|
||||
/*Compile collected SATD/logq/rate/RMSE metrics into a form that's immediately
|
||||
useful for mode decision.*/
|
||||
void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
|
||||
int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
|
||||
int _shift,double (*_weight)[3][2][OC_COMP_BINS]){
|
||||
int niters;
|
||||
int prevdr;
|
||||
int prevdd;
|
||||
int dr;
|
||||
int dd;
|
||||
int pli;
|
||||
int qti;
|
||||
int qi;
|
||||
int si;
|
||||
dd=dr=INT_MAX;
|
||||
niters=0;
|
||||
/*The encoder interpolates rate and RMSE terms bilinearly from an
|
||||
OC_LOGQ_BINS by OC_COMP_BINS grid of sample points in _table.
|
||||
To find the sample values at the grid points that minimize the total
|
||||
squared prediction error actually requires solving a relatively sparse
|
||||
linear system with a number of variables equal to the number of grid
|
||||
points.
|
||||
Instead of writing a general sparse linear system solver, we just use
|
||||
Gauss-Seidel iteration, i.e., we update one grid point at time until
|
||||
they stop changing.*/
|
||||
do{
|
||||
prevdr=dr;
|
||||
prevdd=dd;
|
||||
dd=dr=0;
|
||||
for(pli=0;pli<3;pli++){
|
||||
for(qti=0;qti<2;qti++){
|
||||
for(qi=0;qi<OC_LOGQ_BINS;qi++){
|
||||
for(si=0;si<OC_COMP_BINS;si++){
|
||||
oc_mode_metrics m[4];
|
||||
int s0[4];
|
||||
int s1[4];
|
||||
int q0[4];
|
||||
int q1[4];
|
||||
double ra[4];
|
||||
double rb[4];
|
||||
double rc[4];
|
||||
double da[4];
|
||||
double db[4];
|
||||
double dc[4];
|
||||
double r;
|
||||
double d;
|
||||
int rate;
|
||||
int rmse;
|
||||
int ds;
|
||||
int n;
|
||||
n=0;
|
||||
/*Collect the statistics for the (up to) four bins grid point
|
||||
(si,qi) touches.*/
|
||||
if(qi>0&&si>0){
|
||||
q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
|
||||
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||
s0[n]=si-1<<_shift;
|
||||
s1[n]=si<<_shift;
|
||||
ra[n]=ldexp(_table[qi-1][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||
da[n]=ldexp(_table[qi-1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||
rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||
db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||
rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||
dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||
*(m+n++)=*(_metrics[qi-1][pli][qti]+si-1);
|
||||
}
|
||||
if(qi>0){
|
||||
ds=si+1<OC_COMP_BINS?1:-1;
|
||||
q0[n]=OC_MODE_LOGQ[qi-1][pli][qti];
|
||||
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||
s0[n]=si+ds<<_shift;
|
||||
s1[n]=si<<_shift;
|
||||
ra[n]=ldexp(_table[qi-1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||
da[n]=
|
||||
ldexp(_table[qi-1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||
rb[n]=ldexp(_table[qi-1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||
db[n]=ldexp(_table[qi-1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||
rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||
dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||
*(m+n++)=*(_metrics[qi-1][pli][qti]+si);
|
||||
}
|
||||
if(qi+1<OC_LOGQ_BINS&&si>0){
|
||||
q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
|
||||
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||
s0[n]=si-1<<_shift;
|
||||
s1[n]=si<<_shift;
|
||||
ra[n]=ldexp(_table[qi+1][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||
da[n]=ldexp(_table[qi+1][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||
rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||
db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||
rc[n]=ldexp(_table[qi][pli][qti][si-1].rate,-OC_BIT_SCALE);
|
||||
dc[n]=ldexp(_table[qi][pli][qti][si-1].rmse,-OC_RMSE_SCALE);
|
||||
*(m+n++)=*(_metrics[qi][pli][qti]+si-1);
|
||||
}
|
||||
if(qi+1<OC_LOGQ_BINS){
|
||||
ds=si+1<OC_COMP_BINS?1:-1;
|
||||
q0[n]=OC_MODE_LOGQ[qi+1][pli][qti];
|
||||
q1[n]=OC_MODE_LOGQ[qi][pli][qti];
|
||||
s0[n]=si+ds<<_shift;
|
||||
s1[n]=si<<_shift;
|
||||
ra[n]=ldexp(_table[qi+1][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||
da[n]=
|
||||
ldexp(_table[qi+1][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||
rb[n]=ldexp(_table[qi+1][pli][qti][si].rate,-OC_BIT_SCALE);
|
||||
db[n]=ldexp(_table[qi+1][pli][qti][si].rmse,-OC_RMSE_SCALE);
|
||||
rc[n]=ldexp(_table[qi][pli][qti][si+ds].rate,-OC_BIT_SCALE);
|
||||
dc[n]=ldexp(_table[qi][pli][qti][si+ds].rmse,-OC_RMSE_SCALE);
|
||||
*(m+n++)=*(_metrics[qi][pli][qti]+si);
|
||||
}
|
||||
/*On the first pass, initialize with a simple weighted average of
|
||||
the neighboring bins.*/
|
||||
if(!OC_HAS_MODE_METRICS&&niters==0){
|
||||
double w;
|
||||
w=r=d=0;
|
||||
while(n-->0){
|
||||
w+=m[n].w;
|
||||
r+=m[n].r;
|
||||
d+=m[n].d;
|
||||
}
|
||||
r=w>1E-3?r/w:0;
|
||||
d=w>1E-3?d/w:0;
|
||||
_weight[qi][pli][qti][si]=w;
|
||||
}
|
||||
else{
|
||||
/*Update the grid point and save the weight for later.*/
|
||||
_weight[qi][pli][qti][si]=
|
||||
oc_mode_metrics_solve(&r,&d,m,s0,s1,q0,q1,ra,rb,rc,da,db,dc,n);
|
||||
}
|
||||
rate=OC_CLAMPI(-32768,(int)(ldexp(r,OC_BIT_SCALE)+0.5),32767);
|
||||
rmse=OC_CLAMPI(-32768,(int)(ldexp(d,OC_RMSE_SCALE)+0.5),32767);
|
||||
dr+=abs(rate-_table[qi][pli][qti][si].rate);
|
||||
dd+=abs(rmse-_table[qi][pli][qti][si].rmse);
|
||||
_table[qi][pli][qti][si].rate=(ogg_int16_t)rate;
|
||||
_table[qi][pli][qti][si].rmse=(ogg_int16_t)rmse;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*After a fixed number of initial iterations, only iterate so long as the
|
||||
total change is decreasing.
|
||||
This ensures we don't oscillate forever, which is a danger, as all of our
|
||||
results are rounded fairly coarsely.*/
|
||||
while((dr>0||dd>0)&&(niters++<_niters_min||(dr<prevdr&&dd<prevdd)));
|
||||
if(_reweight){
|
||||
/*Now, reduce the values of the optimal solution until we get enough
|
||||
samples in each bin to overcome the constant OC_ZWEIGHT factor.
|
||||
This encourages sampling under-populated bins and prevents a single large
|
||||
sample early on from discouraging coding in that bin ever again.*/
|
||||
for(pli=0;pli<3;pli++){
|
||||
for(qti=0;qti<2;qti++){
|
||||
for(qi=0;qi<OC_LOGQ_BINS;qi++){
|
||||
for(si=0;si<OC_COMP_BINS;si++){
|
||||
double wt;
|
||||
wt=_weight[qi][pli][qti][si];
|
||||
wt/=OC_ZWEIGHT+wt;
|
||||
_table[qi][pli][qti][si].rate=(ogg_int16_t)
|
||||
(_table[qi][pli][qti][si].rate*wt+0.5);
|
||||
_table[qi][pli][qti][si].rmse=(ogg_int16_t)
|
||||
(_table[qi][pli][qti][si].rmse*wt+0.5);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*Dump the in memory mode metrics to a file.
|
||||
Note this data format isn't portable between different platforms.*/
|
||||
void oc_mode_metrics_dump(void){
|
||||
FILE *fmetrics;
|
||||
fmetrics=fopen(OC_MODE_METRICS_FILENAME,"wb");
|
||||
if(fmetrics!=NULL){
|
||||
(void)fwrite(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics);
|
||||
(void)fwrite(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics);
|
||||
(void)fwrite(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics);
|
||||
fclose(fmetrics);
|
||||
}
|
||||
}
|
||||
|
||||
void oc_mode_metrics_print_rd(FILE *_fout,const char *_table_name,
|
||||
#if !defined(OC_COLLECT_METRICS)
|
||||
const oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
|
||||
#else
|
||||
oc_mode_rd (*_mode_rd_table)[3][2][OC_COMP_BINS]){
|
||||
#endif
|
||||
int qii;
|
||||
fprintf(_fout,
|
||||
"# if !defined(OC_COLLECT_METRICS)\n"
|
||||
"static const\n"
|
||||
"# endif\n"
|
||||
"oc_mode_rd %s[OC_LOGQ_BINS][3][2][OC_COMP_BINS]={\n",_table_name);
|
||||
for(qii=0;qii<OC_LOGQ_BINS;qii++){
|
||||
int pli;
|
||||
fprintf(_fout," {\n");
|
||||
for(pli=0;pli<3;pli++){
|
||||
int qti;
|
||||
fprintf(_fout," {\n");
|
||||
for(qti=0;qti<2;qti++){
|
||||
int bin;
|
||||
int qi;
|
||||
static const char *pl_names[3]={"Y'","Cb","Cr"};
|
||||
static const char *qti_names[2]={"INTRA","INTER"};
|
||||
qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
|
||||
fprintf(_fout," /*%s qi=%i %s*/\n",
|
||||
pl_names[pli],qi,qti_names[qti]);
|
||||
fprintf(_fout," {\n");
|
||||
fprintf(_fout," ");
|
||||
for(bin=0;bin<OC_COMP_BINS;bin++){
|
||||
if(bin&&!(bin&0x3))fprintf(_fout,"\n ");
|
||||
fprintf(_fout,"{%5i,%5i}",
|
||||
_mode_rd_table[qii][pli][qti][bin].rate,
|
||||
_mode_rd_table[qii][pli][qti][bin].rmse);
|
||||
if(bin+1<OC_COMP_BINS)fprintf(_fout,",");
|
||||
}
|
||||
fprintf(_fout,"\n }");
|
||||
if(qti<1)fprintf(_fout,",");
|
||||
fprintf(_fout,"\n");
|
||||
}
|
||||
fprintf(_fout," }");
|
||||
if(pli<2)fprintf(_fout,",");
|
||||
fprintf(_fout,"\n");
|
||||
}
|
||||
fprintf(_fout," }");
|
||||
if(qii+1<OC_LOGQ_BINS)fprintf(_fout,",");
|
||||
fprintf(_fout,"\n");
|
||||
}
|
||||
fprintf(_fout,
|
||||
"};\n"
|
||||
"\n");
|
||||
}
|
||||
|
||||
void oc_mode_metrics_print(FILE *_fout){
|
||||
int qii;
|
||||
fprintf(_fout,
|
||||
"/*File generated by libtheora with OC_COLLECT_METRICS"
|
||||
" defined at compile time.*/\n"
|
||||
"#if !defined(_modedec_H)\n"
|
||||
"# define _modedec_H (1)\n"
|
||||
"# include \"encint.h\"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"\n"
|
||||
"/*The log of the average quantizer for each of the OC_MODE_RD table rows\n"
|
||||
" (e.g., for the represented qi's, and each pli and qti), in Q10 format.\n"
|
||||
" The actual statistics used by the encoder will be interpolated from\n"
|
||||
" that table based on log_plq for the actual quantization matrix used.*/\n"
|
||||
"# if !defined(OC_COLLECT_METRICS)\n"
|
||||
"static const\n"
|
||||
"# endif\n"
|
||||
"ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2]={\n");
|
||||
for(qii=0;qii<OC_LOGQ_BINS;qii++){
|
||||
fprintf(_fout," { {0x%04X,0x%04X},{0x%04X,0x%04X},{0x%04X,0x%04X} }%s\n",
|
||||
OC_MODE_LOGQ[qii][0][0],OC_MODE_LOGQ[qii][0][1],OC_MODE_LOGQ[qii][1][0],
|
||||
OC_MODE_LOGQ[qii][1][1],OC_MODE_LOGQ[qii][2][0],OC_MODE_LOGQ[qii][2][1],
|
||||
qii+1<OC_LOGQ_BINS?",":"");
|
||||
}
|
||||
fprintf(_fout,
|
||||
"};\n"
|
||||
"\n");
|
||||
oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SATD",OC_MODE_RD_SATD);
|
||||
oc_mode_metrics_print_rd(_fout,"OC_MODE_RD_SAD",OC_MODE_RD_SAD);
|
||||
fprintf(_fout,
|
||||
"#endif\n");
|
||||
}
|
||||
|
||||
|
||||
# if !defined(OC_COLLECT_NO_ENC_FUNCS)
|
||||
/*Loads previously collected mode metrics, if that has not been done yet, and
   rebuilds the rate/RMSE tables from them.
  The metrics are read from OC_MODE_METRICS_FILENAME, in the raw format written
   by oc_mode_metrics_dump().
  If that file is missing, unreadable, or too short to contain all three
   tables, collection starts from empty metrics, with OC_MODE_LOGQ seeded from
   the encoder's own log_plq values, instead of continuing with partially
   overwritten tables.
  _enc: The encoder context.*/
void oc_enc_mode_metrics_load(oc_enc_ctx *_enc){
  FILE *fmetrics;
  int   loaded;
  oc_restore_fpu(&_enc->state);
  /*Load any existing mode metrics if we haven't already.*/
  if(OC_HAS_MODE_METRICS)return;
  memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
  memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
  loaded=0;
  fmetrics=fopen(OC_MODE_METRICS_FILENAME,"rb");
  if(fmetrics!=NULL){
    /*Read in the binary structures as written by oc_mode_metrics_dump().
      Note this format isn't portable between different platforms.
      Each table is read as a single item, so a short read returns 0.*/
    loaded=fread(OC_MODE_LOGQ,sizeof(OC_MODE_LOGQ),1,fmetrics)==1
     &&fread(OC_MODE_METRICS_SATD,sizeof(OC_MODE_METRICS_SATD),1,fmetrics)==1
     &&fread(OC_MODE_METRICS_SAD,sizeof(OC_MODE_METRICS_SAD),1,fmetrics)==1;
    fclose(fmetrics);
  }
  if(!loaded){
    int qii;
    int qi;
    int pli;
    int qti;
    /*A failed read may have filled part of the tables; discard that.*/
    memset(OC_MODE_METRICS_SATD,0,sizeof(OC_MODE_METRICS_SATD));
    memset(OC_MODE_METRICS_SAD,0,sizeof(OC_MODE_METRICS_SAD));
    for(qii=0;qii<OC_LOGQ_BINS;qii++){
      qi=(63*qii+(OC_LOGQ_BINS-1>>1))/(OC_LOGQ_BINS-1);
      for(pli=0;pli<3;pli++)for(qti=0;qti<2;qti++){
        OC_MODE_LOGQ[qii][pli][qti]=_enc->log_plq[qi][pli][qti];
      }
    }
  }
  oc_mode_metrics_update(OC_MODE_METRICS_SATD,100,1,
   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
  oc_mode_metrics_update(OC_MODE_METRICS_SAD,100,1,
   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
  OC_HAS_MODE_METRICS=1;
}
|
||||
|
||||
/*The following token skipping code used to also be used in the decoder (and
|
||||
even at one point other places in the encoder).
|
||||
However, it was obsoleted by other optimizations, and is now only used here.
|
||||
It has been moved here to avoid generating the code when it's not needed.*/
|
||||
|
||||
/*Determines the number of blocks or coefficients to be skipped for a given
|
||||
token value.
|
||||
_token: The token value to skip.
|
||||
_extra_bits: The extra bits attached to this token.
|
||||
Return: A positive value indicates that number of coefficients are to be
|
||||
skipped in the current block.
|
||||
Otherwise, the negative of the return value indicates that number of
|
||||
blocks are to be ended.*/
|
||||
typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
|
||||
|
||||
/*Handles the simple end of block tokens.*/
|
||||
static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
|
||||
int nblocks_adjust;
|
||||
nblocks_adjust=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
|
||||
return -_extra_bits-nblocks_adjust;
|
||||
}
|
||||
|
||||
/*The last EOB token has a special case, where an EOB run of size zero ends all
|
||||
the remaining blocks in the frame.*/
|
||||
static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
|
||||
/*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
|
||||
yet available everywhere; this should be equivalent.*/
|
||||
if(!_extra_bits)return -(~(size_t)0>>1);
|
||||
return -_extra_bits;
|
||||
}
|
||||
|
||||
/*Handles the pure zero run tokens.*/
|
||||
static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
|
||||
return _extra_bits+1;
|
||||
}
|
||||
|
||||
/*Handles a normal coefficient value token.*/
|
||||
static ptrdiff_t oc_token_skip_val(void){
|
||||
return 1;
|
||||
}
|
||||
|
||||
/*Handles a category 1A zero run/coefficient value combo token.*/
|
||||
static ptrdiff_t oc_token_skip_run_cat1a(int _token){
|
||||
return _token-OC_DCT_RUN_CAT1A+2;
|
||||
}
|
||||
|
||||
/*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
|
||||
static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
|
||||
int run_cati;
|
||||
int ncoeffs_mask;
|
||||
int ncoeffs_adjust;
|
||||
run_cati=_token-OC_DCT_RUN_CAT1B;
|
||||
ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
|
||||
ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
|
||||
return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
|
||||
}
|
||||
|
||||
/*A jump table for computing the number of coefficients or blocks to skip for
|
||||
a given token value.
|
||||
This reduces all the conditional branches, etc., needed to parse these token
|
||||
values down to one indirect jump.*/
|
||||
static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
|
||||
oc_token_skip_eob,
|
||||
oc_token_skip_eob,
|
||||
oc_token_skip_eob,
|
||||
oc_token_skip_eob,
|
||||
oc_token_skip_eob,
|
||||
oc_token_skip_eob,
|
||||
oc_token_skip_eob6,
|
||||
oc_token_skip_zrl,
|
||||
oc_token_skip_zrl,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_val,
|
||||
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||
(oc_token_skip_func)oc_token_skip_run_cat1a,
|
||||
oc_token_skip_run,
|
||||
oc_token_skip_run,
|
||||
oc_token_skip_run,
|
||||
oc_token_skip_run
|
||||
};
|
||||
|
||||
/*Determines the number of blocks or coefficients to be skipped for a given
|
||||
token value.
|
||||
_token: The token value to skip.
|
||||
_extra_bits: The extra bits attached to this token.
|
||||
Return: A positive value indicates that number of coefficients are to be
|
||||
skipped in the current block.
|
||||
Otherwise, the negative of the return value indicates that number of
|
||||
blocks are to be ended.
|
||||
0 will never be returned, so that at least one coefficient in one
|
||||
block will always be decoded for every token.*/
|
||||
static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
|
||||
return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
|
||||
}
|
||||
|
||||
|
||||
/*Adds the statistics of the frame that was just encoded to the global mode
   metrics, then refreshes the rate/RMSE tables.
  For every coded fragment, this re-walks the fragment's DCT tokens to count
   the bits spent on it, and records that rate, together with the fragment's
   SAD, SATD, and the square root of its SSD, in the bin selected by the
   fragment's plane, quantizer, and coding type.
  _enc: The encoder context.*/
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
  /*The offset added to the Huffman table index for each zig-zag index.
    All 64 entries are listed explicitly: any entry left out of the
     initializer would silently be 0, which is the value used for the DC
     coefficient.*/
  static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
    0,16,16,16,16,16,32,32,
    32,32,32,32,32,32,32,48,
    48,48,48,48,48,48,48,48,
    48,48,48,48,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64,
    64,64,64,64,64,64,64,64
  };
  const oc_fragment *frags;
  const unsigned    *frag_sad;
  const unsigned    *frag_satd;
  const unsigned    *frag_ssd;
  const ptrdiff_t   *coded_fragis;
  ptrdiff_t          ncoded_fragis;
  ptrdiff_t          fragii;
  double             fragw;
  int                modelines[3][3][2];
  int                qti;
  int                qii;
  int                qi;
  int                pli;
  int                zzi;
  int                token;
  int                eb;
  oc_restore_fpu(&_enc->state);
  /*Figure out which metric bins to use for this frame's quantizers.*/
  for(qii=0;qii<_enc->state.nqis;qii++){
    for(pli=0;pli<3;pli++){
      for(qti=0;qti<2;qti++){
        int log_plq;
        int modeline;
        log_plq=_enc->log_plq[_enc->state.qis[qii]][pli][qti];
        for(modeline=0;modeline<OC_LOGQ_BINS-1&&
         OC_MODE_LOGQ[modeline+1][pli][qti]>log_plq;modeline++);
        modelines[qii][pli][qti]=modeline;
      }
    }
  }
  qti=_enc->state.frame_type;
  frags=_enc->state.frags;
  frag_sad=_enc->frag_sad;
  frag_satd=_enc->frag_satd;
  frag_ssd=_enc->frag_ssd;
  coded_fragis=_enc->state.coded_fragis;
  ncoded_fragis=fragii=0;
  /*Weight the fragments by the inverse frame size; this prevents HD content
     from dominating the statistics.*/
  fragw=1.0/_enc->state.nfrags;
  for(pli=0;pli<3;pli++){
    ptrdiff_t ti[64];
    int       eob_token[64];
    /*Run lengths are kept as ptrdiff_t, the type oc_dct_token_skip() returns:
       the "end all remaining blocks" run is -PTRDIFF_MAX, which does not fit
       in an int on platforms where ptrdiff_t is wider.*/
    ptrdiff_t eob_run[64];
    /*Set up token indices and eob run counts.
      We don't bother trying to figure out the real cost of the runs that span
       coefficients; instead we use the costs that were available when R-D
       token optimization was done.*/
    for(zzi=0;zzi<64;zzi++){
      ti[zzi]=_enc->dct_token_offs[pli][zzi];
      if(ti[zzi]>0){
        token=_enc->dct_tokens[pli][zzi][0];
        eb=_enc->extra_bits[pli][zzi][0];
        eob_token[zzi]=token;
        eob_run[zzi]=-oc_dct_token_skip(token,eb);
      }
      else{
        eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
        eob_run[zzi]=0;
      }
    }
    /*Scan the list of coded fragments for this plane.*/
    ncoded_fragis+=_enc->state.ncoded_fragis[pli];
    for(;fragii<ncoded_fragis;fragii++){
      ptrdiff_t fragi;
      ptrdiff_t skip;
      int       frag_bits;
      int       huffi;
      int       mb_mode;
      unsigned  sad;
      unsigned  satd;
      double    sqrt_ssd;
      int       bin;
      int       qtj;
      fragi=coded_fragis[fragii];
      frag_bits=0;
      for(zzi=0;zzi<64;){
        if(eob_run[zzi]>0){
          /*We've reached the end of the block.*/
          eob_run[zzi]--;
          break;
        }
        huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
         +OC_ZZI_HUFF_OFFSET[zzi];
        if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
          /*This token caused an EOB run to be flushed.
            Therefore it gets the bits associated with it.*/
          frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
           +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
          eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
        }
        token=_enc->dct_tokens[pli][zzi][ti[zzi]];
        eb=_enc->extra_bits[pli][zzi][ti[zzi]];
        ti[zzi]++;
        skip=oc_dct_token_skip(token,eb);
        if(skip<0){
          eob_token[zzi]=token;
          eob_run[zzi]=-skip;
        }
        else{
          /*A regular DCT value token; accumulate the bits for it.*/
          frag_bits+=_enc->huff_codes[huffi][token].nbits
           +OC_DCT_TOKEN_EXTRA_BITS[token];
          zzi+=(int)skip;
        }
      }
      mb_mode=frags[fragi].mb_mode;
      qii=frags[fragi].qii;
      qi=_enc->state.qis[qii];
      /*The shift is 0 for pli 0 and 2 for pli 1 and 2.*/
      sad=frag_sad[fragi]<<(pli+1&2);
      satd=frag_satd[fragi]<<(pli+1&2);
      sqrt_ssd=sqrt(frag_ssd[fragi]);
      qtj=mb_mode!=OC_MODE_INTRA;
      /*Accumulate statistics.
        The rate (frag_bits) and RMSE (sqrt(frag_ssd)) are not scaled by
         OC_BIT_SCALE and OC_RMSE_SCALE; this lets us change the scale factor
         yet still use old data.*/
      bin=OC_MINI(satd>>OC_SATD_SHIFT,OC_COMP_BINS-1);
      oc_mode_metrics_add(
       OC_MODE_METRICS_SATD[modelines[qii][pli][qtj]][pli][qtj]+bin,
       fragw,satd,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
      bin=OC_MINI(sad>>OC_SAD_SHIFT,OC_COMP_BINS-1);
      oc_mode_metrics_add(
       OC_MODE_METRICS_SAD[modelines[qii][pli][qtj]][pli][qtj]+bin,
       fragw,sad,_enc->log_plq[qi][pli][qtj],frag_bits,sqrt_ssd);
    }
  }
  /*Update global SA(T)D/logq/rate/RMSE estimation matrix.*/
  oc_mode_metrics_update(OC_MODE_METRICS_SATD,4,1,
   OC_MODE_RD_SATD,OC_SATD_SHIFT,OC_MODE_RD_WEIGHT_SATD);
  oc_mode_metrics_update(OC_MODE_METRICS_SAD,4,1,
   OC_MODE_RD_SAD,OC_SAD_SHIFT,OC_MODE_RD_WEIGHT_SAD);
}
|
||||
# endif
|
||||
|
||||
#endif
|
||||
109
engine/thirdparty/libtheora/collect.h
vendored
Normal file
109
engine/thirdparty/libtheora/collect.h
vendored
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function: mode selection code
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_collect_H)
|
||||
# define _collect_H (1)
|
||||
# include "encint.h"
|
||||
# if defined(OC_COLLECT_METRICS)
|
||||
# include <stdio.h>
|
||||
|
||||
|
||||
|
||||
typedef struct oc_mode_metrics oc_mode_metrics;
|
||||
|
||||
|
||||
|
||||
/**Sets the file name to load/store mode metrics from/to.
|
||||
* The file name string is stored by reference, and so must be valid for the
|
||||
* lifetime of the encoder.
|
||||
* Mode metric collection uses global tables; do not attempt to perform
|
||||
* multiple collections at once.
|
||||
* \param[in] _buf <tt>char[]</tt> The file name.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_METRICS_FILE (0x8000)
|
||||
|
||||
|
||||
|
||||
/*Accumulates various weighted sums of the measurements.
  The letters used in the member names are
   w -> weight
   s -> SATD
   q -> log quantizer
   r -> rate (in bits)
   d -> RMSE
  Members named with a single letter are direct, weighted sums, e.g.,
   w=sum(w_i), s=sum(s_i*w_i), etc.
  All the other members are central moments (or co-moments) of the given
   order, e.g., sq=sum((s_i-s/w)*(q_i-q/w)*w_i).
  Because we need some moments up to fourth order, we use central moments to
   minimize the dynamic range and prevent rounding error from dominating the
   calculations.
  Arrays of this struct are saved and loaded as raw memory, so the order of
   the members must not change.*/
struct oc_mode_metrics{
  /*Direct weighted sums.*/
  double w;
  double s;
  double q;
  double r;
  double d;
  /*Second order central (co-)moments.*/
  double s2;
  double sq;
  double q2;
  double sr;
  double qr;
  double r2;
  double sd;
  double qd;
  double d2;
  /*Third order central co-moments.*/
  double s2q;
  double sq2;
  double sqr;
  double sqd;
  /*Fourth order central co-moment.*/
  double s2q2;
};
|
||||
|
||||
|
||||
# define OC_ZWEIGHT (0.25)
|
||||
|
||||
/*TODO: It may be helpful (for block-level quantizers especially) to separate
|
||||
out the contributions from AC and DC into separate tables.*/
|
||||
|
||||
extern ogg_int16_t OC_MODE_LOGQ[OC_LOGQ_BINS][3][2];
|
||||
extern oc_mode_rd OC_MODE_RD_SATD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||
extern oc_mode_rd OC_MODE_RD_SAD[OC_LOGQ_BINS][3][2][OC_COMP_BINS];
|
||||
|
||||
extern int OC_HAS_MODE_METRICS;
|
||||
extern oc_mode_metrics OC_MODE_METRICS_SATD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||
extern oc_mode_metrics OC_MODE_METRICS_SAD[OC_LOGQ_BINS-1][3][2][OC_COMP_BINS];
|
||||
extern const char *OC_MODE_METRICS_FILENAME;
|
||||
|
||||
void oc_mode_metrics_dump();
|
||||
void oc_mode_metrics_print(FILE *_fout);
|
||||
|
||||
void oc_mode_metrics_add(oc_mode_metrics *_metrics,
|
||||
double _w,int _s,int _q,int _r,double _d);
|
||||
void oc_mode_metrics_merge(oc_mode_metrics *_dst,
|
||||
const oc_mode_metrics *_src,int _n);
|
||||
double oc_mode_metrics_solve(double *_r,double *_d,
|
||||
const oc_mode_metrics *_metrics,const int *_s0,const int *_s1,
|
||||
const int *_q0,const int *_q1,
|
||||
const double *_ra,const double *_rb,const double *_rc,
|
||||
const double *_da,const double *_db,const double *_dc,int _n);
|
||||
void oc_mode_metrics_update(oc_mode_metrics (*_metrics)[3][2][OC_COMP_BINS],
|
||||
int _niters_min,int _reweight,oc_mode_rd (*_table)[3][2][OC_COMP_BINS],
|
||||
int shift,double (*_weight)[3][2][OC_COMP_BINS]);
|
||||
void oc_enc_mode_metrics_load(oc_enc_ctx *_enc);
|
||||
void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc);
|
||||
|
||||
# endif
|
||||
#endif
|
||||
31
engine/thirdparty/libtheora/dct.h
vendored
Normal file
31
engine/thirdparty/libtheora/dct.h
vendored
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*Definitions shared by the forward and inverse DCT transforms.*/
|
||||
#if !defined(_dct_H)
|
||||
# define _dct_H (1)
|
||||
|
||||
/*cos(n*pi/16) (resp. sin(m*pi/16)) scaled by 65536.
  OC_CnSm is both cos(n*pi/16) and sin(m*pi/16), where n+m==8.*/
#define OC_C1S7 ((ogg_int32_t)64277) /*cos(1*pi/16)==sin(7*pi/16)*/
#define OC_C2S6 ((ogg_int32_t)60547) /*cos(2*pi/16)==sin(6*pi/16)*/
#define OC_C3S5 ((ogg_int32_t)54491) /*cos(3*pi/16)==sin(5*pi/16)*/
#define OC_C4S4 ((ogg_int32_t)46341) /*cos(4*pi/16)==sin(4*pi/16)*/
#define OC_C5S3 ((ogg_int32_t)36410) /*cos(5*pi/16)==sin(3*pi/16)*/
#define OC_C6S2 ((ogg_int32_t)25080) /*cos(6*pi/16)==sin(2*pi/16)*/
#define OC_C7S1 ((ogg_int32_t)12785) /*cos(7*pi/16)==sin(1*pi/16)*/
|
||||
|
||||
#endif
|
||||
193
engine/thirdparty/libtheora/decapiwrapper.c
vendored
Normal file
193
engine/thirdparty/libtheora/decapiwrapper.c
vendored
Normal file
|
|
@ -0,0 +1,193 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: decapiwrapper.c 13596 2007-08-23 20:05:38Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "apiwrapper.h"
|
||||
#include "decint.h"
|
||||
#include "theora/theoradec.h"
|
||||
|
||||
/*Frees the setup info and the decoder held by an API wrapper, if any, and
   then zeroes the whole wrapper.
  _api: The API wrapper to clear.*/
static void th_dec_api_clear(th_api_wrapper *_api){
  if(_api->setup){
    th_setup_free(_api->setup);
  }
  if(_api->decode){
    th_decode_free(_api->decode);
  }
  memset(_api,0,sizeof(*_api));
}
|
||||
|
||||
/*Clears the theora_info attached to a decoder state, if there is one, and
   then zeroes the state itself.
  _td: The decoder state to clear.*/
static void theora_decode_clear(theora_state *_td){
  theora_info *info;
  info=_td->i;
  if(info!=NULL){
    theora_info_clear(info);
  }
  memset(_td,0,sizeof(*_td));
}
|
||||
|
||||
/*Forwards a control request to th_decode_ctl() for the decoder stored in the
   API wrapper of this state's info struct.
  _td:     The decoder state.
  _req:    The request code.
  _buf:    The request's parameter buffer.
  _buf_sz: The size of the parameter buffer.
  Return: The value returned by th_decode_ctl().*/
static int theora_decode_control(theora_state *_td,int _req,
 void *_buf,size_t _buf_sz){
  th_api_wrapper *api;
  api=(th_api_wrapper *)_td->i->codec_setup;
  return th_decode_ctl(api->decode,_req,_buf,_buf_sz);
}
|
||||
|
||||
/*Forwards a granule position to th_granule_frame() for the decoder stored in
   the API wrapper of this state's info struct.
  _td: The decoder state.
  _gp: The granule position to convert.
  Return: The value returned by th_granule_frame().*/
static ogg_int64_t theora_decode_granule_frame(theora_state *_td,
 ogg_int64_t _gp){
  th_api_wrapper *api;
  api=(th_api_wrapper *)_td->i->codec_setup;
  return th_granule_frame(api->decode,_gp);
}
|
||||
|
||||
/*Forwards a granule position to th_granule_time() for the decoder stored in
   the API wrapper of this state's info struct.
  _td: The decoder state.
  _gp: The granule position to convert.
  Return: The value returned by th_granule_time().*/
static double theora_decode_granule_time(theora_state *_td,ogg_int64_t _gp){
  th_api_wrapper *api;
  api=(th_api_wrapper *)_td->i->codec_setup;
  return th_granule_time(api->decode,_gp);
}
|
||||
|
||||
static const oc_state_dispatch_vtable OC_DEC_DISPATCH_VTBL={
|
||||
(oc_state_clear_func)theora_decode_clear,
|
||||
(oc_state_control_func)theora_decode_control,
|
||||
(oc_state_granule_frame_func)theora_decode_granule_frame,
|
||||
(oc_state_granule_time_func)theora_decode_granule_time,
|
||||
};
|
||||
|
||||
/*Fills in a legacy theora_info struct from a th_info struct.
  The two APIs name the sizes differently: the th_info frame size becomes the
   theora_info width/height, and the th_info picture region becomes the
   theora_info frame size and offset.
  Color spaces and pixel formats without a legacy equivalent are mapped to
   OC_CS_UNSPECIFIED and OC_PF_RSVD, respectively.
  _ci:   The theora_info struct to fill in.
         Only the members listed below are written.
  _info: The th_info struct to convert.*/
static void th_info2theora_info(theora_info *_ci,const th_info *_info){
  _ci->version_major=_info->version_major;
  _ci->version_minor=_info->version_minor;
  _ci->version_subminor=_info->version_subminor;
  _ci->width=_info->frame_width;
  _ci->height=_info->frame_height;
  _ci->frame_width=_info->pic_width;
  _ci->frame_height=_info->pic_height;
  _ci->offset_x=_info->pic_x;
  _ci->offset_y=_info->pic_y;
  _ci->fps_numerator=_info->fps_numerator;
  _ci->fps_denominator=_info->fps_denominator;
  _ci->aspect_numerator=_info->aspect_numerator;
  _ci->aspect_denominator=_info->aspect_denominator;
  switch(_info->colorspace){
    case TH_CS_ITU_REC_470M:_ci->colorspace=OC_CS_ITU_REC_470M;break;
    case TH_CS_ITU_REC_470BG:_ci->colorspace=OC_CS_ITU_REC_470BG;break;
    default:_ci->colorspace=OC_CS_UNSPECIFIED;break;
  }
  switch(_info->pixel_fmt){
    case TH_PF_420:_ci->pixelformat=OC_PF_420;break;
    case TH_PF_422:_ci->pixelformat=OC_PF_422;break;
    case TH_PF_444:_ci->pixelformat=OC_PF_444;break;
    default:_ci->pixelformat=OC_PF_RSVD;break;
  }
  _ci->target_bitrate=_info->target_bitrate;
  _ci->quality=_info->quality;
  /*Shift an unsigned 1: with a signed 1, a granule shift of 31 would overflow
     a 32-bit int, which is undefined behavior.*/
  _ci->keyframe_frequency_force=1U<<_info->keyframe_granule_shift;
}
|
||||
|
||||
/*Initializes a decoder state from an info struct.
  _td: The decoder state to initialize.
  _ci: The info struct describing the stream.
       Its codec_setup member must point to the API wrapper holding the setup
        info for the stream.
  Return: 0 on success.
          OC_FAULT if the combined wrapper/info struct could not be allocated.
          OC_EINVAL if _ci has no codec_setup, or if th_decode_alloc() failed.*/
int theora_decode_init(theora_state *_td,theora_info *_ci){
  th_api_info    *apiinfo;
  th_api_wrapper *api;
  th_info         info;
  api=(th_api_wrapper *)_ci->codec_setup;
  /*Without an API wrapper there is no setup info to decode with.
    Report this the same way as a failure of th_decode_alloc(), instead of
     dereferencing a NULL pointer below.*/
  if(api==NULL)return OC_EINVAL;
  /*Allocate our own combined API wrapper/theora_info struct.
    We put them both in one malloc'd block so that when the API wrapper is
     freed, the info struct goes with it.
    This avoids having to figure out whether or not we need to free the info
     struct in either theora_info_clear() or theora_clear().*/
  apiinfo=(th_api_info *)_ogg_calloc(1,sizeof(*apiinfo));
  if(apiinfo==NULL)return OC_FAULT;
  /*Make our own copy of the info struct, since its lifetime should be
     independent of the one we were passed in.*/
  apiinfo->info=*_ci;
  /*Convert the info struct now instead of saving the one we decoded with
     theora_decode_header(), since the user might have modified values (i.e.,
     color space, aspect ratio, etc. can be specified from a higher level).
    The user also might be doing something "clever" with the header packets if
     they are not using an Ogg encapsulation.*/
  oc_theora_info2th_info(&info,_ci);
  /*Don't bother to copy the setup info; th_decode_alloc() makes its own copy
     of the stuff it needs.*/
  apiinfo->api.decode=th_decode_alloc(&info,api->setup);
  if(apiinfo->api.decode==NULL){
    _ogg_free(apiinfo);
    return OC_EINVAL;
  }
  apiinfo->api.clear=(oc_setup_clear_func)th_dec_api_clear;
  _td->internal_encode=NULL;
  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
  _td->internal_decode=(void *)&OC_DEC_DISPATCH_VTBL;
  _td->granulepos=0;
  _td->i=&apiinfo->info;
  _td->i->codec_setup=&apiinfo->api;
  return 0;
}
|
||||
|
||||
/*Decodes one header packet through the legacy API.
  _ci: The info struct to update; an API wrapper is attached to its
        codec_setup member on first use.
  _cc: The comment struct to fill in.
  _op: The header packet to decode.
  Return: 0 on success, OC_FAULT if the wrapper could not be allocated, or
   the negative value returned by th_decode_headerin().*/
int theora_decode_header(theora_info *_ci,theora_comment *_cc,ogg_packet *_op){
  th_api_wrapper *wrapper;
  th_info         converted;
  int             status;
  wrapper=(th_api_wrapper *)_ci->codec_setup;
  /*Allocate an API wrapper struct on demand, since it will not also include a
     theora_info struct like the ones that are used in a theora_state struct.*/
  if(wrapper==NULL){
    wrapper=(th_api_wrapper *)_ogg_calloc(1,sizeof(*wrapper));
    _ci->codec_setup=wrapper;
    if(wrapper==NULL)return OC_FAULT;
    wrapper->clear=(oc_setup_clear_func)th_dec_api_clear;
  }
  /*Convert from the theora_info struct on every call instead of keeping a
     th_info struct between calls.
    The user might be doing something "clever" with the header packets if they
     are not using an Ogg encapsulation, and we don't want to break this.*/
  oc_theora_info2th_info(&converted,_ci);
  /*This relies on theora_comment and th_comment being identical structures.
    Do not change one without changing the code here as well!*/
  status=th_decode_headerin(&converted,(th_comment *)_cc,&wrapper->setup,_op);
  /*This also relies on both functions sharing the same error code values and
     returning the same set of them.
    Note that theora_decode_header() really can return OC_NOTFORMAT, even
     though it is not currently documented to do so.*/
  if(status<0)return status;
  th_info2theora_info(_ci,&converted);
  return 0;
}
|
||||
|
||||
/*Submits a data packet to the decoder through the legacy API.
  _td: The decoder state.
  _op: The packet to decode.
  Return: 0 on success, OC_FAULT if the state has no codec setup attached, or
   OC_BADPACKET if th_decode_packetin() reports any error.*/
int theora_decode_packetin(theora_state *_td,ogg_packet *_op){
  th_api_wrapper *wrapper;
  ogg_int64_t     granpos;
  if(_td==NULL||_td->i==NULL||_td->i->codec_setup==NULL)return OC_FAULT;
  wrapper=(th_api_wrapper *)_td->i->codec_setup;
  /*Every decoder failure is collapsed into OC_BADPACKET.*/
  if(th_decode_packetin(wrapper->decode,_op,&granpos)<0)return OC_BADPACKET;
  _td->granulepos=granpos;
  return 0;
}
|
||||
|
||||
/*Retrieves the most recently decoded frame through the legacy API.
  _td:  The decoder state.
  _yuv: The buffer description to fill in; it is only written on success.
  Return: The value returned by th_decode_ycbcr_out(), or OC_FAULT if the
   state has no decoder attached.*/
int theora_decode_YUVout(theora_state *_td,yuv_buffer *_yuv){
  th_api_wrapper  *wrapper;
  th_dec_ctx      *dec;
  th_ycbcr_buffer  planes;
  int              status;
  if(_td==NULL||_td->i==NULL||_td->i->codec_setup==NULL)return OC_FAULT;
  wrapper=(th_api_wrapper *)_td->i->codec_setup;
  dec=(th_dec_ctx *)wrapper->decode;
  if(dec==NULL)return OC_FAULT;
  status=th_decode_ycbcr_out(dec,planes);
  if(status<0)return status;
  /*Luma comes from plane 0.*/
  _yuv->y_width=planes[0].width;
  _yuv->y_height=planes[0].height;
  _yuv->y_stride=planes[0].stride;
  _yuv->y=planes[0].data;
  /*Both chroma planes are described by the dimensions of plane 1.*/
  _yuv->uv_width=planes[1].width;
  _yuv->uv_height=planes[1].height;
  _yuv->uv_stride=planes[1].stride;
  _yuv->u=planes[1].data;
  _yuv->v=planes[2].data;
  return status;
}
|
||||
274
engine/thirdparty/libtheora/decinfo.c
vendored
Normal file
274
engine/thirdparty/libtheora/decinfo.c
vendored
Normal file
|
|
@ -0,0 +1,274 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "decint.h"
|
||||
|
||||
/*Frame size limits applied to the info header.
  Only used for fuzzing.*/
#if defined(HAVE_MEMORY_CONSTRAINT)
enum{
  MAX_FUZZING_WIDTH=16384,
  MAX_FUZZING_HEIGHT=16384
};
#endif
|
||||
|
||||
|
||||
/*Unpacks a series of octets from a given byte array into the pack buffer.
|
||||
No checking is done to ensure the buffer contains enough data.
|
||||
_opb: The pack buffer to read the octets from.
|
||||
_buf: The byte array to store the unpacked bytes in.
|
||||
_len: The number of octets to unpack.*/
|
||||
static void oc_unpack_octets(oc_pack_buf *_opb,char *_buf,size_t _len){
|
||||
while(_len-->0){
|
||||
long val;
|
||||
val=oc_pack_read(_opb,8);
|
||||
*_buf++=(char)val;
|
||||
}
|
||||
}
|
||||
|
||||
/*Unpacks a 32-bit integer encoded by octets in little-endian form.*/
|
||||
static long oc_unpack_length(oc_pack_buf *_opb){
|
||||
long ret[4];
|
||||
int i;
|
||||
for(i=0;i<4;i++)ret[i]=oc_pack_read(_opb,8);
|
||||
return ret[0]|ret[1]<<8|ret[2]<<16|ret[3]<<24;
|
||||
}
|
||||
|
||||
/*Unpacks the body of the info header into _info and validates it.
  _opb:  The pack buffer to read from.
  _info: The info struct to fill in.
  Return: 0 on success, TH_EVERSION if the bitstream version is newer than
   this decoder supports, or TH_EBADHEADER if a field is invalid or the
   packet is too short.*/
static int oc_info_unpack(oc_pack_buf *_opb,th_info *_info){
  /*Check the codec bitstream version.*/
  _info->version_major=(unsigned char)oc_pack_read(_opb,8);
  _info->version_minor=(unsigned char)oc_pack_read(_opb,8);
  _info->version_subminor=(unsigned char)oc_pack_read(_opb,8);
  /*Verify we can parse this bitstream version.
    We accept earlier minors and all subminors, by spec.*/
  if(_info->version_major>TH_VERSION_MAJOR)return TH_EVERSION;
  if(_info->version_major==TH_VERSION_MAJOR&&
   _info->version_minor>TH_VERSION_MINOR){
    return TH_EVERSION;
  }
  /*Read the encoded frame description.
    The frame dimensions are stored divided by 16.*/
  _info->frame_width=(ogg_uint32_t)oc_pack_read(_opb,16)<<4;
  _info->frame_height=(ogg_uint32_t)oc_pack_read(_opb,16)<<4;
  _info->pic_width=(ogg_uint32_t)oc_pack_read(_opb,24);
  _info->pic_height=(ogg_uint32_t)oc_pack_read(_opb,24);
  _info->pic_x=(ogg_uint32_t)oc_pack_read(_opb,8);
  _info->pic_y=(ogg_uint32_t)oc_pack_read(_opb,8);
  _info->fps_numerator=(ogg_uint32_t)oc_pack_read(_opb,32);
  _info->fps_denominator=(ogg_uint32_t)oc_pack_read(_opb,32);
  /*The frame must be non-empty, the picture region must lie inside it, and
     the frame rate must be a valid fraction.*/
  if(_info->frame_width==0||_info->frame_height==0)return TH_EBADHEADER;
  if(_info->pic_width+_info->pic_x>_info->frame_width)return TH_EBADHEADER;
  if(_info->pic_height+_info->pic_y>_info->frame_height)return TH_EBADHEADER;
  if(_info->fps_numerator==0||_info->fps_denominator==0)return TH_EBADHEADER;
#if defined(HAVE_MEMORY_CONSTRAINT)
  /*NOTE(review): this only rejects frames where both dimensions reach the
     limit; confirm that && rather than || is intended.*/
  if(_info->frame_width>=MAX_FUZZING_WIDTH&&
   _info->frame_height>=MAX_FUZZING_HEIGHT){
    return TH_EBADHEADER;
  }
#endif
  /*Note: The sense of pic_y is inverted in what we pass back to the
     application compared to how it is stored in the bitstream.
    This is because the bitstream uses a right-handed coordinate system, while
     applications expect a left-handed one.*/
  _info->pic_y=_info->frame_height-_info->pic_height-_info->pic_y;
  _info->aspect_numerator=(ogg_uint32_t)oc_pack_read(_opb,24);
  _info->aspect_denominator=(ogg_uint32_t)oc_pack_read(_opb,24);
  _info->colorspace=(th_colorspace)oc_pack_read(_opb,8);
  _info->target_bitrate=(int)oc_pack_read(_opb,24);
  _info->quality=(int)oc_pack_read(_opb,6);
  _info->keyframe_granule_shift=(int)oc_pack_read(_opb,5);
  _info->pixel_fmt=(th_pixel_fmt)oc_pack_read(_opb,2);
  if(_info->pixel_fmt==TH_PF_RSVD)return TH_EBADHEADER;
  /*The last three bits must be zero, and we must not have read past the end
     of the packet.*/
  if(oc_pack_read(_opb,3)!=0||oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
  return 0;
}
|
||||
|
||||
/*Unpacks the body of the comment header into _tc.
  On failure _tc may be left partially filled in, with _tc->comments set to
   the number of user comments that were fully allocated; the caller is
   responsible for clearing it.
  _opb: The pack buffer to read from.
  _tc:  The comment struct to fill in.
  Return: 0 on success, TH_EBADHEADER if a length field is invalid or the
   packet is too short, or TH_EFAULT if memory could not be allocated.*/
static int oc_comment_unpack(oc_pack_buf *_opb,th_comment *_tc){
  size_t nalloc;
  long   len;
  int    i;
  /*Read the vendor string.*/
  len=oc_unpack_length(_opb);
  if(len<0||len>oc_pack_bytes_left(_opb))return TH_EBADHEADER;
  _tc->vendor=_ogg_malloc((size_t)len+1);
  if(_tc->vendor==NULL)return TH_EFAULT;
  oc_unpack_octets(_opb,_tc->vendor,len);
  _tc->vendor[len]='\0';
  /*Read the user comments.*/
  _tc->comments=(int)oc_unpack_length(_opb);
  len=_tc->comments;
  /*Each comment needs at least a 4-byte length field, so the count cannot
     exceed a quarter of the remaining bytes.*/
  if(len<0||len>(LONG_MAX>>2)||len<<2>oc_pack_bytes_left(_opb)){
    _tc->comments=0;
    return TH_EBADHEADER;
  }
  /*Always request at least one element: a zero-sized allocation is allowed
     to return NULL, which would make a valid header with no user comments
     look like an allocation failure.*/
  nalloc=_tc->comments>0?(size_t)_tc->comments:1;
  _tc->comment_lengths=(int *)_ogg_malloc(
   nalloc*sizeof(_tc->comment_lengths[0]));
  _tc->user_comments=(char **)_ogg_malloc(
   nalloc*sizeof(_tc->user_comments[0]));
  if(_tc->comment_lengths==NULL||_tc->user_comments==NULL){
    _tc->comments=0;
    return TH_EFAULT;
  }
  for(i=0;i<_tc->comments;i++){
    len=oc_unpack_length(_opb);
    if(len<0||len>oc_pack_bytes_left(_opb)){
      /*Record how many entries are valid.*/
      _tc->comments=i;
      return TH_EBADHEADER;
    }
    _tc->comment_lengths[i]=(int)len;
    _tc->user_comments[i]=_ogg_malloc((size_t)len+1);
    if(_tc->user_comments[i]==NULL){
      _tc->comments=i;
      return TH_EFAULT;
    }
    oc_unpack_octets(_opb,_tc->user_comments[i],len);
    _tc->user_comments[i][len]='\0';
  }
  return oc_pack_bytes_left(_opb)<0?TH_EBADHEADER:0;
}
|
||||
|
||||
/*Unpacks the body of the setup header: the quantizer tables followed by the
   Huffman trees.
  _opb:   The pack buffer to read from.
  _setup: The setup struct to fill in.
  Return: The negative value returned by oc_quant_params_unpack() if it
   fails, otherwise the value returned by oc_huff_trees_unpack().*/
static int oc_setup_unpack(oc_pack_buf *_opb,th_setup_info *_setup){
  int status;
  /*Read the quantizer tables.*/
  status=oc_quant_params_unpack(_opb,&_setup->qinfo);
  /*Read the Huffman trees.*/
  if(status>=0)status=oc_huff_trees_unpack(_opb,_setup->huff_tables);
  return status;
}
|
||||
|
||||
/*Releases the quantizer parameters and Huffman trees held by a setup struct.
  The struct itself is not freed.
  _setup: The setup struct to clear.*/
static void oc_setup_clear(th_setup_info *_setup){
  th_quant_info *qinfo;
  qinfo=&_setup->qinfo;
  oc_quant_params_clear(qinfo);
  oc_huff_trees_clear(_setup->huff_tables);
}
|
||||
|
||||
static int oc_dec_headerin(oc_pack_buf *_opb,th_info *_info,
|
||||
th_comment *_tc,th_setup_info **_setup,ogg_packet *_op){
|
||||
char buffer[6];
|
||||
long val;
|
||||
int packtype;
|
||||
int ret;
|
||||
val=oc_pack_read(_opb,8);
|
||||
packtype=(int)val;
|
||||
/*If we're at a data packet...*/
|
||||
if(!(packtype&0x80)){
|
||||
/*Check to make sure we received all three headers...
|
||||
If we haven't seen any valid headers, assume this is not actually
|
||||
Theora.*/
|
||||
if(_info->frame_width<=0)return TH_ENOTFORMAT;
|
||||
/*Follow our documentation, which says we'll return TH_EFAULT if this
|
||||
are NULL (_info was checked by our caller).*/
|
||||
if(_tc==NULL)return TH_EFAULT;
|
||||
/*And if any other headers were missing, declare this packet "out of
|
||||
sequence" instead.*/
|
||||
if(_tc->vendor==NULL)return TH_EBADHEADER;
|
||||
/*Don't check this until it's needed, since we allow passing NULL for the
|
||||
arguments that we're not expecting the next header to fill in yet.*/
|
||||
if(_setup==NULL)return TH_EFAULT;
|
||||
if(*_setup==NULL)return TH_EBADHEADER;
|
||||
/*If we got everything, we're done.*/
|
||||
return 0;
|
||||
}
|
||||
/*Check the codec string.*/
|
||||
oc_unpack_octets(_opb,buffer,6);
|
||||
if(memcmp(buffer,"theora",6)!=0)return TH_ENOTFORMAT;
|
||||
switch(packtype){
|
||||
/*Codec info header.*/
|
||||
case 0x80:{
|
||||
/*This should be the first packet, and we should not already be
|
||||
initialized.*/
|
||||
if(!_op->b_o_s||_info->frame_width>0)return TH_EBADHEADER;
|
||||
ret=oc_info_unpack(_opb,_info);
|
||||
if(ret<0)th_info_clear(_info);
|
||||
else ret=3;
|
||||
}break;
|
||||
/*Comment header.*/
|
||||
case 0x81:{
|
||||
if(_tc==NULL)return TH_EFAULT;
|
||||
/*We shoud have already decoded the info header, and should not yet have
|
||||
decoded the comment header.*/
|
||||
if(_info->frame_width==0||_tc->vendor!=NULL)return TH_EBADHEADER;
|
||||
ret=oc_comment_unpack(_opb,_tc);
|
||||
if(ret<0)th_comment_clear(_tc);
|
||||
else ret=2;
|
||||
}break;
|
||||
/*Codec setup header.*/
|
||||
case 0x82:{
|
||||
oc_setup_info *setup;
|
||||
if(_tc==NULL||_setup==NULL)return TH_EFAULT;
|
||||
/*We should have already decoded the info header and the comment header,
|
||||
and should not yet have decoded the setup header.*/
|
||||
if(_info->frame_width==0||_tc->vendor==NULL||*_setup!=NULL){
|
||||
return TH_EBADHEADER;
|
||||
}
|
||||
setup=(oc_setup_info *)_ogg_calloc(1,sizeof(*setup));
|
||||
if(setup==NULL)return TH_EFAULT;
|
||||
ret=oc_setup_unpack(_opb,setup);
|
||||
if(ret<0){
|
||||
oc_setup_clear(setup);
|
||||
_ogg_free(setup);
|
||||
}
|
||||
else{
|
||||
*_setup=setup;
|
||||
ret=1;
|
||||
}
|
||||
}break;
|
||||
default:{
|
||||
/*We don't know what this header is.*/
|
||||
return TH_EBADHEADER;
|
||||
}break;
|
||||
}
|
||||
return ret;
|
||||
}
|
||||
|
||||
|
||||
/*Decodes one header packet.
|
||||
This should be called repeatedly with the packets at the beginning of the
|
||||
stream until it returns 0.*/
|
||||
int th_decode_headerin(th_info *_info,th_comment *_tc,
|
||||
th_setup_info **_setup,ogg_packet *_op){
|
||||
oc_pack_buf opb;
|
||||
if(_op==NULL)return TH_EBADHEADER;
|
||||
if(_info==NULL)return TH_EFAULT;
|
||||
oc_pack_readinit(&opb,_op->packet,_op->bytes);
|
||||
return oc_dec_headerin(&opb,_info,_tc,_setup,_op);
|
||||
}
|
||||
|
||||
/*Releases a setup struct and everything it owns.
  Passing NULL is allowed and does nothing.
  _setup: The setup struct to free.*/
void th_setup_free(th_setup_info *_setup){
  if(_setup==NULL)return;
  oc_setup_clear(_setup);
  _ogg_free(_setup);
}
|
||||
185
engine/thirdparty/libtheora/decint.h
vendored
Normal file
185
engine/thirdparty/libtheora/decint.h
vendored
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <limits.h>
|
||||
#if !defined(_decint_H)
|
||||
# define _decint_H (1)
|
||||
# include "theora/theoradec.h"
|
||||
# include "state.h"
|
||||
# include "bitpack.h"
|
||||
# include "huffdec.h"
|
||||
# include "dequant.h"
|
||||
|
||||
typedef struct th_setup_info oc_setup_info;
|
||||
typedef struct oc_dec_opt_vtable oc_dec_opt_vtable;
|
||||
typedef struct oc_dec_pipeline_state oc_dec_pipeline_state;
|
||||
typedef struct th_dec_ctx oc_dec_ctx;
|
||||
|
||||
|
||||
|
||||
/*Decoder-specific accelerated functions.*/
|
||||
# if defined(OC_C64X_ASM)
|
||||
# include "c64x/c64xdec.h"
|
||||
# endif
|
||||
|
||||
# if !defined(oc_dec_accel_init)
|
||||
# define oc_dec_accel_init oc_dec_accel_init_c
|
||||
# endif
|
||||
# if defined(OC_DEC_USE_VTABLE)
|
||||
# if !defined(oc_dec_dc_unpredict_mcu_plane)
|
||||
# define oc_dec_dc_unpredict_mcu_plane(_dec,_pipe,_pli) \
|
||||
((*(_dec)->opt_vtable.dc_unpredict_mcu_plane)(_dec,_pipe,_pli))
|
||||
# endif
|
||||
# else
|
||||
# if !defined(oc_dec_dc_unpredict_mcu_plane)
|
||||
# define oc_dec_dc_unpredict_mcu_plane oc_dec_dc_unpredict_mcu_plane_c
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
/*Constants for the packet-in state machine specific to the decoder.*/
|
||||
|
||||
/*Next packet to read: Data packet.*/
|
||||
#define OC_PACKET_DATA (0)
|
||||
|
||||
|
||||
|
||||
struct th_setup_info{
|
||||
/*The Huffman codes.*/
|
||||
ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
|
||||
/*The quantization parameters.*/
|
||||
th_quant_info qinfo;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Decoder specific functions with accelerated variants.*/
|
||||
struct oc_dec_opt_vtable{
|
||||
void (*dc_unpredict_mcu_plane)(oc_dec_ctx *_dec,
|
||||
oc_dec_pipeline_state *_pipe,int _pli);
|
||||
};
|
||||
|
||||
|
||||
|
||||
struct oc_dec_pipeline_state{
|
||||
/*Decoded DCT coefficients.
|
||||
These are placed here instead of on the stack so that they can persist
|
||||
between blocks, which makes clearing them back to zero much faster when
|
||||
only a few non-zero coefficients were decoded.
|
||||
It requires at least 65 elements because the zig-zag index array uses the
|
||||
65th element as a dumping ground for out-of-range indices to protect us
|
||||
from buffer overflow.
|
||||
We make it fully twice as large so that the second half can serve as the
|
||||
reconstruction buffer, which saves passing another parameter to all the
|
||||
acceleration functios.
|
||||
It also solves problems with 16-byte alignment for NEON on ARM.
|
||||
gcc (as of 4.2.1) only seems to be able to give stack variables 8-byte
|
||||
alignment, and silently produces incorrect results if you ask for 16.
|
||||
Finally, keeping it off the stack means there's less likely to be a data
|
||||
hazard beween the NEON co-processor and the regular ARM core, which avoids
|
||||
unnecessary stalls.*/
|
||||
OC_ALIGN16(ogg_int16_t dct_coeffs[128]);
|
||||
OC_ALIGN16(signed char bounding_values[256]);
|
||||
ptrdiff_t ti[3][64];
|
||||
ptrdiff_t ebi[3][64];
|
||||
ptrdiff_t eob_runs[3][64];
|
||||
const ptrdiff_t *coded_fragis[3];
|
||||
const ptrdiff_t *uncoded_fragis[3];
|
||||
ptrdiff_t ncoded_fragis[3];
|
||||
ptrdiff_t nuncoded_fragis[3];
|
||||
const ogg_uint16_t *dequant[3][3][2];
|
||||
int fragy0[3];
|
||||
int fragy_end[3];
|
||||
int pred_last[3][4];
|
||||
int mcu_nvfrags;
|
||||
int loop_filter;
|
||||
int pp_level;
|
||||
};
|
||||
|
||||
|
||||
struct th_dec_ctx{
|
||||
/*Shared encoder/decoder state.*/
|
||||
oc_theora_state state;
|
||||
/*Whether or not packets are ready to be emitted.
|
||||
This takes on negative values while there are remaining header packets to
|
||||
be emitted, reaches 0 when the codec is ready for input, and goes to 1
|
||||
when a frame has been processed and a data packet is ready.*/
|
||||
int packet_state;
|
||||
/*Buffer in which to assemble packets.*/
|
||||
oc_pack_buf opb;
|
||||
/*Huffman decode trees.*/
|
||||
ogg_int16_t *huff_tables[TH_NHUFFMAN_TABLES];
|
||||
/*The index of the first token in each plane for each coefficient.*/
|
||||
ptrdiff_t ti0[3][64];
|
||||
/*The number of outstanding EOB runs at the start of each coefficient in each
|
||||
plane.*/
|
||||
ptrdiff_t eob_runs[3][64];
|
||||
/*The DCT token lists.*/
|
||||
unsigned char *dct_tokens;
|
||||
/*The extra bits associated with DCT tokens.*/
|
||||
unsigned char *extra_bits;
|
||||
/*The number of dct tokens unpacked so far.*/
|
||||
int dct_tokens_count;
|
||||
/*The out-of-loop post-processing level.*/
|
||||
int pp_level;
|
||||
/*The DC scale used for out-of-loop deblocking.*/
|
||||
int pp_dc_scale[64];
|
||||
/*The sharpen modifier used for out-of-loop deringing.*/
|
||||
int pp_sharp_mod[64];
|
||||
/*The DC quantization index of each block.*/
|
||||
unsigned char *dc_qis;
|
||||
/*The variance of each block.*/
|
||||
int *variances;
|
||||
/*The storage for the post-processed frame buffer.*/
|
||||
unsigned char *pp_frame_data;
|
||||
/*Whether or not the post-processsed frame buffer has space for chroma.*/
|
||||
int pp_frame_state;
|
||||
/*The buffer used for the post-processed frame.
|
||||
Note that this is _not_ guaranteed to have the same strides and offsets as
|
||||
the reference frame buffers.*/
|
||||
th_ycbcr_buffer pp_frame_buf;
|
||||
/*The striped decode callback function.*/
|
||||
th_stripe_callback stripe_cb;
|
||||
oc_dec_pipeline_state pipe;
|
||||
# if defined(OC_DEC_USE_VTABLE)
|
||||
/*Table for decoder acceleration functions.*/
|
||||
oc_dec_opt_vtable opt_vtable;
|
||||
# endif
|
||||
# if defined(HAVE_CAIRO)
|
||||
/*Output metrics for debugging.*/
|
||||
int telemetry_mbmode;
|
||||
int telemetry_mv;
|
||||
int telemetry_qi;
|
||||
int telemetry_bits;
|
||||
int telemetry_frame_bytes;
|
||||
int telemetry_coding_bytes;
|
||||
int telemetry_mode_bytes;
|
||||
int telemetry_mv_bytes;
|
||||
int telemetry_qi_bytes;
|
||||
int telemetry_dc_bytes;
|
||||
unsigned char *telemetry_frame_data;
|
||||
# endif
|
||||
};
|
||||
|
||||
/*Default pure-C implementations of decoder-specific accelerated functions.*/
|
||||
void oc_dec_accel_init_c(oc_dec_ctx *_dec);
|
||||
|
||||
void oc_dec_dc_unpredict_mcu_plane_c(oc_dec_ctx *_dec,
|
||||
oc_dec_pipeline_state *_pipe,int _pli);
|
||||
|
||||
#endif
|
||||
2992
engine/thirdparty/libtheora/decode.c
vendored
Normal file
2992
engine/thirdparty/libtheora/decode.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
182
engine/thirdparty/libtheora/dequant.c
vendored
Normal file
182
engine/thirdparty/libtheora/dequant.c
vendored
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ogg/ogg.h>
|
||||
#include "dequant.h"
|
||||
#include "decint.h"
|
||||
|
||||
/*Unpacks the quantization parameters from the setup header.
  On failure _qinfo may be left partially constructed; the caller is
   responsible for cleaning it up.
  _opb:   The pack buffer to read from.
  _qinfo: The quantization parameters to fill in.
  Return: 0 on success, TH_EFAULT if memory could not be allocated, or
   TH_EBADHEADER if the quant ranges or base matrix indices are invalid.*/
int oc_quant_params_unpack(oc_pack_buf *_opb,th_quant_info *_qinfo){
  th_quant_base *base_mats;
  int            nbase_mats;
  int            sizes[64];
  int            indices[64];
  int            nbits;
  int            bmi;
  int            ci;
  int            qi;
  int            i;
  /*Each of the three tables is preceded by the bit width of its entries.*/
  nbits=(int)oc_pack_read(_opb,3);
  for(qi=0;qi<64;qi++){
    _qinfo->loop_filter_limits[qi]=(unsigned char)oc_pack_read(_opb,nbits);
  }
  nbits=(int)oc_pack_read(_opb,4)+1;
  for(qi=0;qi<64;qi++){
    _qinfo->ac_scale[qi]=(ogg_uint16_t)oc_pack_read(_opb,nbits);
  }
  nbits=(int)oc_pack_read(_opb,4)+1;
  for(qi=0;qi<64;qi++){
    _qinfo->dc_scale[qi]=(ogg_uint16_t)oc_pack_read(_opb,nbits);
  }
  /*Read the base matrices into temporary storage.*/
  nbase_mats=(int)oc_pack_read(_opb,9)+1;
  base_mats=_ogg_malloc(nbase_mats*sizeof(base_mats[0]));
  if(base_mats==NULL)return TH_EFAULT;
  for(bmi=0;bmi<nbase_mats;bmi++){
    for(ci=0;ci<64;ci++){
      base_mats[bmi][ci]=(unsigned char)oc_pack_read(_opb,8);
    }
  }
  nbits=oc_ilog(nbase_mats-1);
  /*Visit the six range sets in the order qi_ranges[0][0..2],
     qi_ranges[1][0..2].*/
  for(i=0;i<6;i++){
    th_quant_ranges *qranges;
    th_quant_base   *qrbms;
    int             *qrsizes;
    int              nranges;
    int              ri;
    qranges=&_qinfo->qi_ranges[i/3][i%3];
    /*Every set after the first starts with a flag; when it is clear, the set
       is a copy of an earlier one instead of being coded explicitly.*/
    if(i>0&&!oc_pack_read1(_opb)){
      int j;
      /*Copy the previous set, unless we're in the second row and a further
         flag selects the set directly above (same column, first row).*/
      j=i-1;
      if(i>=3&&oc_pack_read1(_opb))j=i-3;
      *qranges=_qinfo->qi_ranges[j/3][j%3];
      continue;
    }
    /*Read alternating base matrix indices and range sizes until the ranges
       cover all 63 steps.*/
    indices[0]=(int)oc_pack_read(_opb,nbits);
    nranges=0;
    for(qi=0;qi<63;){
      int size;
      size=(int)oc_pack_read(_opb,oc_ilog(62-qi))+1;
      sizes[nranges]=size;
      qi+=size;
      indices[++nranges]=(int)oc_pack_read(_opb,nbits);
    }
    /*Note: The caller is responsible for cleaning up any partially
       constructed qinfo.*/
    if(qi>63){
      _ogg_free(base_mats);
      return TH_EBADHEADER;
    }
    qranges->nranges=nranges;
    qrsizes=(int *)_ogg_malloc(nranges*sizeof(qrsizes[0]));
    qranges->sizes=qrsizes;
    if(qrsizes==NULL){
      _ogg_free(base_mats);
      return TH_EFAULT;
    }
    memcpy(qrsizes,sizes,nranges*sizeof(qrsizes[0]));
    qrbms=(th_quant_base *)_ogg_malloc((nranges+1)*sizeof(qrbms[0]));
    if(qrbms==NULL){
      _ogg_free(base_mats);
      return TH_EFAULT;
    }
    qranges->base_matrices=(const th_quant_base *)qrbms;
    /*Copy the nranges+1 referenced base matrices, last one first.*/
    for(ri=nranges;ri>=0;ri--){
      bmi=indices[ri];
      if(bmi>=nbase_mats){
        _ogg_free(base_mats);
        return TH_EBADHEADER;
      }
      memcpy(qrbms[ri],base_mats[bmi],sizeof(qrbms[ri]));
    }
  }
  _ogg_free(base_mats);
  return 0;
}
|
||||
|
||||
/*Frees the range sizes and base matrices owned by a quant info struct.
  Range sets may share storage with the set immediately before them or with
   the first-row set in the same column, so such duplicate references are
   dropped before freeing to avoid releasing the same block twice.
  _qinfo: The quantization parameters to clear.*/
void oc_quant_params_clear(th_quant_info *_qinfo){
  int i;
  /*Walk the six sets from last to first.*/
  for(i=5;i>=0;i--){
    th_quant_ranges *cur;
    cur=&_qinfo->qi_ranges[i/3][i%3];
    /*Clear any duplicate pointer references.*/
    if(i>0){
      const th_quant_ranges *prev;
      prev=&_qinfo->qi_ranges[(i-1)/3][(i-1)%3];
      if(cur->sizes==prev->sizes)cur->sizes=NULL;
      if(cur->base_matrices==prev->base_matrices)cur->base_matrices=NULL;
    }
    if(i>=3){
      const th_quant_ranges *above;
      above=&_qinfo->qi_ranges[0][i%3];
      if(cur->sizes==above->sizes)cur->sizes=NULL;
      if(cur->base_matrices==above->base_matrices)cur->base_matrices=NULL;
    }
    /*Now free all the non-duplicate storage.*/
    _ogg_free((void *)cur->sizes);
    _ogg_free((void *)cur->base_matrices);
  }
}
|
||||
27
engine/thirdparty/libtheora/dequant.h
vendored
Normal file
27
engine/thirdparty/libtheora/dequant.h
vendored
Normal file
|
|
@ -0,0 +1,27 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_dequant_H)
|
||||
# define _dequant_H (1)
|
||||
# include "quant.h"
|
||||
# include "bitpack.h"
|
||||
|
||||
int oc_quant_params_unpack(oc_pack_buf *_opb,
|
||||
th_quant_info *_qinfo);
|
||||
void oc_quant_params_clear(th_quant_info *_qinfo);
|
||||
|
||||
#endif
|
||||
168
engine/thirdparty/libtheora/encapiwrapper.c
vendored
Normal file
168
engine/thirdparty/libtheora/encapiwrapper.c
vendored
Normal file
|
|
@ -0,0 +1,168 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <limits.h>
|
||||
#include "apiwrapper.h"
|
||||
#include "encint.h"
|
||||
#include "theora/theoraenc.h"
|
||||
|
||||
|
||||
|
||||
/*Frees the encoder held by an API wrapper, if any, then zeroes the wrapper.
  _api: The wrapper to clear.*/
static void th_enc_api_clear(th_api_wrapper *_api){
  if(_api->encode){
    th_encode_free(_api->encode);
  }
  memset(_api,0,sizeof(*_api));
}
|
||||
|
||||
/*Clears the info struct attached to an encoder state, if any, then zeroes
   the state.
  _te: The encoder state to clear.*/
static void theora_encode_clear(theora_state *_te){
  if(_te->i!=NULL){
    theora_info_clear(_te->i);
  }
  memset(_te,0,sizeof(*_te));
}
|
||||
|
||||
/*Forwards a control request to th_encode_ctl() on the wrapped encoder.
  Return: The value returned by th_encode_ctl().*/
static int theora_encode_control(theora_state *_te,int _req,
 void *_buf,size_t _buf_sz){
  th_api_wrapper *wrapper;
  wrapper=(th_api_wrapper *)_te->i->codec_setup;
  return th_encode_ctl(wrapper->encode,_req,_buf,_buf_sz);
}
|
||||
|
||||
/*Forwards a granule position to th_granule_frame() on the wrapped encoder.
  Return: The value returned by th_granule_frame().*/
static ogg_int64_t theora_encode_granule_frame(theora_state *_te,
 ogg_int64_t _gp){
  th_api_wrapper *wrapper;
  wrapper=(th_api_wrapper *)_te->i->codec_setup;
  return th_granule_frame(wrapper->encode,_gp);
}
|
||||
|
||||
/*Forwards a granule position to th_granule_time() on the wrapped encoder.
  Return: The value returned by th_granule_time().*/
static double theora_encode_granule_time(theora_state *_te,ogg_int64_t _gp){
  th_api_wrapper *wrapper;
  wrapper=(th_api_wrapper *)_te->i->codec_setup;
  return th_granule_time(wrapper->encode,_gp);
}
|
||||
|
||||
static const oc_state_dispatch_vtable OC_ENC_DISPATCH_VTBL={
|
||||
(oc_state_clear_func)theora_encode_clear,
|
||||
(oc_state_control_func)theora_encode_control,
|
||||
(oc_state_granule_frame_func)theora_encode_granule_frame,
|
||||
(oc_state_granule_time_func)theora_encode_granule_time,
|
||||
};
|
||||
|
||||
/*Initializes a legacy theora_state for encoding.
  _te: The state to initialize.
  _ci: The encoder configuration; it is copied, so it need not outlive the
        state.
  Return: 0 on success, TH_EFAULT if memory could not be allocated, or
   OC_EINVAL if th_encode_alloc() fails.*/
int theora_encode_init(theora_state *_te,theora_info *_ci){
  th_api_info  *apiinfo;
  th_info       info;
  ogg_uint32_t  keyframe_frequency_force;
  /*Allocate our own combined API wrapper/theora_info struct.
    We put them both in one malloc'd block so that when the API wrapper is
     freed, the info struct goes with it.
    This avoids having to figure out whether or not we need to free the info
     struct in either theora_info_clear() or theora_clear().
    The block is zeroed, as theora_decode_init() does, so that the wrapper
     members we do not set below (setup and decode) are NULL rather than
     indeterminate.*/
  apiinfo=(th_api_info *)_ogg_calloc(1,sizeof(*apiinfo));
  if(apiinfo==NULL)return TH_EFAULT;
  /*Make our own copy of the info struct, since its lifetime should be
     independent of the one we were passed in.*/
  apiinfo->info=*_ci;
  oc_theora_info2th_info(&info,_ci);
  apiinfo->api.encode=th_encode_alloc(&info);
  if(apiinfo->api.encode==NULL){
    _ogg_free(apiinfo);
    return OC_EINVAL;
  }
  apiinfo->api.clear=(oc_setup_clear_func)th_enc_api_clear;
  /*Provide entry points for ABI compatibility with old decoder shared libs.*/
  _te->internal_encode=(void *)&OC_ENC_DISPATCH_VTBL;
  _te->internal_decode=NULL;
  _te->granulepos=0;
  _te->i=&apiinfo->info;
  _te->i->codec_setup=&apiinfo->api;
  /*Set the precise requested keyframe frequency.*/
  keyframe_frequency_force=_ci->keyframe_auto_p?
   _ci->keyframe_frequency_force:_ci->keyframe_frequency;
  th_encode_ctl(apiinfo->api.encode,
   TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE,
   &keyframe_frequency_force,sizeof(keyframe_frequency_force));
  /*TODO: Additional codec setup using the extra fields in theora_info.*/
  return 0;
}
|
||||
|
||||
/*Submits one frame through the legacy API.
  The three planes of _yuv are repackaged as a th_ycbcr_buffer (both chroma
   planes share the uv_* dimensions and stride) and handed to
   th_encode_ycbcr_in().
  Return: The value returned by th_encode_ycbcr_in().
          When it is negative, _te->granulepos is left untouched.*/
int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
  th_api_wrapper  *api;
  th_ycbcr_buffer  buf;
  int              ret;
  api=(th_api_wrapper *)_te->i->codec_setup;
  /*Luma plane.*/
  buf[0].width=_yuv->y_width;
  buf[0].height=_yuv->y_height;
  buf[0].stride=_yuv->y_stride;
  buf[0].data=_yuv->y;
  /*First chroma plane.*/
  buf[1].width=_yuv->uv_width;
  buf[1].height=_yuv->uv_height;
  buf[1].stride=_yuv->uv_stride;
  buf[1].data=_yuv->u;
  /*Second chroma plane.*/
  buf[2].width=_yuv->uv_width;
  buf[2].height=_yuv->uv_height;
  buf[2].stride=_yuv->uv_stride;
  buf[2].data=_yuv->v;
  ret=th_encode_ycbcr_in(api->encode,buf);
  if(ret<0)return ret;
  /*Mirror the encoder's granule position into the legacy state.*/
  _te->granulepos=api->encode->state.granpos;
  return ret;
}
|
||||
|
||||
/*Legacy wrapper: forwards to th_encode_packetout() on the encoder context
   stored in the state's codec_setup wrapper and returns its result.*/
int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
  th_api_wrapper *api;
  api=(th_api_wrapper *)_te->i->codec_setup;
  return th_encode_packetout(api->encode,_last_p,_op);
}
|
||||
|
||||
/*Emits the info header packet through the legacy API.
  Return: 0 on success, TH_EINVAL if encoding has already started
   (packet_state is past OC_PACKET_EMPTY or the granule position is
   non-zero), or the negative value returned by th_encode_flushheader().*/
int theora_encode_header(theora_state *_te,ogg_packet *_op){
  oc_enc_ctx     *enc;
  th_api_wrapper *api;
  int             ret;
  api=(th_api_wrapper *)_te->i->codec_setup;
  enc=api->encode;
  /*If we've already started encoding, fail.*/
  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0){
    return TH_EINVAL;
  }
  /*Reset the state to make sure we output an info packet.*/
  enc->packet_state=OC_PACKET_INFO_HDR;
  ret=th_encode_flushheader(api->encode,NULL,_op);
  /*Any non-negative result is reported as plain success.*/
  return ret>=0?0:ret;
}
|
||||
|
||||
/*Builds a comment header packet from _tc.
  On success _op->packet points at a freshly allocated copy of the packet
   data.
  Return: 0 on success, TH_EFAULT if the copy could not be allocated (in
   which case _op->packet is set to NULL), or the negative value returned by
   oc_state_flushheader().*/
int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
  oggpack_buffer  opb;
  void           *buf;
  int             packet_state;
  int             ret;
  packet_state=OC_PACKET_COMMENT_HDR;
  oggpackB_writeinit(&opb);
  ret=oc_state_flushheader(NULL,&packet_state,&opb,NULL,NULL,
   th_version_string(),(th_comment *)_tc,_op);
  if(ret>=0){
    /*The oggpack_buffer's lifetime ends with this function, so we have to
       copy out the packet contents.
      Presumably the application knows it is supposed to free this.
      This part works nothing like the Vorbis API, and the documentation on it
       has been wrong for some time, claiming libtheora owned the memory.*/
    buf=_ogg_malloc(_op->bytes);
    if(buf==NULL){
      _op->packet=NULL;
      ret=TH_EFAULT;
    }
    else{
      memcpy(buf,_op->packet,_op->bytes);
      _op->packet=buf;
      ret=0;
    }
  }
  /*Release the pack buffer with the call that pairs with
     oggpackB_writeinit().*/
  oggpackB_writeclear(&opb);
  return ret;
}
|
||||
|
||||
/*Emits the setup (tables) header packet through the legacy API.
  Return: 0 on success, TH_EINVAL if encoding has already started
   (packet_state is past OC_PACKET_EMPTY or the granule position is
   non-zero), or the negative value returned by th_encode_flushheader().*/
int theora_encode_tables(theora_state *_te,ogg_packet *_op){
  oc_enc_ctx     *enc;
  th_api_wrapper *api;
  int             ret;
  api=(th_api_wrapper *)_te->i->codec_setup;
  enc=api->encode;
  /*If we've already started encoding, fail.*/
  if(enc->packet_state>OC_PACKET_EMPTY||enc->state.granpos!=0){
    return TH_EINVAL;
  }
  /*Reset the state to make sure we output a setup packet.*/
  enc->packet_state=OC_PACKET_SETUP_HDR;
  ret=th_encode_flushheader(api->encode,NULL,_op);
  /*Any non-negative result is reported as plain success.*/
  return ret>=0?0:ret;
}
|
||||
379
engine/thirdparty/libtheora/encfrag.c
vendored
Normal file
379
engine/thirdparty/libtheora/encfrag.c
vendored
Normal file
|
|
@ -0,0 +1,379 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "encint.h"
|
||||
|
||||
|
||||
/*Computes the 8x8 residual between a source and a reference block.
  _diff:    Receives the 64 differences _src-_ref in row-major order.
  _src:     The first pixel of the source block.
  _ref:     The first pixel of the reference block.
  _ystride: The offset between rows, applied to both _src and _ref.*/
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  int i;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
}
|
||||
|
||||
/*Computes the 8x8 residual between a source block and the constant 128.
  _diff:    Receives the 64 values _src-128 in row-major order.
  _src:     The first pixel of the source block.
  _ystride: The offset between rows of _src.*/
void oc_enc_frag_sub_128_c(ogg_int16_t *_diff,
 const unsigned char *_src,int _ystride){
  int i;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)_diff[i*8+j]=(ogg_int16_t)(_src[j]-128);
    _src+=_ystride;
  }
}
|
||||
|
||||
/*Computes the sum of absolute differences over an 8x8 block.
  _src:     The first pixel of the source block.
  _ref:     The first pixel of the reference block.
  _ystride: The offset between rows, applied to both _src and _ref.
  Return: The sum of |_src-_ref| over all 64 pixels.*/
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned sad;
  int      i;
  sad=0;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
|
||||
|
||||
/*Computes the sum of absolute differences over an 8x8 block, giving up
   early once the running total exceeds a threshold.
  _src:     The first pixel of the source block.
  _ref:     The first pixel of the reference block.
  _ystride: The offset between rows, applied to both _src and _ref.
  _thresh:  The total above which the remaining rows are skipped.
            The check is made once per completed row.
  Return: The full SAD, or the partial total at the point where it first
   exceeded _thresh.*/
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  unsigned sad;
  int      i;
  sad=0;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)sad+=abs(_src[j]-_ref[j]);
    if(sad>_thresh)break;
    _src+=_ystride;
    _ref+=_ystride;
  }
  return sad;
}
|
||||
|
||||
/*Computes the SAD between a source block and the truncating average of two
   reference blocks, giving up early once the running total exceeds a
   threshold.
  _src:     The first pixel of the source block.
  _ref1:    The first pixel of the first reference block.
  _ref2:    The first pixel of the second reference block.
  _ystride: The offset between rows, applied to all three blocks.
  _thresh:  The total above which the remaining rows are skipped.
            The check is made once per completed row.
  Return: The full SAD, or the partial total at the point where it first
   exceeded _thresh.*/
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
 unsigned _thresh){
  unsigned sad;
  int      i;
  sad=0;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)sad+=abs(_src[j]-((_ref1[j]+_ref2[j])>>1));
    if(sad>_thresh)break;
    _src+=_ystride;
    _ref1+=_ystride;
    _ref2+=_ystride;
  }
  return sad;
}
|
||||
|
||||
/*Computes the sum of absolute deviations of an 8x8 block from its own
   rounded mean.
  _src:     The first pixel of the block.
  _ystride: The offset between rows of _src.
  Return: The sum of |_src-mean| over all 64 pixels.*/
unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride){
  const unsigned char *src;
  unsigned             sum;
  unsigned             sad;
  int                  dc;
  int                  i;
  /*First pass: total the 64 pixels.*/
  src=_src;
  sum=0;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)sum+=src[j];
    src+=_ystride;
  }
  /*Round to the nearest mean.
    The result is at most 255, so keep it in a signed int: the differences
     below are then ordinary signed values, instead of unsigned values that
     only reach abs() through an implementation-defined conversion.*/
  dc=(int)((sum+32)>>6);
  /*Second pass: accumulate the deviations from the mean.*/
  sad=0;
  for(i=0;i<8;i++){
    int j;
    for(j=0;j<8;j++)sad+=abs(_src[j]-dc);
    _src+=_ystride;
  }
  return sad;
}
|
||||
|
||||
/*Takes the residual _src-_ref of an 8x8 block and applies an 8-point
   Hadamard transform to each row.
  The transform of row i is written to column i of _buf, i.e., the output is
   transposed.
  _ystride is applied to both _src and _ref.*/
static void oc_diff_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  int i;
  for(i=0;i<8;i++){
    int d[8];
    int t[8];
    int k;
    for(k=0;k<8;k++)d[k]=_src[k]-_ref[k];
    /*Hadamard stage 1: butterflies between elements 4 apart.*/
    for(k=0;k<4;k++){
      t[k]=d[k]+d[k+4];
      t[k+4]=d[k]-d[k+4];
    }
    /*Hadamard stage 2: butterflies between elements 2 apart.*/
    for(k=0;k<8;k+=4){
      d[k]=t[k]+t[k+2];
      d[k+1]=t[k+1]+t[k+3];
      d[k+2]=t[k]-t[k+2];
      d[k+3]=t[k+1]-t[k+3];
    }
    /*Hadamard stage 3: butterflies between neighbors, stored transposed.*/
    for(k=0;k<8;k+=2){
      _buf[k*8+i]=(ogg_int16_t)(d[k]+d[k+1]);
      _buf[(k+1)*8+i]=(ogg_int16_t)(d[k]-d[k+1]);
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
}
|
||||
|
||||
/*Takes the residual between _src and the truncating average of _ref1 and
   _ref2 for an 8x8 block and applies an 8-point Hadamard transform to each
   row.
  The transform of row i is written to column i of _buf, i.e., the output is
   transposed.
  _ystride is applied to all three input blocks.*/
static void oc_diff_hadamard2(ogg_int16_t _buf[64],const unsigned char *_src,
 const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
  int i;
  for(i=0;i<8;i++){
    int d[8];
    int t[8];
    int k;
    for(k=0;k<8;k++)d[k]=_src[k]-((_ref1[k]+_ref2[k])>>1);
    /*Hadamard stage 1: butterflies between elements 4 apart.*/
    for(k=0;k<4;k++){
      t[k]=d[k]+d[k+4];
      t[k+4]=d[k]-d[k+4];
    }
    /*Hadamard stage 2: butterflies between elements 2 apart.*/
    for(k=0;k<8;k+=4){
      d[k]=t[k]+t[k+2];
      d[k+1]=t[k+1]+t[k+3];
      d[k+2]=t[k]-t[k+2];
      d[k+3]=t[k+1]-t[k+3];
    }
    /*Hadamard stage 3: butterflies between neighbors, stored transposed.*/
    for(k=0;k<8;k+=2){
      _buf[k*8+i]=(ogg_int16_t)(d[k]+d[k+1]);
      _buf[(k+1)*8+i]=(ogg_int16_t)(d[k]-d[k+1]);
    }
    _src+=_ystride;
    _ref1+=_ystride;
    _ref2+=_ystride;
  }
}
|
||||
|
||||
/*Applies an 8-point Hadamard transform to each row of an 8x8 block of
   pixels.
  The transform of row i is written to column i of _buf, i.e., the output is
   transposed.*/
static void oc_intra_hadamard(ogg_int16_t _buf[64],const unsigned char *_src,
 int _ystride){
  int i;
  for(i=0;i<8;i++){
    int d[8];
    int t[8];
    int k;
    /*Hadamard stage 1: butterflies between elements 4 apart.*/
    for(k=0;k<4;k++){
      t[k]=_src[k]+_src[k+4];
      t[k+4]=_src[k]-_src[k+4];
    }
    /*Hadamard stage 2: butterflies between elements 2 apart.*/
    for(k=0;k<8;k+=4){
      d[k]=t[k]+t[k+2];
      d[k+1]=t[k+1]+t[k+3];
      d[k+2]=t[k]-t[k+2];
      d[k+3]=t[k+1]-t[k+3];
    }
    /*Hadamard stage 3: butterflies between neighbors, stored transposed.*/
    for(k=0;k<8;k+=2){
      _buf[k*8+i]=(ogg_int16_t)(d[k]+d[k+1]);
      _buf[(k+1)*8+i]=(ogg_int16_t)(d[k]-d[k+1]);
    }
    _src+=_ystride;
  }
}
|
||||
|
||||
/*Applies an 8-point Hadamard transform to each row of _buf and sums the
   absolute values of the results.
  The first output of the first row is left out of the sum.
  _dc:  Receives the sum of the first eight entries of _buf.
  _buf: The 64 input values, read in rows of eight.
  Return: The sum of absolute transformed values described above.*/
unsigned oc_hadamard_sad(int *_dc,const ogg_int16_t _buf[64]){
  unsigned sad;
  int      i;
  sad=0;
  for(i=0;i<8;i++){
    const ogg_int16_t *row;
    int                t[8];
    int                u[8];
    int                r;
    int                k;
    row=_buf+i*8;
    /*Hadamard stage 1: butterflies between elements 4 apart.*/
    for(k=0;k<4;k++){
      t[k]=row[k]+row[k+4];
      t[k+4]=row[k]-row[k+4];
    }
    /*Hadamard stage 2: butterflies between elements 2 apart.*/
    for(k=0;k<8;k+=4){
      u[k]=t[k]+t[k+2];
      u[k+1]=t[k+1]+t[k+3];
      u[k+2]=t[k]-t[k+2];
      u[k+3]=t[k+1]-t[k+3];
    }
    /*Hadamard stage 3: butterflies between neighbors, folded straight into
       the sum.
      The mask is 0 for the first row and all ones otherwise, so the first
       output of the first row contributes nothing.*/
    r=abs(u[0]+u[1])&-(i>0);
    r+=abs(u[0]-u[1]);
    for(k=2;k<8;k+=2){
      r+=abs(u[k]+u[k+1]);
      r+=abs(u[k]-u[k+1]);
    }
    sad+=r;
  }
  *_dc=_buf[0]+_buf[1]+_buf[2]+_buf[3]+_buf[4]+_buf[5]+_buf[6]+_buf[7];
  return sad;
}
|
||||
|
||||
unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride){
|
||||
ogg_int16_t buf[64];
|
||||
oc_diff_hadamard(buf,_src,_ref,_ystride);
|
||||
return oc_hadamard_sad(_dc,buf);
|
||||
}
|
||||
|
||||
unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||
ogg_int16_t buf[64];
|
||||
oc_diff_hadamard2(buf,_src,_ref1,_ref2,_ystride);
|
||||
return oc_hadamard_sad(_dc,buf);
|
||||
}
|
||||
|
||||
unsigned oc_enc_frag_intra_satd_c(int *_dc,
|
||||
const unsigned char *_src,int _ystride){
|
||||
ogg_int16_t buf[64];
|
||||
oc_intra_hadamard(buf,_src,_ystride);
|
||||
return oc_hadamard_sad(_dc,buf);
|
||||
}
|
||||
|
||||
/*Computes the sum of squared differences over an 8x8 block.
  _src:     The first pixel of the source block.
  _ref:     The first pixel of the reference block.
  _ystride: The offset between rows, applied to both _src and _ref.
  Return: The sum of (_src-_ref)^2 over all 64 pixels.*/
unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  int      y;
  int      x;
  ret=0;
  for(y=0;y<8;y++){
    for(x=0;x<8;x++)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
    _src+=_ystride;
    _ref+=_ystride;
  }
  return ret;
}
|
||||
|
||||
/*Computes the sum of squared differences over the pixels of an 8x8 block
   that are selected by a bit mask.
  _src:     The first pixel of the source block.
  _ref:     The first pixel of the reference block.
  _ystride: The offset between rows, applied to both _src and _ref.
  _mask:    One bit per pixel in row-major order, starting from the least
             significant bit; only pixels whose bit is set are counted.
  Return: The sum of (_src-_ref)^2 over the selected pixels.*/
unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  unsigned ret;
  int      y;
  int      x;
  ret=0;
  for(y=0;y<8;y++){
    /*The mask advances one bit per pixel, whether or not it was counted.*/
    for(x=0;x<8;x++,_mask>>=1){
      if(_mask&1)ret+=(_src[x]-_ref[x])*(_src[x]-_ref[x]);
    }
    _src+=_ystride;
    _ref+=_ystride;
  }
  return ret;
}
|
||||
|
||||
/*Writes the truncating average of two 8x8 blocks to a third.
  _dst:     The first pixel of the destination block.
  _src1:    The first pixel of the first source block.
  _src2:    The first pixel of the second source block.
  _ystride: The offset between rows, applied to all three blocks.*/
void oc_enc_frag_copy2_c(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  int i;
  int j;
  for(i=0;i<8;i++){
    for(j=0;j<8;j++)_dst[j]=(_src1[j]+_src2[j])>>1;
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
|
||||
121
engine/thirdparty/libtheora/encinfo.c
vendored
Normal file
121
engine/thirdparty/libtheora/encinfo.c
vendored
Normal file
|
|
@ -0,0 +1,121 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "state.h"
|
||||
#include "enquant.h"
|
||||
#include "huffenc.h"
|
||||
|
||||
|
||||
|
||||
/*Packs a series of octets from a given byte array into the pack buffer.
|
||||
_opb: The pack buffer to store the octets in.
|
||||
_buf: The byte array containing the bytes to pack.
|
||||
_len: The number of octets to pack.*/
|
||||
static void oc_pack_octets(oggpack_buffer *_opb,const char *_buf,int _len){
|
||||
int i;
|
||||
for(i=0;i<_len;i++)oggpackB_write(_opb,_buf[i],8);
|
||||
}
|
||||
|
||||
|
||||
|
||||
int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
|
||||
oggpack_buffer *_opb,const th_quant_info *_qinfo,
|
||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
|
||||
const char *_vendor,th_comment *_tc,ogg_packet *_op){
|
||||
unsigned char *packet;
|
||||
int b_o_s;
|
||||
if(_op==NULL)return TH_EFAULT;
|
||||
switch(*_packet_state){
|
||||
/*Codec info header.*/
|
||||
case OC_PACKET_INFO_HDR:{
|
||||
if(_state==NULL)return TH_EFAULT;
|
||||
oggpackB_reset(_opb);
|
||||
/*Mark this packet as the info header.*/
|
||||
oggpackB_write(_opb,0x80,8);
|
||||
/*Write the codec string.*/
|
||||
oc_pack_octets(_opb,"theora",6);
|
||||
/*Write the codec bitstream version.*/
|
||||
oggpackB_write(_opb,TH_VERSION_MAJOR,8);
|
||||
oggpackB_write(_opb,TH_VERSION_MINOR,8);
|
||||
oggpackB_write(_opb,TH_VERSION_SUB,8);
|
||||
/*Describe the encoded frame.*/
|
||||
oggpackB_write(_opb,_state->info.frame_width>>4,16);
|
||||
oggpackB_write(_opb,_state->info.frame_height>>4,16);
|
||||
oggpackB_write(_opb,_state->info.pic_width,24);
|
||||
oggpackB_write(_opb,_state->info.pic_height,24);
|
||||
oggpackB_write(_opb,_state->info.pic_x,8);
|
||||
oggpackB_write(_opb,_state->info.pic_y,8);
|
||||
oggpackB_write(_opb,_state->info.fps_numerator,32);
|
||||
oggpackB_write(_opb,_state->info.fps_denominator,32);
|
||||
oggpackB_write(_opb,_state->info.aspect_numerator,24);
|
||||
oggpackB_write(_opb,_state->info.aspect_denominator,24);
|
||||
oggpackB_write(_opb,_state->info.colorspace,8);
|
||||
oggpackB_write(_opb,_state->info.target_bitrate,24);
|
||||
oggpackB_write(_opb,_state->info.quality,6);
|
||||
oggpackB_write(_opb,_state->info.keyframe_granule_shift,5);
|
||||
oggpackB_write(_opb,_state->info.pixel_fmt,2);
|
||||
/*Spare configuration bits.*/
|
||||
oggpackB_write(_opb,0,3);
|
||||
b_o_s=1;
|
||||
}break;
|
||||
/*Comment header.*/
|
||||
case OC_PACKET_COMMENT_HDR:{
|
||||
int vendor_len;
|
||||
int i;
|
||||
if(_tc==NULL)return TH_EFAULT;
|
||||
vendor_len=strlen(_vendor);
|
||||
oggpackB_reset(_opb);
|
||||
/*Mark this packet as the comment header.*/
|
||||
oggpackB_write(_opb,0x81,8);
|
||||
/*Write the codec string.*/
|
||||
oc_pack_octets(_opb,"theora",6);
|
||||
/*Write the vendor string.*/
|
||||
oggpack_write(_opb,vendor_len,32);
|
||||
oc_pack_octets(_opb,_vendor,vendor_len);
|
||||
oggpack_write(_opb,_tc->comments,32);
|
||||
for(i=0;i<_tc->comments;i++){
|
||||
if(_tc->user_comments[i]!=NULL){
|
||||
oggpack_write(_opb,_tc->comment_lengths[i],32);
|
||||
oc_pack_octets(_opb,_tc->user_comments[i],_tc->comment_lengths[i]);
|
||||
}
|
||||
else oggpack_write(_opb,0,32);
|
||||
}
|
||||
b_o_s=0;
|
||||
}break;
|
||||
/*Codec setup header.*/
|
||||
case OC_PACKET_SETUP_HDR:{
|
||||
int ret;
|
||||
oggpackB_reset(_opb);
|
||||
/*Mark this packet as the setup header.*/
|
||||
oggpackB_write(_opb,0x82,8);
|
||||
/*Write the codec string.*/
|
||||
oc_pack_octets(_opb,"theora",6);
|
||||
/*Write the quantizer tables.*/
|
||||
oc_quant_params_pack(_opb,_qinfo);
|
||||
/*Write the huffman codes.*/
|
||||
ret=oc_huff_codes_pack(_opb,_codes);
|
||||
/*This should never happen, because we validate the tables when they
|
||||
are set.
|
||||
If you see, it's a good chance memory is being corrupted.*/
|
||||
if(ret<0)return ret;
|
||||
b_o_s=0;
|
||||
}break;
|
||||
/*No more headers to emit.*/
|
||||
default:return 0;
|
||||
}
|
||||
/*This is kind of fugly: we hand the user a buffer which they do not own.
|
||||
We will overwrite it when the next packet is output, so the user better be
|
||||
done with it by then.
|
||||
Vorbis is little better: it hands back buffers that it will free the next
|
||||
time the headers are requested, or when the encoder is cleared.
|
||||
Hopefully libogg2 will make this much cleaner.*/
|
||||
packet=oggpackB_get_buffer(_opb);
|
||||
/*If there's no packet, malloc failed while writing.*/
|
||||
if(packet==NULL)return TH_EFAULT;
|
||||
_op->packet=packet;
|
||||
_op->bytes=oggpackB_bytes(_opb);
|
||||
_op->b_o_s=b_o_s;
|
||||
_op->e_o_s=0;
|
||||
_op->granulepos=0;
|
||||
_op->packetno=*_packet_state+3;
|
||||
return ++(*_packet_state)+3;
|
||||
}
|
||||
845
engine/thirdparty/libtheora/encint.h
vendored
Normal file
845
engine/thirdparty/libtheora/encint.h
vendored
Normal file
|
|
@ -0,0 +1,845 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_encint_H)
|
||||
# define _encint_H (1)
|
||||
# include "theora/theoraenc.h"
|
||||
# include "state.h"
|
||||
# include "mathops.h"
|
||||
# include "enquant.h"
|
||||
# include "huffenc.h"
|
||||
/*# define OC_COLLECT_METRICS*/
|
||||
|
||||
|
||||
|
||||
/*A pair of motion vectors.*/
typedef oc_mv                         oc_mv2[2];

/*Short names for the encoder's internal structures.*/
typedef struct oc_enc_opt_vtable      oc_enc_opt_vtable;
typedef struct oc_enc_opt_data        oc_enc_opt_data;
typedef struct oc_mb_enc_info         oc_mb_enc_info;
typedef struct oc_mode_scheme_chooser oc_mode_scheme_chooser;
typedef struct oc_fr_state            oc_fr_state;
typedef struct oc_qii_state           oc_qii_state;
typedef struct oc_enc_pipeline_state  oc_enc_pipeline_state;
typedef struct oc_mode_rd             oc_mode_rd;
typedef struct oc_iir_filter          oc_iir_filter;
typedef struct oc_frame_metrics       oc_frame_metrics;
typedef struct oc_rc_state            oc_rc_state;
/*The internal name for the public th_enc_ctx struct.*/
typedef struct th_enc_ctx             oc_enc_ctx;
typedef struct oc_token_checkpoint    oc_token_checkpoint;
|
||||
|
||||
|
||||
|
||||
/*Encoder-specific accelerated functions.*/
|
||||
# if defined(OC_X86_ASM)
|
||||
# if defined(_MSC_VER)
|
||||
# include "x86_vc/x86enc.h"
|
||||
# else
|
||||
# include "x86/x86enc.h"
|
||||
# endif
|
||||
# endif
|
||||
# if defined(OC_ARM_ASM)
|
||||
# include "arm/armenc.h"
|
||||
# endif
|
||||
|
||||
/*Dispatch macros for the encoder's accelerated functions.
  Any macro already defined at this point is left alone.
  The remaining ones either call through the encoder's opt_vtable (when
   OC_ENC_USE_VTABLE is defined) or call the C implementation directly, in
   which case the _enc argument is unused.*/
# if !defined(oc_enc_accel_init)
#  define oc_enc_accel_init oc_enc_accel_init_c
# endif
# if defined(OC_ENC_USE_VTABLE)
#  if !defined(oc_enc_frag_sub)
#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
  ((*(_enc)->opt_vtable.frag_sub)(_diff,_src,_ref,_ystride))
#  endif
#  if !defined(oc_enc_frag_sub_128)
#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
  ((*(_enc)->opt_vtable.frag_sub_128)(_diff,_src,_ystride))
#  endif
#  if !defined(oc_enc_frag_sad)
#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
  ((*(_enc)->opt_vtable.frag_sad)(_src,_ref,_ystride))
#  endif
#  if !defined(oc_enc_frag_sad_thresh)
#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
  ((*(_enc)->opt_vtable.frag_sad_thresh)(_src,_ref,_ystride,_thresh))
#  endif
#  if !defined(oc_enc_frag_sad2_thresh)
#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
  ((*(_enc)->opt_vtable.frag_sad2_thresh)(_src,_ref1,_ref2,_ystride,_thresh))
#  endif
#  if !defined(oc_enc_frag_intra_sad)
#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
  ((*(_enc)->opt_vtable.frag_intra_sad)(_src,_ystride))
#  endif
#  if !defined(oc_enc_frag_satd)
#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
  ((*(_enc)->opt_vtable.frag_satd)(_dc,_src,_ref,_ystride))
#  endif
#  if !defined(oc_enc_frag_satd2)
#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
  ((*(_enc)->opt_vtable.frag_satd2)(_dc,_src,_ref1,_ref2,_ystride))
#  endif
#  if !defined(oc_enc_frag_intra_satd)
#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
  ((*(_enc)->opt_vtable.frag_intra_satd)(_dc,_src,_ystride))
#  endif
#  if !defined(oc_enc_frag_ssd)
#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
  ((*(_enc)->opt_vtable.frag_ssd)(_src,_ref,_ystride))
#  endif
#  if !defined(oc_enc_frag_border_ssd)
#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
  ((*(_enc)->opt_vtable.frag_border_ssd)(_src,_ref,_ystride,_mask))
#  endif
#  if !defined(oc_enc_frag_copy2)
#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
  ((*(_enc)->opt_vtable.frag_copy2)(_dst,_src1,_src2,_ystride))
#  endif
#  if !defined(oc_enc_enquant_table_init)
#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
  ((*(_enc)->opt_vtable.enquant_table_init)(_enquant,_dequant))
#  endif
#  if !defined(oc_enc_enquant_table_fixup)
#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
  ((*(_enc)->opt_vtable.enquant_table_fixup)(_enquant,_nqis))
#  endif
#  if !defined(oc_enc_quantize)
#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
  ((*(_enc)->opt_vtable.quantize)(_qdct,_dct,_dequant,_enquant))
#  endif
#  if !defined(oc_enc_frag_recon_intra)
#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
  ((*(_enc)->opt_vtable.frag_recon_intra)(_dst,_ystride,_residue))
#  endif
#  if !defined(oc_enc_frag_recon_inter)
#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
  ((*(_enc)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
#  endif
#  if !defined(oc_enc_fdct8x8)
#   define oc_enc_fdct8x8(_enc,_y,_x) \
  ((*(_enc)->opt_vtable.fdct8x8)(_y,_x))
#  endif
# else
#  if !defined(oc_enc_frag_sub)
#   define oc_enc_frag_sub(_enc,_diff,_src,_ref,_ystride) \
  oc_enc_frag_sub_c(_diff,_src,_ref,_ystride)
#  endif
#  if !defined(oc_enc_frag_sub_128)
#   define oc_enc_frag_sub_128(_enc,_diff,_src,_ystride) \
  oc_enc_frag_sub_128_c(_diff,_src,_ystride)
#  endif
#  if !defined(oc_enc_frag_sad)
#   define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
  oc_enc_frag_sad_c(_src,_ref,_ystride)
#  endif
#  if !defined(oc_enc_frag_sad_thresh)
#   define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
  oc_enc_frag_sad_thresh_c(_src,_ref,_ystride,_thresh)
#  endif
#  if !defined(oc_enc_frag_sad2_thresh)
#   define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
  oc_enc_frag_sad2_thresh_c(_src,_ref1,_ref2,_ystride,_thresh)
#  endif
#  if !defined(oc_enc_frag_intra_sad)
#   define oc_enc_frag_intra_sad(_enc,_src,_ystride) \
  oc_enc_frag_intra_sad_c(_src,_ystride)
#  endif
#  if !defined(oc_enc_frag_satd)
#   define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
  oc_enc_frag_satd_c(_dc,_src,_ref,_ystride)
#  endif
#  if !defined(oc_enc_frag_satd2)
#   define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
  oc_enc_frag_satd2_c(_dc,_src,_ref1,_ref2,_ystride)
#  endif
#  if !defined(oc_enc_frag_intra_satd)
#   define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
  oc_enc_frag_intra_satd_c(_dc,_src,_ystride)
#  endif
#  if !defined(oc_enc_frag_ssd)
#   define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
  oc_enc_frag_ssd_c(_src,_ref,_ystride)
#  endif
#  if !defined(oc_enc_frag_border_ssd)
#   define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
  oc_enc_frag_border_ssd_c(_src,_ref,_ystride,_mask)
#  endif
#  if !defined(oc_enc_frag_copy2)
#   define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
  oc_enc_frag_copy2_c(_dst,_src1,_src2,_ystride)
#  endif
#  if !defined(oc_enc_enquant_table_init)
#   define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
  oc_enc_enquant_table_init_c(_enquant,_dequant)
#  endif
#  if !defined(oc_enc_enquant_table_fixup)
#   define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
  oc_enc_enquant_table_fixup_c(_enquant,_nqis)
#  endif
#  if !defined(oc_enc_quantize)
#   define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
  oc_enc_quantize_c(_qdct,_dct,_dequant,_enquant)
#  endif
#  if !defined(oc_enc_frag_recon_intra)
#   define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
  oc_frag_recon_intra_c(_dst,_ystride,_residue)
#  endif
#  if !defined(oc_enc_frag_recon_inter)
#   define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
  oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
#  endif
#  if !defined(oc_enc_fdct8x8)
#   define oc_enc_fdct8x8(_enc,_y,_x) oc_enc_fdct8x8_c(_y,_x)
#  endif
# endif
|
||||
|
||||
|
||||
|
||||
/*Constants for the packet-out state machine specific to the encoder.*/

/*Next packet to emit: Data packet, but none are ready yet.*/
# define OC_PACKET_EMPTY (0)
/*Next packet to emit: Data packet, and one is ready.*/
# define OC_PACKET_READY (1)

/*Speed levels; each one is described by what it adds over the one before.*/

/*All features enabled.*/
# define OC_SP_LEVEL_SLOW          (0)
/*Enable early skip.*/
# define OC_SP_LEVEL_EARLY_SKIP    (1)
/*Use analysis shortcuts, single quantizer, and faster tokenization.*/
# define OC_SP_LEVEL_FAST_ANALYSIS (2)
/*Use SAD instead of SATD*/
# define OC_SP_LEVEL_NOSATD        (3)
/*Disable motion compensation.*/
# define OC_SP_LEVEL_NOMC          (4)
/*Maximum valid speed level.*/
# define OC_SP_LEVEL_MAX           (4)
|
||||
|
||||
|
||||
/*The number of extra bits of precision at which to store rate metrics.*/
# define OC_BIT_SCALE  (6)
/*The number of extra bits of precision at which to store RMSE metrics.
  This must be at least half OC_BIT_SCALE (rounded up).*/
# define OC_RMSE_SCALE (5)
/*The number of quantizer bins to partition statistics into.*/
# define OC_LOGQ_BINS  (8)
/*The number of SAD/SATD bins to partition statistics into.*/
# define OC_COMP_BINS  (24)
/*The number of bits of precision to drop from SAD and SATD scores
   to assign them to a bin.*/
# define OC_SAD_SHIFT  (6)
# define OC_SATD_SHIFT (9)

/*Masking is applied by scaling the D used in R-D optimization (via rd_scale)
   or the lambda parameter (via rd_iscale).
  These are only equivalent within a single block; when more than one block is
   being considered, the former is the interpretation used.*/

/*This must be at least 4 for OC_RD_SKIP_SCALE() to work below.*/
# define OC_RD_SCALE_BITS  (12-OC_BIT_SCALE)
# define OC_RD_ISCALE_BITS (11)

/*This macro is applied to _ssd values with just 4 bits of headroom
   ((15-OC_RMSE_SCALE)*2+OC_BIT_SCALE+2); since we want to allow rd_scales as
   large as 16, and need additional fractional bits, our only recourse that
   doesn't lose precision on blocks with very small SSDs is to use a wider
   multiply.
  Both variants compute _ssd*_rd_scale, rounded, with OC_RD_SCALE_BITS
   fractional bits dropped.
  When long is only 32 bits, the product is split into the high and low parts
   of _ssd so that neither partial product needs the wider type; note that
   this variant evaluates its arguments twice.*/
# if LONG_MAX>2147483647
#  define OC_RD_SCALE(_ssd,_rd_scale) \
 ((unsigned)(((unsigned long)(_ssd)*(_rd_scale) \
 +((1<<OC_RD_SCALE_BITS)>>1))>>OC_RD_SCALE_BITS))
# else
#  define OC_RD_SCALE(_ssd,_rd_scale) \
 (((_ssd)>>OC_RD_SCALE_BITS)*(_rd_scale) \
 +((((_ssd)&((1<<OC_RD_SCALE_BITS)-1))*(_rd_scale) \
 +((1<<OC_RD_SCALE_BITS)>>1))>>OC_RD_SCALE_BITS))
# endif
/*As OC_RD_SCALE(), but drops 4 fewer fractional bits and uses no wider
   type.*/
# define OC_RD_SKIP_SCALE(_ssd,_rd_scale) \
 (((_ssd)*(_rd_scale)+((1<<(OC_RD_SCALE_BITS-4))>>1)) \
 >>(OC_RD_SCALE_BITS-4))
/*Computes _lambda*_rd_iscale, rounded, with OC_RD_ISCALE_BITS fractional
   bits dropped.*/
# define OC_RD_ISCALE(_lambda,_rd_iscale) \
 (((_lambda)*(_rd_iscale)+((1<<OC_RD_ISCALE_BITS)>>1))>>OC_RD_ISCALE_BITS)
|
||||
|
||||
|
||||
/*The bits used for each of the MB mode codebooks.*/
|
||||
extern const unsigned char OC_MODE_BITS[2][OC_NMODES];
|
||||
|
||||
/*The bits used for each of the MV codebooks.*/
|
||||
extern const unsigned char OC_MV_BITS[2][64];
|
||||
|
||||
/*The minimum value that can be stored in a SB run for each codeword.
|
||||
The last entry is the upper bound on the length of a single SB run.*/
|
||||
extern const ogg_uint16_t OC_SB_RUN_VAL_MIN[8];
|
||||
/*The bits used for each SB run codeword.*/
|
||||
extern const unsigned char OC_SB_RUN_CODE_NBITS[7];
|
||||
|
||||
/*The bits used for each block run length (starting with 1).*/
|
||||
extern const unsigned char OC_BLOCK_RUN_CODE_NBITS[30];
|
||||
|
||||
|
||||
|
||||
/*Encoder specific functions with accelerated variants.*/
|
||||
struct oc_enc_opt_vtable{
|
||||
void (*frag_sub)(ogg_int16_t _diff[64],const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
void (*frag_sub_128)(ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,int _ystride);
|
||||
unsigned (*frag_sad)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned (*frag_sad_thresh)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned (*frag_sad2_thresh)(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned (*frag_intra_sad)(const unsigned char *_src,int _ystride);
|
||||
unsigned (*frag_satd)(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned (*frag_satd2)(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||
unsigned (*frag_intra_satd)(int *_dc,const unsigned char *_src,int _ystride);
|
||||
unsigned (*frag_ssd)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned (*frag_border_ssd)(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
|
||||
void (*frag_copy2)(unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
void (*enquant_table_init)(void *_enquant,
|
||||
const ogg_uint16_t _dequant[64]);
|
||||
void (*enquant_table_fixup)(void *_enquant[3][3][2],int _nqis);
|
||||
int (*quantize)(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
const ogg_uint16_t _dequant[64],const void *_enquant);
|
||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*fdct8x8)(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
};
|
||||
|
||||
|
||||
/*Encoder-specific data whose values depend on which variants of the
   accelerated functions are in use.*/
struct oc_enc_opt_data{
  /*Size of one quantizer table.
    Must be a multiple of enquant_table_alignment.*/
  size_t enquant_table_size;
  /*Alignment the quantizer tables require.
    Must be a positive power of two.*/
  int    enquant_table_alignment;
};
|
||||
|
||||
|
||||
void oc_enc_accel_init(oc_enc_ctx *_enc);
|
||||
|
||||
|
||||
|
||||
/*Encoder-specific macroblock information.*/
|
||||
struct oc_mb_enc_info{
|
||||
/*Neighboring macro blocks that have MVs available from the current frame.*/
|
||||
unsigned cneighbors[4];
|
||||
/*Neighboring macro blocks to use for MVs from the previous frame.*/
|
||||
unsigned pneighbors[4];
|
||||
/*The number of current-frame neighbors.*/
|
||||
unsigned char ncneighbors;
|
||||
/*The number of previous-frame neighbors.*/
|
||||
unsigned char npneighbors;
|
||||
/*Flags indicating which MB modes have been refined.*/
|
||||
unsigned char refined;
|
||||
/*Motion vectors for a macro block for the current frame and the
|
||||
previous two frames.
|
||||
Each is a set of 2 vectors against OC_FRAME_GOLD and OC_FRAME_PREV, which
|
||||
can be used to estimate constant velocity and constant acceleration
|
||||
predictors.
|
||||
Uninitialized MVs are (0,0).*/
|
||||
oc_mv2 analysis_mv[3];
|
||||
/*Current unrefined analysis MVs.*/
|
||||
oc_mv unref_mv[2];
|
||||
/*Unrefined block MVs.*/
|
||||
oc_mv block_mv[4];
|
||||
/*Refined block MVs.*/
|
||||
oc_mv ref_mv[4];
|
||||
/*Minimum motion estimation error from the analysis stage.*/
|
||||
ogg_uint16_t error[2];
|
||||
/*MB error for half-pel refinement for each frame type.*/
|
||||
unsigned satd[2];
|
||||
/*Block error for half-pel refinement.*/
|
||||
unsigned block_satd[4];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*State machine to estimate the opportunity cost of coding a MB mode.*/
|
||||
struct oc_mode_scheme_chooser{
|
||||
/*Pointers to the a list containing the index of each mode in the mode
|
||||
alphabet used by each scheme.
|
||||
The first entry points to the dynamic scheme0_ranks, while the remaining 7
|
||||
point to the constant entries stored in OC_MODE_SCHEMES.*/
|
||||
const unsigned char *mode_ranks[8];
|
||||
/*The ranks for each mode when coded with scheme 0.
|
||||
These are optimized so that the more frequent modes have lower ranks.*/
|
||||
unsigned char scheme0_ranks[OC_NMODES];
|
||||
/*The list of modes, sorted in descending order of frequency, that
|
||||
corresponds to the ranks above.*/
|
||||
unsigned char scheme0_list[OC_NMODES];
|
||||
/*The number of times each mode has been chosen so far.*/
|
||||
unsigned mode_counts[OC_NMODES];
|
||||
/*The list of mode coding schemes, sorted in ascending order of bit cost.*/
|
||||
unsigned char scheme_list[8];
|
||||
/*The number of bits used by each mode coding scheme.*/
|
||||
ptrdiff_t scheme_bits[8];
|
||||
};
|
||||
|
||||
|
||||
void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser);
|
||||
|
||||
|
||||
|
||||
/*Tracks the coded block flags and what they cost in bits.
  The bits needed to code or skip the next block are measured as an
   opportunity cost: the cheaper of coding the super block fully or
   partially, for as long as both remain possible.*/
struct oc_fr_state{
  /*Bits needed for the coded block flags so far in this frame.*/
  ptrdiff_t  bits;
  /*Length of the current partial super block flag run, excluding the current
     super block.*/
  unsigned   sb_partial_count:16;
  /*Length of the current full super block flag run, excluding the current
     super block.*/
  unsigned   sb_full_count:16;
  /*Length of the coded block flag run at the start of the current super
     block.*/
  unsigned   b_coded_count_prev:6;
  /*The coded block flag at the start of the current super block.*/
  signed int b_coded_prev:2;
  /*Length of the current coded block flag run.*/
  unsigned   b_coded_count:6;
  /*The current coded block flag.*/
  signed int b_coded:2;
  /*Number of blocks processed so far in the current super block.*/
  unsigned   b_count:5;
  /*Whether coding the current super block partially is cheaper, even though
     it could still be coded fully.*/
  unsigned   sb_prefer_partial:1;
  /*Whether the last super block was coded partially.*/
  signed int sb_partial:2;
  /*Bits needed for the flags of the current super block.*/
  unsigned   sb_bits:6;
  /*Whether the last non-partial super block was coded fully.*/
  signed int sb_full:2;
};
|
||||
|
||||
|
||||
|
||||
/*NOTE(review): this struct is undocumented in the header.
  Its layout parallels oc_fr_state (a bit total followed by run-count/flag
   bit-field pairs), so it presumably tracks the cost of the qii flags the
   same way -- confirm against the analysis code before relying on this.*/
struct oc_qii_state{
  ptrdiff_t  bits;
  unsigned   qi01_count:14;
  signed int qi01:2;
  unsigned   qi12_count:14;
  signed int qi12:2;
};
|
||||
|
||||
|
||||
|
||||
/*Temporary encoder state for the analysis pipeline.*/
|
||||
struct oc_enc_pipeline_state{
|
||||
/*DCT coefficient storage.
|
||||
This is kept off the stack because a) gcc can't align things on the stack
|
||||
reliably on ARM, and b) it avoids (unintentional) data hazards between
|
||||
ARM and NEON code.*/
|
||||
OC_ALIGN16(ogg_int16_t dct_data[64*3]);
|
||||
OC_ALIGN16(signed char bounding_values[256]);
|
||||
oc_fr_state fr[3];
|
||||
oc_qii_state qs[3];
|
||||
/*Skip SSD storage for the current MCU in each plane.*/
|
||||
unsigned *skip_ssd[3];
|
||||
/*Coded/uncoded fragment lists for each plane for the current MCU.*/
|
||||
ptrdiff_t *coded_fragis[3];
|
||||
ptrdiff_t *uncoded_fragis[3];
|
||||
ptrdiff_t ncoded_fragis[3];
|
||||
ptrdiff_t nuncoded_fragis[3];
|
||||
/*The starting fragment for the current MCU in each plane.*/
|
||||
ptrdiff_t froffset[3];
|
||||
/*The starting row for the current MCU in each plane.*/
|
||||
int fragy0[3];
|
||||
/*The ending row for the current MCU in each plane.*/
|
||||
int fragy_end[3];
|
||||
/*The starting superblock for the current MCU in each plane.*/
|
||||
unsigned sbi0[3];
|
||||
/*The ending superblock for the current MCU in each plane.*/
|
||||
unsigned sbi_end[3];
|
||||
/*The number of tokens for zzi=1 for each color plane.*/
|
||||
int ndct_tokens1[3];
|
||||
/*The outstanding eob_run count for zzi=1 for each color plane.*/
|
||||
int eob_run1[3];
|
||||
/*Whether or not the loop filter is enabled.*/
|
||||
int loop_filter;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Statistics used to estimate R-D cost of a block in a given coding mode.
|
||||
See modedec.h for more details.*/
|
||||
struct oc_mode_rd{
|
||||
/*The expected bits used by the DCT tokens, shifted by OC_BIT_SCALE.*/
|
||||
ogg_int16_t rate;
|
||||
/*The expected square root of the sum of squared errors, shifted by
|
||||
OC_RMSE_SCALE.*/
|
||||
ogg_int16_t rmse;
|
||||
};
|
||||
|
||||
# if defined(OC_COLLECT_METRICS)
|
||||
# include "collect.h"
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
/*A 2nd order low-pass Bessel follower.
|
||||
We use this for rate control because it has fast reaction time, but is
|
||||
critically damped.*/
|
||||
struct oc_iir_filter{
|
||||
ogg_int32_t c[2];
|
||||
ogg_int64_t g;
|
||||
ogg_int32_t x[2];
|
||||
ogg_int32_t y[2];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*The 2-pass metrics associated with a single frame.*/
|
||||
struct oc_frame_metrics{
|
||||
/*The log base 2 of the scale factor for this frame in Q24 format.*/
|
||||
ogg_int32_t log_scale;
|
||||
/*The number of application-requested duplicates of this frame.*/
|
||||
unsigned dup_count:31;
|
||||
/*The frame type from pass 1.*/
|
||||
unsigned frame_type:1;
|
||||
/*The frame activity average from pass 1.*/
|
||||
unsigned activity_avg;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Rate control state information.*/
|
||||
struct oc_rc_state{
|
||||
/*The target average bits per frame.*/
|
||||
ogg_int64_t bits_per_frame;
|
||||
/*The current buffer fullness (bits available to be used).*/
|
||||
ogg_int64_t fullness;
|
||||
/*The target buffer fullness.
|
||||
This is where we'd like to be by the last keyframe the appears in the next
|
||||
buf_delay frames.*/
|
||||
ogg_int64_t target;
|
||||
/*The maximum buffer fullness (total size of the buffer).*/
|
||||
ogg_int64_t max;
|
||||
/*The log of the number of pixels in a frame in Q57 format.*/
|
||||
ogg_int64_t log_npixels;
|
||||
/*The exponent used in the rate model in Q8 format.*/
|
||||
unsigned exp[2];
|
||||
/*The number of frames to distribute the buffer usage over.*/
|
||||
int buf_delay;
|
||||
/*The total drop count from the previous frame.
|
||||
This includes duplicates explicitly requested via the
|
||||
TH_ENCCTL_SET_DUP_COUNT API as well as frames we chose to drop ourselves.*/
|
||||
ogg_uint32_t prev_drop_count;
|
||||
/*The log of an estimated scale factor used to obtain the real framerate, for
|
||||
VFR sources or, e.g., 12 fps content doubled to 24 fps, etc.*/
|
||||
ogg_int64_t log_drop_scale;
|
||||
/*The log of estimated scale factor for the rate model in Q57 format.*/
|
||||
ogg_int64_t log_scale[2];
|
||||
/*The log of the target quantizer level in Q57 format.*/
|
||||
ogg_int64_t log_qtarget;
|
||||
/*Will we drop frames to meet bitrate target?*/
|
||||
unsigned char drop_frames;
|
||||
/*Do we respect the maximum buffer fullness?*/
|
||||
unsigned char cap_overflow;
|
||||
/*Can the reservoir go negative?*/
|
||||
unsigned char cap_underflow;
|
||||
/*Second-order lowpass filters to track scale and VFR.*/
|
||||
oc_iir_filter scalefilter[2];
|
||||
int inter_count;
|
||||
int inter_delay;
|
||||
int inter_delay_target;
|
||||
oc_iir_filter vfrfilter;
|
||||
/*Two-pass mode state.
|
||||
0 => 1-pass encoding.
|
||||
1 => 1st pass of 2-pass encoding.
|
||||
2 => 2nd pass of 2-pass encoding.*/
|
||||
int twopass;
|
||||
/*Buffer for current frame metrics.*/
|
||||
unsigned char twopass_buffer[48];
|
||||
/*The number of bytes in the frame metrics buffer.
|
||||
When 2-pass encoding is enabled, this is set to 0 after each frame is
|
||||
submitted, and must be non-zero before the next frame will be accepted.*/
|
||||
int twopass_buffer_bytes;
|
||||
int twopass_buffer_fill;
|
||||
/*Whether or not to force the next frame to be a keyframe.*/
|
||||
unsigned char twopass_force_kf;
|
||||
/*The metrics for the previous frame.*/
|
||||
oc_frame_metrics prev_metrics;
|
||||
/*The metrics for the current frame.*/
|
||||
oc_frame_metrics cur_metrics;
|
||||
/*The buffered metrics for future frames.*/
|
||||
oc_frame_metrics *frame_metrics;
|
||||
int nframe_metrics;
|
||||
int cframe_metrics;
|
||||
/*The index of the current frame in the circular metric buffer.*/
|
||||
int frame_metrics_head;
|
||||
/*The frame count of each type (keyframes, delta frames, and dup frames);
|
||||
32 bits limits us to 2.268 years at 60 fps.*/
|
||||
ogg_uint32_t frames_total[3];
|
||||
/*The number of frames of each type yet to be processed.*/
|
||||
ogg_uint32_t frames_left[3];
|
||||
/*The sum of the scale values for each frame type.*/
|
||||
ogg_int64_t scale_sum[2];
|
||||
/*The start of the window over which the current scale sums are taken.*/
|
||||
int scale_window0;
|
||||
/*The end of the window over which the current scale sums are taken.*/
|
||||
int scale_window_end;
|
||||
/*The frame count of each type in the current 2-pass window; this does not
|
||||
include dup frames.*/
|
||||
int nframes[3];
|
||||
/*The total accumulated estimation bias.*/
|
||||
ogg_int64_t rate_bias;
|
||||
};
|
||||
|
||||
|
||||
void oc_rc_state_init(oc_rc_state *_rc,oc_enc_ctx *_enc);
|
||||
void oc_rc_state_clear(oc_rc_state *_rc);
|
||||
|
||||
void oc_enc_rc_resize(oc_enc_ctx *_enc);
|
||||
int oc_enc_select_qi(oc_enc_ctx *_enc,int _qti,int _clamp);
|
||||
void oc_enc_calc_lambda(oc_enc_ctx *_enc,int _frame_type);
|
||||
int oc_enc_update_rc_state(oc_enc_ctx *_enc,
|
||||
long _bits,int _qti,int _qi,int _trial,int _droppable);
|
||||
int oc_enc_rc_2pass_out(oc_enc_ctx *_enc,unsigned char **_buf);
|
||||
int oc_enc_rc_2pass_in(oc_enc_ctx *_enc,unsigned char *_buf,size_t _bytes);
|
||||
|
||||
|
||||
|
||||
/*The internal encoder state.*/
|
||||
struct th_enc_ctx{
|
||||
/*Shared encoder/decoder state.*/
|
||||
oc_theora_state state;
|
||||
/*Buffer in which to assemble packets.*/
|
||||
oggpack_buffer opb;
|
||||
/*Encoder-specific macroblock information.*/
|
||||
oc_mb_enc_info *mb_info;
|
||||
/*DC coefficients after prediction.*/
|
||||
ogg_int16_t *frag_dc;
|
||||
/*The list of coded macro blocks, in coded order.*/
|
||||
unsigned *coded_mbis;
|
||||
/*The number of coded macro blocks.*/
|
||||
size_t ncoded_mbis;
|
||||
/*Whether or not packets are ready to be emitted.
|
||||
This takes on negative values while there are remaining header packets to
|
||||
be emitted, reaches 0 when the codec is ready for input, and becomes
|
||||
positive when a frame has been processed and data packets are ready.*/
|
||||
int packet_state;
|
||||
/*The maximum distance between keyframes.*/
|
||||
ogg_uint32_t keyframe_frequency_force;
|
||||
/*The number of duplicates to produce for the next frame.*/
|
||||
ogg_uint32_t dup_count;
|
||||
/*The number of duplicates remaining to be emitted for the current frame.*/
|
||||
ogg_uint32_t nqueued_dups;
|
||||
/*The number of duplicates emitted for the last frame.*/
|
||||
ogg_uint32_t prev_dup_count;
|
||||
/*The current speed level.*/
|
||||
int sp_level;
|
||||
/*Whether or not VP3 compatibility mode has been enabled.*/
|
||||
unsigned char vp3_compatible;
|
||||
/*Whether or not any INTER frames have been coded.*/
|
||||
unsigned char coded_inter_frame;
|
||||
/*Whether or not previous frame was dropped.*/
|
||||
unsigned char prevframe_dropped;
|
||||
/*Stores most recently chosen Huffman tables for each frame type, DC and AC
|
||||
coefficients, and luma and chroma tokens.
|
||||
The actual Huffman table used for a given coefficient depends not only on
|
||||
the choice made here, but also its index in the zig-zag ordering.*/
|
||||
unsigned char huff_idxs[2][2][2];
|
||||
/*Current count of bits used by each MV coding mode.*/
|
||||
size_t mv_bits[2];
|
||||
/*The mode scheme chooser for estimating mode coding costs.*/
|
||||
oc_mode_scheme_chooser chooser;
|
||||
/*Temporary encoder state for the analysis pipeline.*/
|
||||
oc_enc_pipeline_state pipe;
|
||||
/*The number of vertical super blocks in an MCU.*/
|
||||
int mcu_nvsbs;
|
||||
/*The SSD error for skipping each fragment in the current MCU.*/
|
||||
unsigned *mcu_skip_ssd;
|
||||
/*The masking scale factors for chroma blocks in the current MCU.*/
|
||||
ogg_uint16_t *mcu_rd_scale;
|
||||
ogg_uint16_t *mcu_rd_iscale;
|
||||
/*The DCT token lists for each coefficient and each plane.*/
|
||||
unsigned char **dct_tokens[3];
|
||||
/*The extra bits associated with each DCT token.*/
|
||||
ogg_uint16_t **extra_bits[3];
|
||||
/*The number of DCT tokens for each coefficient for each plane.*/
|
||||
ptrdiff_t ndct_tokens[3][64];
|
||||
/*Pending EOB runs for each coefficient for each plane.*/
|
||||
ogg_uint16_t eob_run[3][64];
|
||||
/*The offset of the first DCT token for each coefficient for each plane.*/
|
||||
unsigned char dct_token_offs[3][64];
|
||||
/*The last DC coefficient for each plane and reference frame.*/
|
||||
int dc_pred_last[3][4];
|
||||
#if defined(OC_COLLECT_METRICS)
|
||||
/*Fragment SAD statistics for MB mode estimation metrics.*/
|
||||
unsigned *frag_sad;
|
||||
/*Fragment SATD statistics for MB mode estimation metrics.*/
|
||||
unsigned *frag_satd;
|
||||
/*Fragment SSD statistics for MB mode estimation metrics.*/
|
||||
unsigned *frag_ssd;
|
||||
#endif
|
||||
/*The R-D optimization parameter.*/
|
||||
int lambda;
|
||||
/*The average block "activity" of the previous frame.*/
|
||||
unsigned activity_avg;
|
||||
/*The average MB luma of the previous frame.*/
|
||||
unsigned luma_avg;
|
||||
/*The huffman tables in use.*/
|
||||
th_huff_code huff_codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||
/*The quantization parameters in use.*/
|
||||
th_quant_info qinfo;
|
||||
/*The original DC coefficients saved off from the dequatization tables.*/
|
||||
ogg_uint16_t dequant_dc[64][3][2];
|
||||
/*Condensed dequantization tables.*/
|
||||
const ogg_uint16_t *dequant[3][3][2];
|
||||
/*Condensed quantization tables.*/
|
||||
void *enquant[3][3][2];
|
||||
/*The full set of quantization tables.*/
|
||||
void *enquant_tables[64][3][2];
|
||||
/*Storage for the quantization tables.*/
|
||||
unsigned char *enquant_table_data;
|
||||
/*An "average" quantizer for each frame type (INTRA or INTER) and qi value.
|
||||
This is used to parameterize the rate control decisions.
|
||||
They are kept in the log domain to simplify later processing.
|
||||
These are DCT domain quantizers, and so are scaled by an additional factor
|
||||
of 4 from the pixel domain.*/
|
||||
ogg_int64_t log_qavg[2][64];
|
||||
/*The "average" quantizer futher partitioned by color plane.
|
||||
This is used to parameterize mode decision.
|
||||
These are DCT domain quantizers, and so are scaled by an additional factor
|
||||
of 4 from the pixel domain.*/
|
||||
ogg_int16_t log_plq[64][3][2];
|
||||
/*The R-D scale factors to apply to chroma blocks for a given frame type
|
||||
(INTRA or INTER) and qi value.
|
||||
The first is the "D" modifier (rd_scale), while the second is the "lambda"
|
||||
modifier (rd_iscale).*/
|
||||
ogg_uint16_t chroma_rd_scale[2][64][2];
|
||||
/*The interpolated mode decision R-D lookup tables for the current
|
||||
quantizers, color plane, and quantization type.*/
|
||||
oc_mode_rd mode_rd[3][3][2][OC_COMP_BINS];
|
||||
/*The buffer state used to drive rate control.*/
|
||||
oc_rc_state rc;
|
||||
# if defined(OC_ENC_USE_VTABLE)
|
||||
/*Table for encoder acceleration functions.*/
|
||||
oc_enc_opt_vtable opt_vtable;
|
||||
# endif
|
||||
/*Table for encoder data used by accelerated functions.*/
|
||||
oc_enc_opt_data opt_data;
|
||||
};
|
||||
|
||||
|
||||
void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode);
|
||||
int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode);
|
||||
|
||||
|
||||
|
||||
/*Perform fullpel motion search for a single MB against both reference frames.*/
|
||||
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi);
|
||||
/*Refine a MB MV for one frame.*/
|
||||
void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame);
|
||||
/*Refine the block MVs.*/
|
||||
void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi);
|
||||
|
||||
|
||||
|
||||
/*Used to rollback a tokenlog transaction when we retroactively decide to skip
|
||||
a fragment.
|
||||
A checkpoint is taken right before each token is added.*/
|
||||
struct oc_token_checkpoint{
|
||||
/*The color plane the token was added to.*/
|
||||
unsigned char pli;
|
||||
/*The zig-zag index the token was added to.*/
|
||||
unsigned char zzi;
|
||||
/*The outstanding EOB run count before the token was added.*/
|
||||
ogg_uint16_t eob_run;
|
||||
/*The token count before the token was added.*/
|
||||
ptrdiff_t ndct_tokens;
|
||||
};
|
||||
|
||||
|
||||
|
||||
void oc_enc_tokenize_start(oc_enc_ctx *_enc);
|
||||
int oc_enc_tokenize_ac(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
|
||||
ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
|
||||
const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
|
||||
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
|
||||
int oc_enc_tokenize_ac_fast(oc_enc_ctx *_enc,int _pli,ptrdiff_t _fragi,
|
||||
ogg_int16_t *_qdct_out,const ogg_int16_t *_qdct_in,
|
||||
const ogg_uint16_t *_dequant,const ogg_int16_t *_dct,
|
||||
int _zzi,oc_token_checkpoint **_stack,int _lambda,int _acmin);
|
||||
void oc_enc_tokenlog_rollback(oc_enc_ctx *_enc,
|
||||
const oc_token_checkpoint *_stack,int _n);
|
||||
void oc_enc_pred_dc_frag_rows(oc_enc_ctx *_enc,
|
||||
int _pli,int _fragy0,int _frag_yend);
|
||||
void oc_enc_tokenize_dc_frag_list(oc_enc_ctx *_enc,int _pli,
|
||||
const ptrdiff_t *_coded_fragis,ptrdiff_t _ncoded_fragis,
|
||||
int _prev_ndct_tokens1,int _prev_eob_run1);
|
||||
void oc_enc_tokenize_finish(oc_enc_ctx *_enc);
|
||||
|
||||
|
||||
|
||||
/*Utility routine to encode one of the header packets.*/
|
||||
int oc_state_flushheader(oc_theora_state *_state,int *_packet_state,
|
||||
oggpack_buffer *_opb,const th_quant_info *_qinfo,
|
||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS],
|
||||
const char *_vendor,th_comment *_tc,ogg_packet *_op);
|
||||
|
||||
|
||||
|
||||
/*Default pure-C implementations of encoder-specific accelerated functions.*/
|
||||
void oc_enc_accel_init_c(oc_enc_ctx *_enc);
|
||||
|
||||
void oc_enc_frag_sub_c(ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,const unsigned char *_ref,int _ystride);
|
||||
void oc_enc_frag_sub_128_c(ogg_int16_t _diff[64],
|
||||
const unsigned char *_src,int _ystride);
|
||||
unsigned oc_enc_frag_sad_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_sad_thresh_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_sad2_thresh_c(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_intra_sad_c(const unsigned char *_src, int _ystride);
|
||||
unsigned oc_enc_frag_satd_c(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_satd2_c(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||
unsigned oc_enc_frag_intra_satd_c(int *_dc,
|
||||
const unsigned char *_src,int _ystride);
|
||||
unsigned oc_enc_frag_ssd_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_border_ssd_c(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
|
||||
void oc_enc_frag_copy2_c(unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
void oc_enc_enquant_table_init_c(void *_enquant,
|
||||
const ogg_uint16_t _dequant[64]);
|
||||
void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis);
|
||||
int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
const ogg_uint16_t _dequant[64],const void *_enquant);
|
||||
void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
|
||||
#endif
|
||||
1839
engine/thirdparty/libtheora/encode.c
vendored
Normal file
1839
engine/thirdparty/libtheora/encode.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
70
engine/thirdparty/libtheora/encoder_disabled.c
vendored
Normal file
70
engine/thirdparty/libtheora/encoder_disabled.c
vendored
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include "apiwrapper.h"
|
||||
#include "encint.h"
|
||||
|
||||
const th_quant_info TH_VP31_QUANT_INFO = {};
|
||||
const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||
|
||||
/*Encoder-disabled stub: never allocates an encoder.
  _info: Ignored.
  Return: Always NULL.*/
th_enc_ctx *th_encode_alloc(const th_info *_info){
  (void)_info;
  return NULL;
}
|
||||
|
||||
/*Encoder-disabled stub: does nothing.
  _enc: Ignored.*/
void th_encode_free(th_enc_ctx *_enc){
  (void)_enc;
}
|
||||
|
||||
|
||||
/*Encoder-disabled stub: ignores every control request.
  Return: Always OC_DISABLED.*/
int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz){
  (void)_enc;
  (void)_req;
  (void)_buf;
  (void)_buf_sz;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: produces no header packet.
  Return: Always OC_DISABLED.*/
int th_encode_flushheader(th_enc_ctx *_enc,th_comment *_tc,ogg_packet *_op){
  (void)_enc;
  (void)_tc;
  (void)_op;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: accepts no image data.
  Return: Always OC_DISABLED.*/
int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _img){
  (void)_enc;
  (void)_img;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: produces no data packet.
  Return: Always OC_DISABLED.*/
int th_encode_packetout(th_enc_ctx *_enc,int _last_p,ogg_packet *_op){
  (void)_enc;
  (void)_last_p;
  (void)_op;
  return OC_DISABLED;
}
|
||||
|
||||
|
||||
|
||||
/*Encoder-disabled stub: initializes nothing.
  Return: Always OC_DISABLED.*/
int theora_encode_init(theora_state *_te,theora_info *_ci){
  (void)_te;
  (void)_ci;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: accepts no image data.
  Return: Always OC_DISABLED.*/
int theora_encode_YUVin(theora_state *_te,yuv_buffer *_yuv){
  (void)_te;
  (void)_yuv;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: produces no data packet.
  Return: Always OC_DISABLED.*/
int theora_encode_packetout(theora_state *_te,int _last_p,ogg_packet *_op){
  (void)_te;
  (void)_last_p;
  (void)_op;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: produces no header packet.
  Return: Always OC_DISABLED.*/
int theora_encode_header(theora_state *_te,ogg_packet *_op){
  (void)_te;
  (void)_op;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: produces no comment packet.
  Return: Always OC_DISABLED.*/
int theora_encode_comment(theora_comment *_tc,ogg_packet *_op){
  (void)_tc;
  (void)_op;
  return OC_DISABLED;
}
|
||||
|
||||
/*Encoder-disabled stub: produces no tables packet.
  Return: Always OC_DISABLED.*/
int theora_encode_tables(theora_state *_te,ogg_packet *_op){
  (void)_te;
  (void)_op;
  return OC_DISABLED;
}
|
||||
370
engine/thirdparty/libtheora/enquant.c
vendored
Normal file
370
engine/thirdparty/libtheora/enquant.c
vendored
Normal file
|
|
@ -0,0 +1,370 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include "encint.h"
|
||||
|
||||
|
||||
|
||||
/*Copies the quantization parameters in _src into _dst.
  The whole structure is copied first, then the qi_ranges entries are rebuilt
   so that _dst owns its own sizes and base_matrices arrays.
  Where _src shares one of those arrays with the immediately preceding
   (frame type,plane) entry, or with the same plane of frame type 0, _dst
   shares its copy in the same way instead of allocating a new one.
  Return: 0 on success, or TH_EFAULT if an allocation fails.
          On failure _dst is left partially constructed; the caller is
           responsible for cleaning it up.*/
int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src){
  int i;
  memcpy(_dst,_src,sizeof(*_dst));
  memset(_dst->qi_ranges,0,sizeof(_dst->qi_ranges));
  for(i=0;i<6;i++){
    const th_quant_ranges *src_cur;
    th_quant_ranges       *dst_cur;
    int                    nranges;
    int                    qti;
    int                    pli;
    int                    qtj;
    int                    plj;
    int                    pdup;
    int                    qdup;
    qti=i/3;
    pli=i%3;
    /*(qtj,plj) names the entry just before (qti,pli).
      It is only used when i>0.*/
    qtj=(i-1)/3;
    plj=(i-1)%3;
    src_cur=_src->qi_ranges[qti]+pli;
    dst_cur=_dst->qi_ranges[qti]+pli;
    nranges=src_cur->nranges;
    /*Check for those duplicates that can be cleanly handled by
       oc_quant_params_clear().*/
    pdup=i>0&&nranges<=_src->qi_ranges[qtj][plj].nranges;
    qdup=qti>0&&nranges<=_src->qi_ranges[0][pli].nranges;
    dst_cur->nranges=nranges;
    /*The range sizes.*/
    if(pdup&&src_cur->sizes==_src->qi_ranges[qtj][plj].sizes){
      dst_cur->sizes=_dst->qi_ranges[qtj][plj].sizes;
    }
    else if(qdup&&src_cur->sizes==_src->qi_ranges[0][pli].sizes){
      dst_cur->sizes=_dst->qi_ranges[0][pli].sizes;
    }
    else{
      int *new_sizes;
      new_sizes=(int *)_ogg_malloc(nranges*sizeof(*new_sizes));
      /*Note: The caller is responsible for cleaning up any partially
         constructed qinfo.*/
      if(new_sizes==NULL)return TH_EFAULT;
      memcpy(new_sizes,src_cur->sizes,nranges*sizeof(*new_sizes));
      dst_cur->sizes=new_sizes;
    }
    /*The base matrices (one more than the number of ranges).*/
    if(pdup&&src_cur->base_matrices==_src->qi_ranges[qtj][plj].base_matrices){
      dst_cur->base_matrices=_dst->qi_ranges[qtj][plj].base_matrices;
    }
    else if(qdup&&
     src_cur->base_matrices==_src->qi_ranges[0][pli].base_matrices){
      dst_cur->base_matrices=_dst->qi_ranges[0][pli].base_matrices;
    }
    else{
      th_quant_base *new_mats;
      new_mats=(th_quant_base *)_ogg_malloc((nranges+1)*sizeof(*new_mats));
      /*Note: The caller is responsible for cleaning up any partially
         constructed qinfo.*/
      if(new_mats==NULL)return TH_EFAULT;
      memcpy(new_mats,src_cur->base_matrices,(nranges+1)*sizeof(*new_mats));
      dst_cur->base_matrices=(const th_quant_base *)new_mats;
    }
  }
  return 0;
}
|
||||
|
||||
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo){
|
||||
const th_quant_ranges *qranges;
|
||||
const th_quant_base *base_mats[2*3*64];
|
||||
int indices[2][3][64];
|
||||
int nbase_mats;
|
||||
int nbits;
|
||||
int ci;
|
||||
int qi;
|
||||
int qri;
|
||||
int qti;
|
||||
int pli;
|
||||
int qtj;
|
||||
int plj;
|
||||
int bmi;
|
||||
int i;
|
||||
i=_qinfo->loop_filter_limits[0];
|
||||
for(qi=1;qi<64;qi++)i=OC_MAXI(i,_qinfo->loop_filter_limits[qi]);
|
||||
nbits=OC_ILOG_32(i);
|
||||
oggpackB_write(_opb,nbits,3);
|
||||
for(qi=0;qi<64;qi++){
|
||||
oggpackB_write(_opb,_qinfo->loop_filter_limits[qi],nbits);
|
||||
}
|
||||
/*580 bits for VP3.*/
|
||||
i=1;
|
||||
for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->ac_scale[qi],i);
|
||||
nbits=OC_ILOGNZ_32(i);
|
||||
oggpackB_write(_opb,nbits-1,4);
|
||||
for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->ac_scale[qi],nbits);
|
||||
/*516 bits for VP3.*/
|
||||
i=1;
|
||||
for(qi=0;qi<64;qi++)i=OC_MAXI(_qinfo->dc_scale[qi],i);
|
||||
nbits=OC_ILOGNZ_32(i);
|
||||
oggpackB_write(_opb,nbits-1,4);
|
||||
for(qi=0;qi<64;qi++)oggpackB_write(_opb,_qinfo->dc_scale[qi],nbits);
|
||||
/*Consolidate any duplicate base matrices.*/
|
||||
nbase_mats=0;
|
||||
for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
|
||||
qranges=_qinfo->qi_ranges[qti]+pli;
|
||||
for(qri=0;qri<=qranges->nranges;qri++){
|
||||
for(bmi=0;;bmi++){
|
||||
if(bmi>=nbase_mats){
|
||||
base_mats[bmi]=qranges->base_matrices+qri;
|
||||
indices[qti][pli][qri]=nbase_mats++;
|
||||
break;
|
||||
}
|
||||
else if(memcmp(base_mats[bmi][0],qranges->base_matrices[qri],
|
||||
sizeof(base_mats[bmi][0]))==0){
|
||||
indices[qti][pli][qri]=bmi;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
/*Write out the list of unique base matrices.
|
||||
1545 bits for VP3 matrices.*/
|
||||
oggpackB_write(_opb,nbase_mats-1,9);
|
||||
for(bmi=0;bmi<nbase_mats;bmi++){
|
||||
for(ci=0;ci<64;ci++)oggpackB_write(_opb,base_mats[bmi][0][ci],8);
|
||||
}
|
||||
/*Now store quant ranges and their associated indices into the base matrix
|
||||
list.
|
||||
46 bits for VP3 matrices.*/
|
||||
nbits=OC_ILOG_32(nbase_mats-1);
|
||||
for(i=0;i<6;i++){
|
||||
qti=i/3;
|
||||
pli=i%3;
|
||||
qranges=_qinfo->qi_ranges[qti]+pli;
|
||||
if(i>0){
|
||||
if(qti>0){
|
||||
if(qranges->nranges==_qinfo->qi_ranges[qti-1][pli].nranges&&
|
||||
memcmp(qranges->sizes,_qinfo->qi_ranges[qti-1][pli].sizes,
|
||||
qranges->nranges*sizeof(qranges->sizes[0]))==0&&
|
||||
memcmp(indices[qti][pli],indices[qti-1][pli],
|
||||
(qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
|
||||
oggpackB_write(_opb,1,2);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
qtj=(i-1)/3;
|
||||
plj=(i-1)%3;
|
||||
if(qranges->nranges==_qinfo->qi_ranges[qtj][plj].nranges&&
|
||||
memcmp(qranges->sizes,_qinfo->qi_ranges[qtj][plj].sizes,
|
||||
qranges->nranges*sizeof(qranges->sizes[0]))==0&&
|
||||
memcmp(indices[qti][pli],indices[qtj][plj],
|
||||
(qranges->nranges+1)*sizeof(indices[qti][pli][0]))==0){
|
||||
oggpackB_write(_opb,0,1+(qti>0));
|
||||
continue;
|
||||
}
|
||||
oggpackB_write(_opb,1,1);
|
||||
}
|
||||
oggpackB_write(_opb,indices[qti][pli][0],nbits);
|
||||
for(qi=qri=0;qi<63;qri++){
|
||||
oggpackB_write(_opb,qranges->sizes[qri]-1,OC_ILOG_32(62-qi));
|
||||
qi+=qranges->sizes[qri];
|
||||
oggpackB_write(_opb,indices[qti][pli][qri+1],nbits);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*Derives the multiplier/shift pair stored in an oc_iquant from a divisor.
  The divisor is doubled (in 16 bits) before the reciprocal is computed.
  _this: The oc_iquant to fill in.
  _d:    The divisor to build the reciprocal for.*/
void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d){
  ogg_uint16_t doubled;
  ogg_uint32_t recip;
  int          shift;
  doubled=(ogg_uint16_t)(_d<<1);
  shift=OC_ILOGNZ_32(doubled)-1;
  recip=1+((ogg_uint32_t)1<<(16+shift))/doubled;
  /*Only the portion of the reciprocal above 2**16 is kept in m.*/
  _this->m=(ogg_int16_t)(recip-0x10000);
  _this->l=shift;
}
|
||||
|
||||
void oc_enc_enquant_table_init_c(void *_enquant,
|
||||
const ogg_uint16_t _dequant[64]){
|
||||
oc_iquant *enquant;
|
||||
int zzi;
|
||||
/*In the original VP3.2 code, the rounding offset and the size of the
|
||||
dead zone around 0 were controlled by a "sharpness" parameter.
|
||||
We now R-D optimize the tokens for each block after quantization,
|
||||
so the rounding offset should always be 1/2, and an explicit dead
|
||||
zone is unnecessary.
|
||||
Hence, all of that VP3.2 code is gone from here, and the remaining
|
||||
floating point code has been implemented as equivalent integer
|
||||
code with exact precision.*/
|
||||
enquant=(oc_iquant *)_enquant;
|
||||
for(zzi=0;zzi<64;zzi++)oc_iquant_init(enquant+zzi,_dequant[zzi]);
|
||||
}
|
||||
|
||||
void oc_enc_enquant_table_fixup_c(void *_enquant[3][3][2],int _nqis){
|
||||
int pli;
|
||||
int qii;
|
||||
int qti;
|
||||
for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
|
||||
*((oc_iquant *)_enquant[pli][qii][qti])=
|
||||
*((oc_iquant *)_enquant[pli][0][qti]);
|
||||
}
|
||||
}
|
||||
|
||||
int oc_enc_quantize_c(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
const ogg_uint16_t _dequant[64],const void *_enquant){
|
||||
const oc_iquant *enquant;
|
||||
int nonzero;
|
||||
int zzi;
|
||||
int val;
|
||||
int d;
|
||||
int s;
|
||||
enquant=(const oc_iquant *)_enquant;
|
||||
nonzero=0;
|
||||
for(zzi=0;zzi<64;zzi++){
|
||||
val=_dct[zzi];
|
||||
d=_dequant[zzi];
|
||||
val=val<<1;
|
||||
if(abs(val)>=d){
|
||||
s=OC_SIGNMASK(val);
|
||||
/*The bias added here rounds ties away from zero, since token
|
||||
optimization can only decrease the magnitude of the quantized
|
||||
value.*/
|
||||
val+=d+s^s;
|
||||
/*Note the arithmetic right shift is not guaranteed by ANSI C.
|
||||
Hopefully no one still uses ones-complement architectures.*/
|
||||
val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
|
||||
_qdct[zzi]=(ogg_int16_t)val;
|
||||
nonzero=zzi;
|
||||
}
|
||||
else _qdct[zzi]=0;
|
||||
}
|
||||
return nonzero;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*This table gives the square root of the fraction of the squared magnitude of
|
||||
each DCT coefficient relative to the total, scaled by 2**16, for both INTRA
|
||||
and INTER modes.
|
||||
These values were measured after motion-compensated prediction, before
|
||||
quantization, over a large set of test video (from QCIF to 1080p) encoded at
|
||||
all possible rates.
|
||||
The DC coefficient takes into account the DPCM prediction (using the
|
||||
quantized values from neighboring blocks, as the encoder does, but still
|
||||
before quantization of the coefficient in the current block).
|
||||
The results differ significantly from the expected variance (e.g., using an
|
||||
AR(1) model of the signal with rho=0.95, as is frequently done to compute
|
||||
the coding gain of the DCT).
|
||||
We use them to estimate an "average" quantizer for a given quantizer matrix,
|
||||
as this is used to parameterize a number of the rate control decisions.
|
||||
These values are themselves probably quantizer-matrix dependent, since the
|
||||
shape of the matrix affects the noise distribution in the reference frames,
|
||||
but they should at least give us _some_ amount of adaptivity to different
|
||||
matrices, as opposed to hard-coding a table of average Q values for the
|
||||
current set.
|
||||
The main features they capture are that a) only a few of the quantizers in
|
||||
the upper-left corner contribute anything significant at all (though INTER
|
||||
mode is significantly flatter) and b) the DPCM prediction of the DC
|
||||
coefficient gives a very minor improvement in the INTRA case and a quite
|
||||
significant one in the INTER case (over the expected variance).*/
|
||||
static const ogg_uint16_t OC_RPSD[2][64]={
|
||||
{
|
||||
52725,17370,10399, 6867, 5115, 3798, 2942, 2076,
|
||||
17370, 9900, 6948, 4994, 3836, 2869, 2229, 1619,
|
||||
10399, 6948, 5516, 4202, 3376, 2573, 2015, 1461,
|
||||
6867, 4994, 4202, 3377, 2800, 2164, 1718, 1243,
|
||||
5115, 3836, 3376, 2800, 2391, 1884, 1530, 1091,
|
||||
3798, 2869, 2573, 2164, 1884, 1495, 1212, 873,
|
||||
2942, 2229, 2015, 1718, 1530, 1212, 1001, 704,
|
||||
2076, 1619, 1461, 1243, 1091, 873, 704, 474
|
||||
},
|
||||
{
|
||||
23411,15604,13529,11601,10683, 8958, 7840, 6142,
|
||||
15604,11901,10718, 9108, 8290, 6961, 6023, 4487,
|
||||
13529,10718, 9961, 8527, 7945, 6689, 5742, 4333,
|
||||
11601, 9108, 8527, 7414, 7084, 5923, 5175, 3743,
|
||||
10683, 8290, 7945, 7084, 6771, 5754, 4793, 3504,
|
||||
8958, 6961, 6689, 5923, 5754, 4679, 3936, 2989,
|
||||
7840, 6023, 5742, 5175, 4793, 3936, 3522, 2558,
|
||||
6142, 4487, 4333, 3743, 3504, 2989, 2558, 1829
|
||||
}
|
||||
};
|
||||
|
||||
/*The fraction of the squared magnitude of the residuals in each color channel
|
||||
relative to the total, scaled by 2**16, for each pixel format.
|
||||
These values were measured after motion-compensated prediction, before
|
||||
quantization, over a large set of test video encoded at all possible rates.
|
||||
TODO: These values are only from INTER frames; they should be re-measured for
|
||||
INTRA frames.*/
|
||||
static const ogg_uint16_t OC_PCD[4][3]={
|
||||
{59926, 3038, 2572},
|
||||
{55201, 5597, 4738},
|
||||
{55201, 5597, 4738},
|
||||
{47682, 9669, 8185}
|
||||
};
|
||||
|
||||
|
||||
/*Compute "average" quantizers for each qi level to use for rate control.
|
||||
We do one for each color channel, as well as an average across color
|
||||
channels, separately for INTER and INTRA, since their behavior is very
|
||||
different.
|
||||
The basic approach is to compute a harmonic average of the squared quantizer,
|
||||
weighted by the expected squared magnitude of the DCT coefficients.
|
||||
Under the (not quite true) assumption that DCT coefficients are
|
||||
Laplacian-distributed, this preserves the product Q*lambda, where
|
||||
lambda=sqrt(2/sigma**2) is the Laplacian distribution parameter (not to be
|
||||
confused with the lambda used in R-D optimization throughout most of the
|
||||
rest of the code), when the distributions from multiple coefficients are
|
||||
pooled.
|
||||
The value Q*lambda completely determines the entropy of coefficients drawn
|
||||
from a Laplacian distribution, and thus the expected bitrate.*/
|
||||
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
||||
ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _chroma_rd_scale[2][64][2],
|
||||
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt){
|
||||
int qi;
|
||||
int pli;
|
||||
int qti;
|
||||
int ci;
|
||||
for(qti=0;qti<2;qti++)for(qi=0;qi<64;qi++){
|
||||
ogg_int64_t q2;
|
||||
ogg_uint32_t qp[3];
|
||||
ogg_uint32_t cqp;
|
||||
ogg_uint32_t d;
|
||||
q2=0;
|
||||
for(pli=0;pli<3;pli++){
|
||||
qp[pli]=0;
|
||||
for(ci=0;ci<64;ci++){
|
||||
unsigned rq;
|
||||
unsigned qd;
|
||||
qd=_dequant[qi][pli][qti][OC_IZIG_ZAG[ci]];
|
||||
rq=(OC_RPSD[qti][ci]+(qd>>1))/qd;
|
||||
qp[pli]+=rq*(ogg_uint32_t)rq;
|
||||
}
|
||||
q2+=OC_PCD[_pixel_fmt][pli]*(ogg_int64_t)qp[pli];
|
||||
/*plq=1.0/sqrt(qp)*/
|
||||
_log_plq[qi][pli][qti]=
|
||||
(ogg_int16_t)(OC_Q10(32)-oc_blog32_q10(qp[pli])>>1);
|
||||
}
|
||||
d=OC_PCD[_pixel_fmt][1]+OC_PCD[_pixel_fmt][2];
|
||||
cqp=(ogg_uint32_t)((OC_PCD[_pixel_fmt][1]*(ogg_int64_t)qp[1]+
|
||||
OC_PCD[_pixel_fmt][2]*(ogg_int64_t)qp[2]+(d>>1))/d);
|
||||
/*chroma_rd_scale=clamp(0.25,cqp/qp[0],4)*/
|
||||
d=OC_MAXI(qp[0]+(1<<OC_RD_SCALE_BITS-1)>>OC_RD_SCALE_BITS,1);
|
||||
d=OC_CLAMPI(1<<OC_RD_SCALE_BITS-2,(cqp+(d>>1))/d,4<<OC_RD_SCALE_BITS);
|
||||
_chroma_rd_scale[qti][qi][0]=(ogg_int16_t)d;
|
||||
/*chroma_rd_iscale=clamp(0.25,qp[0]/cqp,4)*/
|
||||
d=OC_MAXI(OC_RD_ISCALE(cqp,1),1);
|
||||
d=OC_CLAMPI(1<<OC_RD_ISCALE_BITS-2,(qp[0]+(d>>1))/d,4<<OC_RD_ISCALE_BITS);
|
||||
_chroma_rd_scale[qti][qi][1]=(ogg_int16_t)d;
|
||||
/*qavg=1.0/sqrt(q2).*/
|
||||
_log_qavg[qti][qi]=OC_Q57(48)-oc_blog64(q2)>>1;
|
||||
}
|
||||
}
|
||||
26
engine/thirdparty/libtheora/enquant.h
vendored
Normal file
26
engine/thirdparty/libtheora/enquant.h
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
#if !defined(_enquant_H)
|
||||
# define _enquant_H (1)
|
||||
# include "quant.h"
|
||||
|
||||
typedef struct oc_iquant oc_iquant;
|
||||
|
||||
#define OC_QUANT_MAX_LOG (OC_Q57(OC_STATIC_ILOG_32(OC_QUANT_MAX)-1))
|
||||
|
||||
/*Used to compute x/d via ((x*m>>16)+x>>l)+(x<0)
|
||||
(i.e., one 16x16->16 mul, 2 shifts, and 2 adds).
|
||||
This is not an approximation; for 16-bit x and d, it is exact.*/
|
||||
struct oc_iquant{
|
||||
ogg_int16_t m;
|
||||
ogg_int16_t l;
|
||||
};
|
||||
|
||||
|
||||
|
||||
int oc_quant_params_clone(th_quant_info *_dst,const th_quant_info *_src);
|
||||
void oc_quant_params_pack(oggpack_buffer *_opb,const th_quant_info *_qinfo);
|
||||
void oc_iquant_init(oc_iquant *_this,ogg_uint16_t _d);
|
||||
void oc_enquant_qavg_init(ogg_int64_t _log_qavg[2][64],
|
||||
ogg_int16_t _log_plq[64][3][2],ogg_uint16_t _pl_rd_scale[2][64][2],
|
||||
ogg_uint16_t *_dequant[64][3][2],int _pixel_fmt);
|
||||
|
||||
#endif
|
||||
417
engine/thirdparty/libtheora/fdct.c
vendored
Normal file
417
engine/thirdparty/libtheora/fdct.c
vendored
Normal file
|
|
@ -0,0 +1,417 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include "encint.h"
|
||||
#include "dct.h"
|
||||
|
||||
|
||||
|
||||
/*Performs a forward 8 point Type-II DCT transform.
|
||||
The output is scaled by a factor of 2 from the orthonormal version of the
|
||||
transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed the first 8 entries (e.g., in a row of an 8x8 block).
|
||||
_x: The input coefficients.
|
||||
Every 8th entry is used (e.g., from a column of an 8x8 block).*/
|
||||
static void oc_fdct8(ogg_int16_t _y[8],const ogg_int16_t *_x){
|
||||
int t0;
|
||||
int t1;
|
||||
int t2;
|
||||
int t3;
|
||||
int t4;
|
||||
int t5;
|
||||
int t6;
|
||||
int t7;
|
||||
int r;
|
||||
int s;
|
||||
int u;
|
||||
int v;
|
||||
/*Stage 1:*/
|
||||
/*0-7 butterfly.*/
|
||||
t0=_x[0<<3]+(int)_x[7<<3];
|
||||
t7=_x[0<<3]-(int)_x[7<<3];
|
||||
/*1-6 butterfly.*/
|
||||
t1=_x[1<<3]+(int)_x[6<<3];
|
||||
t6=_x[1<<3]-(int)_x[6<<3];
|
||||
/*2-5 butterfly.*/
|
||||
t2=_x[2<<3]+(int)_x[5<<3];
|
||||
t5=_x[2<<3]-(int)_x[5<<3];
|
||||
/*3-4 butterfly.*/
|
||||
t3=_x[3<<3]+(int)_x[4<<3];
|
||||
t4=_x[3<<3]-(int)_x[4<<3];
|
||||
/*Stage 2:*/
|
||||
/*0-3 butterfly.*/
|
||||
r=t0+t3;
|
||||
t3=t0-t3;
|
||||
t0=r;
|
||||
/*1-2 butterfly.*/
|
||||
r=t1+t2;
|
||||
t2=t1-t2;
|
||||
t1=r;
|
||||
/*6-5 butterfly.*/
|
||||
r=t6+t5;
|
||||
t5=t6-t5;
|
||||
t6=r;
|
||||
/*Stages 3 and 4 are where all the approximation occurs.
|
||||
These are chosen to be as close to an exact inverse of the approximations
|
||||
made in the iDCT as possible, while still using mostly 16-bit arithmetic.
|
||||
We use some 16x16->32 signed MACs, but those still commonly execute in 1
|
||||
cycle on a 16-bit DSP.
|
||||
For example, s=(27146*t5+0x4000>>16)+t5+(t5!=0) is an exact inverse of
|
||||
t5=(OC_C4S4*s>>16).
|
||||
That is, applying the latter to the output of the former will recover t5
|
||||
exactly (over the valid input range of t5, -23171...23169).
|
||||
We increase the rounding bias to 0xB500 in this particular case so that
|
||||
errors inverting the subsequent butterfly are not one-sided (e.g., the
|
||||
mean error is very close to zero).
|
||||
The (t5!=0) term could be replaced simply by 1, but we want to send 0 to 0.
|
||||
The fDCT of an all-zeros block will still not be zero, because of the
|
||||
biases we added at the very beginning of the process, but it will be close
|
||||
enough that it is guaranteed to round to zero.*/
|
||||
/*Stage 3:*/
|
||||
/*4-5 butterfly.*/
|
||||
s=(27146*t5+0xB500>>16)+t5+(t5!=0)>>1;
|
||||
r=t4+s;
|
||||
t5=t4-s;
|
||||
t4=r;
|
||||
/*7-6 butterfly.*/
|
||||
s=(27146*t6+0xB500>>16)+t6+(t6!=0)>>1;
|
||||
r=t7+s;
|
||||
t6=t7-s;
|
||||
t7=r;
|
||||
/*Stage 4:*/
|
||||
/*0-1 butterfly.*/
|
||||
r=(27146*t0+0x4000>>16)+t0+(t0!=0);
|
||||
s=(27146*t1+0xB500>>16)+t1+(t1!=0);
|
||||
u=r+s>>1;
|
||||
v=r-u;
|
||||
_y[0]=u;
|
||||
_y[4]=v;
|
||||
/*3-2 rotation by 6pi/16*/
|
||||
u=(OC_C6S2*t2+OC_C2S6*t3+0x6CB7>>16)+(t3!=0);
|
||||
s=(OC_C6S2*u>>16)-t2;
|
||||
v=(s*21600+0x2800>>18)+s+(s!=0);
|
||||
_y[2]=u;
|
||||
_y[6]=v;
|
||||
/*6-5 rotation by 3pi/16*/
|
||||
u=(OC_C5S3*t6+OC_C3S5*t5+0x0E3D>>16)+(t5!=0);
|
||||
s=t6-(OC_C5S3*u>>16);
|
||||
v=(s*26568+0x3400>>17)+s+(s!=0);
|
||||
_y[5]=u;
|
||||
_y[3]=v;
|
||||
/*7-4 rotation by 7pi/16*/
|
||||
u=(OC_C7S1*t4+OC_C1S7*t7+0x7B1B>>16)+(t7!=0);
|
||||
s=(OC_C7S1*u>>16)-t4;
|
||||
v=(s*20539+0x3000>>20)+s+(s!=0);
|
||||
_y[1]=u;
|
||||
_y[7]=v;
|
||||
}
|
||||
|
||||
/*Performs a forward 8x8 Type-II DCT transform.
|
||||
The output is scaled by a factor of 4 relative to the orthonormal version
|
||||
of the transform.
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients. */
|
||||
void oc_enc_fdct8x8_c(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
const ogg_int16_t *in;
|
||||
ogg_int16_t *end;
|
||||
ogg_int16_t *out;
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||
we could overflow.*/
|
||||
for(i=0;i<64;i++)w[i]=_x[i]<<2;
|
||||
/*These biases correct for some systematic error that remains in the full
|
||||
fDCT->iDCT round trip.*/
|
||||
w[0]+=(w[0]!=0)+1;
|
||||
w[1]++;
|
||||
w[8]--;
|
||||
/*Transform columns of w into rows of _y.*/
|
||||
for(in=w,out=_y,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
|
||||
/*Transform columns of _y into rows of w.*/
|
||||
for(in=_y,out=w,end=out+64;out<end;in++,out+=8)oc_fdct8(out,in);
|
||||
/*Round the result back to the external working precision (which is still
|
||||
scaled by four relative to the orthogonal result).
|
||||
TODO: We should just update the external working precision.*/
|
||||
for(i=0;i<64;i++)_y[i]=w[OC_FZIG_ZAG[i]]+2>>2;
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*This does not seem to outperform simple LFE border padding before MC.
|
||||
It yields higher PSNR, but much higher bitrate usage.*/
|
||||
#if 0
|
||||
typedef struct oc_extension_info oc_extension_info;
|
||||
|
||||
|
||||
|
||||
/*Information needed to pad boundary blocks.
|
||||
We multiply each row/column by an extension matrix that fills in the padding
|
||||
values as a linear combination of the active values, so that an equivalent
|
||||
number of coefficients are forced to zero.
|
||||
This costs at most 16 multiplies, the same as a 1-D fDCT itself, and as
|
||||
little as 7 multiplies.
|
||||
We compute the extension matrices for every possible shape in advance, as
|
||||
there are only 35.
|
||||
The coefficients for all matrices are stored in a single array to take
|
||||
advantage of the overlap and repetitiveness of many of the shapes.
|
||||
A similar technique is applied to the offsets into this array.
|
||||
This reduces the required table storage by about 48%.
|
||||
See tools/extgen.c for details.
|
||||
We could conceivably do the same for all 256 possible shapes.*/
|
||||
struct oc_extension_info{
|
||||
/*The mask of the active pixels in the shape.*/
|
||||
short mask;
|
||||
/*The number of active pixels in the shape.*/
|
||||
short na;
|
||||
/*The extension matrix.
|
||||
This is (8-na)xna*/
|
||||
const ogg_int16_t *const *ext;
|
||||
/*The pixel indices: na active pixels followed by 8-na padding pixels.*/
|
||||
unsigned char pi[8];
|
||||
/*The coefficient indices: na unconstrained coefficients followed by 8-na
|
||||
coefficients to be forced to zero.*/
|
||||
unsigned char ci[8];
|
||||
};
|
||||
|
||||
|
||||
/*The number of shapes we need.*/
|
||||
#define OC_NSHAPES (35)
|
||||
|
||||
static const ogg_int16_t OC_EXT_COEFFS[229]={
|
||||
0x7FFF,0xE1F8,0x6903,0xAA79,0x5587,0x7FFF,0x1E08,0x7FFF,
|
||||
0x5587,0xAA79,0x6903,0xE1F8,0x7FFF,0x0000,0x0000,0x0000,
|
||||
0x7FFF,0x0000,0x0000,0x7FFF,0x8000,0x7FFF,0x0000,0x0000,
|
||||
0x7FFF,0xE1F8,0x1E08,0xB0A7,0xAA1D,0x337C,0x7FFF,0x4345,
|
||||
0x2267,0x4345,0x7FFF,0x337C,0xAA1D,0xB0A7,0x8A8C,0x4F59,
|
||||
0x03B4,0xE2D6,0x7FFF,0x2CF3,0x7FFF,0xE2D6,0x03B4,0x4F59,
|
||||
0x8A8C,0x1103,0x7AEF,0x5225,0xDF60,0xC288,0xDF60,0x5225,
|
||||
0x7AEF,0x1103,0x668A,0xD6EE,0x3A16,0x0E6C,0xFA07,0x0E6C,
|
||||
0x3A16,0xD6EE,0x668A,0x2A79,0x2402,0x980F,0x50F5,0x4882,
|
||||
0x50F5,0x980F,0x2402,0x2A79,0xF976,0x2768,0x5F22,0x2768,
|
||||
0xF976,0x1F91,0x76C1,0xE9AE,0x76C1,0x1F91,0x7FFF,0xD185,
|
||||
0x0FC8,0xD185,0x7FFF,0x4F59,0x4345,0xED62,0x4345,0x4F59,
|
||||
0xF574,0x5D99,0x2CF3,0x5D99,0xF574,0x5587,0x3505,0x30FC,
|
||||
0xF482,0x953C,0xEAC4,0x7FFF,0x4F04,0x7FFF,0xEAC4,0x953C,
|
||||
0xF482,0x30FC,0x4F04,0x273D,0xD8C3,0x273D,0x1E09,0x61F7,
|
||||
0x1E09,0x273D,0xD8C3,0x273D,0x4F04,0x30FC,0xA57E,0x153C,
|
||||
0x6AC4,0x3C7A,0x1E08,0x3C7A,0x6AC4,0x153C,0xA57E,0x7FFF,
|
||||
0xA57E,0x5A82,0x6AC4,0x153C,0xC386,0xE1F8,0xC386,0x153C,
|
||||
0x6AC4,0x5A82,0xD8C3,0x273D,0x7FFF,0xE1F7,0x7FFF,0x273D,
|
||||
0xD8C3,0x4F04,0x30FC,0xD8C3,0x273D,0xD8C3,0x30FC,0x4F04,
|
||||
0x1FC8,0x67AD,0x1853,0xE038,0x1853,0x67AD,0x1FC8,0x4546,
|
||||
0xE038,0x1FC8,0x3ABA,0x1FC8,0xE038,0x4546,0x3505,0x5587,
|
||||
0xF574,0xBC11,0x78F4,0x4AFB,0xE6F3,0x4E12,0x3C11,0xF8F4,
|
||||
0x4AFB,0x3C7A,0xF88B,0x3C11,0x78F4,0xCAFB,0x7FFF,0x08CC,
|
||||
0x070C,0x236D,0x5587,0x236D,0x070C,0xF88B,0x3C7A,0x4AFB,
|
||||
0xF8F4,0x3C11,0x7FFF,0x153C,0xCAFB,0x153C,0x7FFF,0x1E08,
|
||||
0xE1F8,0x7FFF,0x08CC,0x7FFF,0xCAFB,0x78F4,0x3C11,0x4E12,
|
||||
0xE6F3,0x4AFB,0x78F4,0xBC11,0xFE3D,0x7FFF,0xFE3D,0x2F3A,
|
||||
0x7FFF,0x2F3A,0x89BC,0x7FFF,0x89BC
|
||||
};
|
||||
|
||||
static const ogg_int16_t *const OC_EXT_ROWS[96]={
|
||||
OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,
|
||||
OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 0,OC_EXT_COEFFS+ 6,
|
||||
OC_EXT_COEFFS+ 27,OC_EXT_COEFFS+ 38,OC_EXT_COEFFS+ 43,OC_EXT_COEFFS+ 32,
|
||||
OC_EXT_COEFFS+ 49,OC_EXT_COEFFS+ 58,OC_EXT_COEFFS+ 67,OC_EXT_COEFFS+ 71,
|
||||
OC_EXT_COEFFS+ 62,OC_EXT_COEFFS+ 53,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,
|
||||
OC_EXT_COEFFS+ 14,OC_EXT_COEFFS+ 13,OC_EXT_COEFFS+ 76,OC_EXT_COEFFS+ 81,
|
||||
OC_EXT_COEFFS+ 86,OC_EXT_COEFFS+ 91,OC_EXT_COEFFS+ 96,OC_EXT_COEFFS+ 98,
|
||||
OC_EXT_COEFFS+ 93,OC_EXT_COEFFS+ 88,OC_EXT_COEFFS+ 83,OC_EXT_COEFFS+ 78,
|
||||
OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 12,
|
||||
OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,
|
||||
OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 103,OC_EXT_COEFFS+ 108,
|
||||
OC_EXT_COEFFS+ 126,OC_EXT_COEFFS+ 16,OC_EXT_COEFFS+ 137,OC_EXT_COEFFS+ 141,
|
||||
OC_EXT_COEFFS+ 20,OC_EXT_COEFFS+ 130,OC_EXT_COEFFS+ 113,OC_EXT_COEFFS+ 116,
|
||||
OC_EXT_COEFFS+ 146,OC_EXT_COEFFS+ 153,OC_EXT_COEFFS+ 160,OC_EXT_COEFFS+ 167,
|
||||
OC_EXT_COEFFS+ 170,OC_EXT_COEFFS+ 163,OC_EXT_COEFFS+ 156,OC_EXT_COEFFS+ 149,
|
||||
OC_EXT_COEFFS+ 119,OC_EXT_COEFFS+ 122,OC_EXT_COEFFS+ 174,OC_EXT_COEFFS+ 177,
|
||||
OC_EXT_COEFFS+ 182,OC_EXT_COEFFS+ 187,OC_EXT_COEFFS+ 192,OC_EXT_COEFFS+ 197,
|
||||
OC_EXT_COEFFS+ 202,OC_EXT_COEFFS+ 207,OC_EXT_COEFFS+ 210,OC_EXT_COEFFS+ 215,
|
||||
OC_EXT_COEFFS+ 179,OC_EXT_COEFFS+ 189,OC_EXT_COEFFS+ 24,OC_EXT_COEFFS+ 204,
|
||||
OC_EXT_COEFFS+ 184,OC_EXT_COEFFS+ 194,OC_EXT_COEFFS+ 212,OC_EXT_COEFFS+ 199,
|
||||
OC_EXT_COEFFS+ 217,OC_EXT_COEFFS+ 100,OC_EXT_COEFFS+ 134,OC_EXT_COEFFS+ 135,
|
||||
OC_EXT_COEFFS+ 135,OC_EXT_COEFFS+ 12,OC_EXT_COEFFS+ 15,OC_EXT_COEFFS+ 134,
|
||||
OC_EXT_COEFFS+ 134,OC_EXT_COEFFS+ 135,OC_EXT_COEFFS+ 220,OC_EXT_COEFFS+ 223,
|
||||
OC_EXT_COEFFS+ 226,OC_EXT_COEFFS+ 227,OC_EXT_COEFFS+ 224,OC_EXT_COEFFS+ 221
|
||||
};
|
||||
|
||||
static const oc_extension_info OC_EXTENSION_INFO[OC_NSHAPES]={
|
||||
{0x7F,7,OC_EXT_ROWS+ 0,{0,1,2,3,4,5,6,7},{0,1,2,4,5,6,7,3}},
|
||||
{0xFE,7,OC_EXT_ROWS+ 7,{1,2,3,4,5,6,7,0},{0,1,2,4,5,6,7,3}},
|
||||
{0x3F,6,OC_EXT_ROWS+ 8,{0,1,2,3,4,5,7,6},{0,1,3,4,6,7,5,2}},
|
||||
{0xFC,6,OC_EXT_ROWS+ 10,{2,3,4,5,6,7,1,0},{0,1,3,4,6,7,5,2}},
|
||||
{0x1F,5,OC_EXT_ROWS+ 12,{0,1,2,3,4,7,6,5},{0,2,3,5,7,6,4,1}},
|
||||
{0xF8,5,OC_EXT_ROWS+ 15,{3,4,5,6,7,2,1,0},{0,2,3,5,7,6,4,1}},
|
||||
{0x0F,4,OC_EXT_ROWS+ 18,{0,1,2,3,7,6,5,4},{0,2,4,6,7,5,3,1}},
|
||||
{0xF0,4,OC_EXT_ROWS+ 18,{4,5,6,7,3,2,1,0},{0,2,4,6,7,5,3,1}},
|
||||
{0x07,3,OC_EXT_ROWS+ 22,{0,1,2,7,6,5,4,3},{0,3,6,7,5,4,2,1}},
|
||||
{0xE0,3,OC_EXT_ROWS+ 27,{5,6,7,4,3,2,1,0},{0,3,6,7,5,4,2,1}},
|
||||
{0x03,2,OC_EXT_ROWS+ 32,{0,1,7,6,5,4,3,2},{0,4,7,6,5,3,2,1}},
|
||||
{0xC0,2,OC_EXT_ROWS+ 32,{6,7,5,4,3,2,1,0},{0,4,7,6,5,3,2,1}},
|
||||
{0x01,1,OC_EXT_ROWS+ 0,{0,7,6,5,4,3,2,1},{0,7,6,5,4,3,2,1}},
|
||||
{0x80,1,OC_EXT_ROWS+ 0,{7,6,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
|
||||
{0x7E,6,OC_EXT_ROWS+ 42,{1,2,3,4,5,6,7,0},{0,1,2,5,6,7,4,3}},
|
||||
{0x7C,5,OC_EXT_ROWS+ 44,{2,3,4,5,6,7,1,0},{0,1,4,5,7,6,3,2}},
|
||||
{0x3E,5,OC_EXT_ROWS+ 47,{1,2,3,4,5,7,6,0},{0,1,4,5,7,6,3,2}},
|
||||
{0x78,4,OC_EXT_ROWS+ 50,{3,4,5,6,7,2,1,0},{0,4,5,7,6,3,2,1}},
|
||||
{0x3C,4,OC_EXT_ROWS+ 54,{2,3,4,5,7,6,1,0},{0,3,4,7,6,5,2,1}},
|
||||
{0x1E,4,OC_EXT_ROWS+ 58,{1,2,3,4,7,6,5,0},{0,4,5,7,6,3,2,1}},
|
||||
{0x70,3,OC_EXT_ROWS+ 62,{4,5,6,7,3,2,1,0},{0,5,7,6,4,3,2,1}},
|
||||
{0x38,3,OC_EXT_ROWS+ 67,{3,4,5,7,6,2,1,0},{0,5,6,7,4,3,2,1}},
|
||||
{0x1C,3,OC_EXT_ROWS+ 72,{2,3,4,7,6,5,1,0},{0,5,6,7,4,3,2,1}},
|
||||
{0x0E,3,OC_EXT_ROWS+ 77,{1,2,3,7,6,5,4,0},{0,5,7,6,4,3,2,1}},
|
||||
{0x60,2,OC_EXT_ROWS+ 82,{5,6,7,4,3,2,1,0},{0,2,7,6,5,4,3,1}},
|
||||
{0x30,2,OC_EXT_ROWS+ 36,{4,5,7,6,3,2,1,0},{0,4,7,6,5,3,2,1}},
|
||||
{0x18,2,OC_EXT_ROWS+ 90,{3,4,7,6,5,2,1,0},{0,1,7,6,5,4,3,2}},
|
||||
{0x0C,2,OC_EXT_ROWS+ 34,{2,3,7,6,5,4,1,0},{0,4,7,6,5,3,2,1}},
|
||||
{0x06,2,OC_EXT_ROWS+ 84,{1,2,7,6,5,4,3,0},{0,2,7,6,5,4,3,1}},
|
||||
{0x40,1,OC_EXT_ROWS+ 0,{6,7,5,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
|
||||
{0x20,1,OC_EXT_ROWS+ 0,{5,7,6,4,3,2,1,0},{0,7,6,5,4,3,2,1}},
|
||||
{0x10,1,OC_EXT_ROWS+ 0,{4,7,6,5,3,2,1,0},{0,7,6,5,4,3,2,1}},
|
||||
{0x08,1,OC_EXT_ROWS+ 0,{3,7,6,5,4,2,1,0},{0,7,6,5,4,3,2,1}},
|
||||
{0x04,1,OC_EXT_ROWS+ 0,{2,7,6,5,4,3,1,0},{0,7,6,5,4,3,2,1}},
|
||||
{0x02,1,OC_EXT_ROWS+ 0,{1,7,6,5,4,3,2,0},{0,7,6,5,4,3,2,1}}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Pads a single column of a partial block and then performs a forward Type-II
|
||||
DCT on the result.
|
||||
The input is scaled by a factor of 4 and biased appropriately for the current
|
||||
fDCT implementation.
|
||||
The output is scaled by an additional factor of 2 from the orthonormal
|
||||
version of the transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed the first 8 entries (e.g., in a row of an 8x8 block).
|
||||
_x: The input coefficients.
|
||||
Every 8th entry is used (e.g., from a column of an 8x8 block).
|
||||
_e: The extension information for the shape.*/
|
||||
static void oc_fdct8_ext(ogg_int16_t _y[8],ogg_int16_t *_x,
|
||||
const oc_extension_info *_e){
|
||||
const unsigned char *pi;
|
||||
int na;
|
||||
na=_e->na;
|
||||
pi=_e->pi;
|
||||
if(na==1){
|
||||
int ci;
|
||||
/*While the branch below is still correct for shapes with na==1, we can
|
||||
perform the entire transform with just 1 multiply in this case instead
|
||||
of 23.*/
|
||||
_y[0]=(ogg_int16_t)(OC_DIV2_16(OC_C4S4*(_x[pi[0]])));
|
||||
for(ci=1;ci<8;ci++)_y[ci]=0;
|
||||
}
|
||||
else{
|
||||
const ogg_int16_t *const *ext;
|
||||
int zpi;
|
||||
int api;
|
||||
int nz;
|
||||
/*First multiply by the extension matrix to compute the padding values.*/
|
||||
nz=8-na;
|
||||
ext=_e->ext;
|
||||
for(zpi=0;zpi<nz;zpi++){
|
||||
ogg_int32_t v;
|
||||
v=0;
|
||||
for(api=0;api<na;api++){
|
||||
v+=ext[zpi][api]*(ogg_int32_t)(_x[pi[api]<<3]<<1);
|
||||
}
|
||||
_x[pi[na+zpi]<<3]=(ogg_int16_t)(v+0x8000>>16)+1>>1;
|
||||
}
|
||||
oc_fdct8(_y,_x);
|
||||
}
|
||||
}
|
||||
|
||||
/*Performs a forward 8x8 Type-II DCT transform on blocks which overlap the
|
||||
border of the picture region.
|
||||
This method ONLY works with rectangular regions.
|
||||
_border: A description of which pixels are inside the border.
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input pixel values.
|
||||
Pixel values outside the border will be ignored.*/
|
||||
void oc_fdct8x8_border(const oc_border_info *_border,
|
||||
ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
ogg_int16_t *in;
|
||||
ogg_int16_t *out;
|
||||
ogg_int16_t w[64];
|
||||
ogg_int64_t mask;
|
||||
const oc_extension_info *cext;
|
||||
const oc_extension_info *rext;
|
||||
int cmask;
|
||||
int rmask;
|
||||
int ri;
|
||||
int ci;
|
||||
/*Identify the shapes of the non-zero rows and columns.*/
|
||||
rmask=cmask=0;
|
||||
mask=_border->mask;
|
||||
for(ri=0;ri<8;ri++){
|
||||
/*This aggregation is _only_ correct for rectangular masks.*/
|
||||
cmask|=((mask&0xFF)!=0)<<ri;
|
||||
rmask|=mask&0xFF;
|
||||
mask>>=8;
|
||||
}
|
||||
/*Find the associated extension info for these shapes.*/
|
||||
if(cmask==0xFF)cext=NULL;
|
||||
else for(cext=OC_EXTENSION_INFO;cext->mask!=cmask;){
|
||||
/*If we somehow can't find the shape, then just do an unpadded fDCT.
|
||||
It won't be efficient, but it should still be correct.*/
|
||||
if(++cext>=OC_EXTENSION_INFO+OC_NSHAPES){
|
||||
oc_enc_fdct8x8_c(_y,_x);
|
||||
return;
|
||||
}
|
||||
}
|
||||
if(rmask==0xFF)rext=NULL;
|
||||
else for(rext=OC_EXTENSION_INFO;rext->mask!=rmask;){
|
||||
/*If we somehow can't find the shape, then just do an unpadded fDCT.
|
||||
It won't be efficient, but it should still be correct.*/
|
||||
if(++rext>=OC_EXTENSION_INFO+OC_NSHAPES){
|
||||
oc_enc_fdct8x8_c(_y,_x);
|
||||
return;
|
||||
}
|
||||
}
|
||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||
we could overflow.*/
|
||||
for(ci=0;ci<64;ci++)w[ci]=_x[ci]<<2;
|
||||
/*These biases correct for some systematic error that remains in the full
|
||||
fDCT->iDCT round trip.
|
||||
We can safely add them before padding, since if these pixel values are
|
||||
overwritten, we didn't care what they were anyway (and the unbiased values
|
||||
will usually yield smaller DCT coefficient magnitudes).*/
|
||||
w[0]+=(w[0]!=0)+1;
|
||||
w[1]++;
|
||||
w[8]--;
|
||||
/*Transform the columns.
|
||||
We can ignore zero columns without a problem.*/
|
||||
in=w;
|
||||
out=_y;
|
||||
if(cext==NULL)for(ci=0;ci<8;ci++)oc_fdct8(out+(ci<<3),in+ci);
|
||||
else for(ci=0;ci<8;ci++)if(rmask&(1<<ci))oc_fdct8_ext(out+(ci<<3),in+ci,cext);
|
||||
/*Transform the rows.
|
||||
We transform even rows that are supposedly zero, because rounding errors
|
||||
may make them slightly non-zero, and this will give a more precise
|
||||
reconstruction with very small quantizers.*/
|
||||
in=_y;
|
||||
out=w;
|
||||
if(rext==NULL)for(ri=0;ri<8;ri++)oc_fdct8(out+(ri<<3),in+ri);
|
||||
else for(ri=0;ri<8;ri++)oc_fdct8_ext(out+(ri<<3),in+ri,rext);
|
||||
/*Round the result back to the external working precision (which is still
|
||||
scaled by four relative to the orthogonal result).
|
||||
TODO: We should just update the external working precision.*/
|
||||
for(ci=0;ci<64;ci++)_y[ci]=w[ci]+2>>2;
|
||||
}
|
||||
#endif
|
||||
82
engine/thirdparty/libtheora/fragment.c
vendored
Normal file
82
engine/thirdparty/libtheora/fragment.c
vendored
Normal file
|
|
@ -0,0 +1,82 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include <string.h>
|
||||
#include "internal.h"
|
||||
|
||||
/*Copies an 8x8 block of bytes from one buffer to another.
  _dst:     The top-left byte of the destination block.
  _src:     The top-left byte of the source block.
  _ystride: The distance in bytes between consecutive rows, applied to both
             buffers.*/
void oc_frag_copy_c(unsigned char *_dst,const unsigned char *_src,int _ystride){
  int rows_left;
  rows_left=8;
  while(rows_left>0){
    memcpy(_dst,_src,8*sizeof(*_dst));
    _dst+=_ystride;
    _src+=_ystride;
    rows_left--;
  }
}
|
||||
|
||||
/*Copies every fragment named in a list of fragment indices from one frame to
   another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_c(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t n;
  n=0;
  while(n<_nfragis){
    ptrdiff_t off;
    /*The same offset locates the fragment in both frames.*/
    off=_frag_buf_offs[_fragis[n]];
    oc_frag_copy_c(_dst_frame+off,_src_frame+off,_ystride);
    n++;
  }
}
|
||||
|
||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t _residue[64]){
|
||||
int i;
|
||||
for(i=0;i<8;i++){
|
||||
int j;
|
||||
for(j=0;j<8;j++)_dst[j]=OC_CLAMP255(_residue[i*8+j]+128);
|
||||
_dst+=_ystride;
|
||||
}
|
||||
}
|
||||
|
||||
/*Reconstructs an 8x8 block by adding the residue to a predictor block and
   passing each sum through OC_CLAMP255.
  _dst:     The top-left byte of the destination block.
  _src:     The top-left byte of the predictor block.
  _ystride: The distance in bytes between consecutive rows, applied to both
             _dst and _src.
  _residue: The 64 residue values, stored row by row.*/
void oc_frag_recon_inter_c(unsigned char *_dst,
 const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]){
  const ogg_int16_t *res_row;
  int                row;
  res_row=_residue;
  for(row=0;row<8;row++){
    int col;
    for(col=0;col<8;col++){
      _dst[col]=OC_CLAMP255(res_row[col]+_src[col]);
    }
    res_row+=8;
    _dst+=_ystride;
    _src+=_ystride;
  }
}
|
||||
|
||||
/*Reconstructs an 8x8 block by adding the residue to the truncated average of
   two predictor blocks and passing each sum through OC_CLAMP255.
  _dst:     The top-left byte of the destination block.
  _src1:    The top-left byte of the first predictor block.
  _src2:    The top-left byte of the second predictor block.
  _ystride: The distance in bytes between consecutive rows, applied to _dst,
             _src1 and _src2.
  _residue: The 64 residue values, stored row by row.*/
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]){
  const ogg_int16_t *res_row;
  int                row;
  res_row=_residue;
  for(row=0;row<8;row++){
    int col;
    for(col=0;col<8;col++){
      _dst[col]=OC_CLAMP255(res_row[col]+((_src1[col]+_src2[col])>>1));
    }
    res_row+=8;
    _dst+=_ystride;
    _src1+=_ystride;
    _src2+=_ystride;
  }
}
|
||||
|
||||
/*Does nothing: the body is intentionally empty.
  NOTE(review): presumably the portable C fallback for a hook that restores
   FPU state in accelerated builds -- confirm against the callers.*/
void oc_restore_fpu_c(void){}
|
||||
515
engine/thirdparty/libtheora/huffdec.c
vendored
Normal file
515
engine/thirdparty/libtheora/huffdec.c
vendored
Normal file
|
|
@ -0,0 +1,515 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ogg/ogg.h>
|
||||
#include "huffdec.h"
|
||||
#include "decint.h"
|
||||
|
||||
|
||||
|
||||
/*Instead of storing every branching in the tree, subtrees can be collapsed
|
||||
into one node, with a table of size 1<<nbits pointing directly to its
|
||||
descendants nbits levels down.
|
||||
This allows more than one bit to be read at a time, and avoids following all
|
||||
the intermediate branches with next to no increased code complexity once
|
||||
the collapsed tree has been built.
|
||||
We do _not_ require that a subtree be complete to be collapsed, but instead
|
||||
store duplicate pointers in the table, and record the actual depth of the
|
||||
node below its parent.
|
||||
This tells us the number of bits to advance the stream after reaching it.
|
||||
|
||||
This turns out to be equivalent to the method described in \cite{Hash95},
|
||||
without the requirement that codewords be sorted by length.
|
||||
If the codewords were sorted by length (so-called ``canonical-codes''), they
|
||||
could be decoded much faster via either Lindell and Moffat's approach or
|
||||
Hashemian's Condensed Huffman Code approach, the latter of which has an
|
||||
extremely small memory footprint.
|
||||
We can't use Choueka et al.'s finite state machine approach, which is
|
||||
extremely fast, because we can't allow multiple symbols to be output at a
|
||||
time; the codebook can and does change between symbols.
|
||||
It also has very large memory requirements, which impairs cache coherency.
|
||||
|
||||
We store the tree packed in an array of 16-bit integers (words).
|
||||
Each node consists of a single word, followed consecutively by two or more
|
||||
indices of its children.
|
||||
Let n be the value of this first word.
|
||||
This is the number of bits that need to be read to traverse the node, and
|
||||
must be positive.
|
||||
1<<n entries follow in the array, each an index to a child node.
|
||||
If the child is positive, then it is the index of another internal node in
|
||||
the table.
|
||||
If the child is negative or zero, then it is a leaf node.
|
||||
These are stored directly in the child pointer to save space, since they only
|
||||
require a single word.
|
||||
If a leaf node would have been encountered before reading n bits, then it is
|
||||
duplicated the necessary number of times in this table.
|
||||
Leaf nodes pack both a token value and their actual depth in the tree.
|
||||
The token in the leaf node is (-leaf&255).
|
||||
The number of bits that need to be consumed to reach the leaf, starting from
|
||||
the current node, is (-leaf>>8).
|
||||
|
||||
@ARTICLE{Hash95,
|
||||
author="Reza Hashemian",
|
||||
title="Memory Efficient and High-Speed Search {Huffman} Coding",
|
||||
journal="{IEEE} Transactions on Communications",
|
||||
volume=43,
|
||||
number=10,
|
||||
pages="2576--2581",
|
||||
month=Oct,
|
||||
year=1995
|
||||
}*/
|
||||
|
||||
|
||||
|
||||
/*The map from external spec-defined tokens to internal tokens.
|
||||
This is constructed so that any extra bits read with the original token value
|
||||
can be masked off the least significant bits of its internal token index.
|
||||
In addition, all of the tokens which require additional extra bits are placed
|
||||
at the start of the list, and grouped by type.
|
||||
OC_DCT_REPEAT_RUN3_TOKEN is placed first, as it is an extra-special case, so
|
||||
giving it index 0 may simplify comparisons on some architectures.
|
||||
These requirements require some substantial reordering.*/
|
||||
static const unsigned char OC_DCT_TOKEN_MAP[TH_NDCT_TOKENS]={
|
||||
/*OC_DCT_EOB1_TOKEN (0 extra bits)*/
|
||||
15,
|
||||
/*OC_DCT_EOB2_TOKEN (0 extra bits)*/
|
||||
16,
|
||||
/*OC_DCT_EOB3_TOKEN (0 extra bits)*/
|
||||
17,
|
||||
/*OC_DCT_REPEAT_RUN0_TOKEN (2 extra bits)*/
|
||||
88,
|
||||
/*OC_DCT_REPEAT_RUN1_TOKEN (3 extra bits)*/
|
||||
80,
|
||||
/*OC_DCT_REPEAT_RUN2_TOKEN (4 extra bits)*/
|
||||
1,
|
||||
/*OC_DCT_REPEAT_RUN3_TOKEN (12 extra bits)*/
|
||||
0,
|
||||
/*OC_DCT_SHORT_ZRL_TOKEN (3 extra bits)*/
|
||||
48,
|
||||
/*OC_DCT_ZRL_TOKEN (6 extra bits)*/
|
||||
14,
|
||||
/*OC_ONE_TOKEN (0 extra bits)*/
|
||||
56,
|
||||
/*OC_MINUS_ONE_TOKEN (0 extra bits)*/
|
||||
57,
|
||||
/*OC_TWO_TOKEN (0 extra bits)*/
|
||||
58,
|
||||
/*OC_MINUS_TWO_TOKEN (0 extra bits)*/
|
||||
59,
|
||||
/*OC_DCT_VAL_CAT2 (1 extra bit)*/
|
||||
60,
|
||||
62,
|
||||
64,
|
||||
66,
|
||||
/*OC_DCT_VAL_CAT3 (2 extra bits)*/
|
||||
68,
|
||||
/*OC_DCT_VAL_CAT4 (3 extra bits)*/
|
||||
72,
|
||||
/*OC_DCT_VAL_CAT5 (4 extra bits)*/
|
||||
2,
|
||||
/*OC_DCT_VAL_CAT6 (5 extra bits)*/
|
||||
4,
|
||||
/*OC_DCT_VAL_CAT7 (6 extra bits)*/
|
||||
6,
|
||||
/*OC_DCT_VAL_CAT8 (10 extra bits)*/
|
||||
8,
|
||||
/*OC_DCT_RUN_CAT1A (1 extra bit)*/
|
||||
18,
|
||||
20,
|
||||
22,
|
||||
24,
|
||||
26,
|
||||
/*OC_DCT_RUN_CAT1B (3 extra bits)*/
|
||||
32,
|
||||
/*OC_DCT_RUN_CAT1C (4 extra bits)*/
|
||||
12,
|
||||
/*OC_DCT_RUN_CAT2A (2 extra bits)*/
|
||||
28,
|
||||
/*OC_DCT_RUN_CAT2B (3 extra bits)*/
|
||||
40
|
||||
};
|
||||
|
||||
/*The log base 2 of number of internal tokens associated with each of the spec
|
||||
tokens (i.e., how many of the extra bits are folded into the token value).
|
||||
Increasing the maximum value beyond 3 will enlarge the amount of stack
|
||||
required for tree construction.*/
|
||||
static const unsigned char OC_DCT_TOKEN_MAP_LOG_NENTRIES[TH_NDCT_TOKENS]={
|
||||
0,0,0,2,3,0,0,3,0,0,0,0,0,1,1,1,1,2,3,1,1,1,2,1,1,1,1,1,3,1,2,3
|
||||
};
|
||||
|
||||
|
||||
/*How large a lookup table may grow relative to the number of unique nodes it
   contains.
  E.g., if OC_HUFF_SLUSH is 4, then at most 75% of the space in the tree is
   wasted (1/4 of the space must be used).
  Larger values decode tokens with fewer read operations; smaller values may
   save more space.
  Measured on a sample file:
   32233473 read calls are required when no tree collapsing is done (100.0%).
   19269269 read calls are required when OC_HUFF_SLUSH is 1 (59.8%).
   11144969 read calls are required when OC_HUFF_SLUSH is 2 (34.6%).
   10538563 read calls are required when OC_HUFF_SLUSH is 4 (32.7%).
   10192578 read calls are required when OC_HUFF_SLUSH is 8 (31.6%).
  A value of 2 already delivers the vast majority of the speed-up while
   wasting only a small amount of memory, so that is what we use.
  This value must be less than 128, or you could create a tree with more than
   32767 entries, which would overflow the 16-bit words used to index it.*/
#define OC_HUFF_SLUSH (2)
/*The slush factor used for the root node only.
  The root of the tree is on the fast path, so a larger value pays off more
   here than anywhere else in the tree.
  7 appears to give the best performance, balancing increased use of the
   single-read fast path against the cache footprint of the tables, though
   obviously this will depend on your cache size.
  Using 7 here, the VP3 tables are about twice as large compared to using 2.*/
#define OC_ROOT_HUFF_SLUSH (7)
|
||||
|
||||
|
||||
|
||||
/*Unpacks a Huffman codebook.
|
||||
_opb: The buffer to unpack from.
|
||||
_tokens: Stores a list of internal tokens, in the order they were found in
|
||||
the codebook, and the lengths of their corresponding codewords.
|
||||
This is enough to completely define the codebook, while minimizing
|
||||
stack usage and avoiding temporary allocations (for platforms
|
||||
where free() is a no-op).
|
||||
Return: The number of internal tokens in the codebook, or a negative value
|
||||
on error.*/
|
||||
int oc_huff_tree_unpack(oc_pack_buf *_opb,unsigned char _tokens[256][2]){
|
||||
ogg_uint32_t code;
|
||||
int len;
|
||||
int ntokens;
|
||||
int nleaves;
|
||||
code=0;
|
||||
len=ntokens=nleaves=0;
|
||||
for(;;){
|
||||
long bits;
|
||||
bits=oc_pack_read1(_opb);
|
||||
/*Only process nodes so long as there's more bits in the buffer.*/
|
||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||
/*Read an internal node:*/
|
||||
if(!bits){
|
||||
len++;
|
||||
/*Don't allow codewords longer than 32 bits.*/
|
||||
if(len>32)return TH_EBADHEADER;
|
||||
}
|
||||
/*Read a leaf node:*/
|
||||
else{
|
||||
ogg_uint32_t code_bit;
|
||||
int neb;
|
||||
int nentries;
|
||||
int token;
|
||||
/*Don't allow more than 32 spec-tokens per codebook.*/
|
||||
if(++nleaves>32)return TH_EBADHEADER;
|
||||
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
||||
neb=OC_DCT_TOKEN_MAP_LOG_NENTRIES[bits];
|
||||
token=OC_DCT_TOKEN_MAP[bits];
|
||||
nentries=1<<neb;
|
||||
while(nentries-->0){
|
||||
_tokens[ntokens][0]=(unsigned char)token++;
|
||||
_tokens[ntokens][1]=(unsigned char)(len+neb);
|
||||
ntokens++;
|
||||
}
|
||||
code_bit=0x80000000U>>len-1;
|
||||
while(len>0&&(code&code_bit)){
|
||||
code^=code_bit;
|
||||
code_bit<<=1;
|
||||
len--;
|
||||
}
|
||||
if(len<=0)break;
|
||||
code|=code_bit;
|
||||
}
|
||||
}
|
||||
return ntokens;
|
||||
}
|
||||
|
||||
/*Count how many tokens would be required to fill a subtree at depth _depth.
|
||||
_tokens: A list of internal tokens, in the order they are found in the
|
||||
codebook, and the lengths of their corresponding codewords.
|
||||
_depth: The depth of the desired node in the corresponding tree structure.
|
||||
Return: The number of tokens that belong to that subtree.*/
|
||||
static int oc_huff_subtree_tokens(unsigned char _tokens[][2],int _depth){
|
||||
ogg_uint32_t code;
|
||||
int ti;
|
||||
code=0;
|
||||
ti=0;
|
||||
do{
|
||||
if(_tokens[ti][1]-_depth<32)code+=0x80000000U>>_tokens[ti++][1]-_depth;
|
||||
else{
|
||||
/*Because of the expanded internal tokens, we can have codewords as long
|
||||
as 35 bits.
|
||||
A single recursion here is enough to advance past them.*/
|
||||
code++;
|
||||
ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+31);
|
||||
}
|
||||
}
|
||||
while(code<0x80000000U);
|
||||
return ti;
|
||||
}
|
||||
|
||||
/*Compute the number of bits to use for a collapsed tree node at the given
|
||||
depth.
|
||||
_tokens: A list of internal tokens, in the order they are found in the
|
||||
codebook, and the lengths of their corresponding codewords.
|
||||
_ntokens: The number of tokens corresponding to this tree node.
|
||||
_depth: The depth of this tree node.
|
||||
Return: The number of bits to use for a collapsed tree node rooted here.
|
||||
This is always at least one, even if this was a leaf node.*/
|
||||
static int oc_huff_tree_collapse_depth(unsigned char _tokens[][2],
|
||||
int _ntokens,int _depth){
|
||||
int got_leaves;
|
||||
int loccupancy;
|
||||
int occupancy;
|
||||
int slush;
|
||||
int nbits;
|
||||
int best_nbits;
|
||||
slush=_depth>0?OC_HUFF_SLUSH:OC_ROOT_HUFF_SLUSH;
|
||||
/*It's legal to have a tree with just a single node, which requires no bits
|
||||
to decode and always returns the same token.
|
||||
However, no encoder actually does this (yet).
|
||||
To avoid a special case in oc_huff_token_decode(), we force the number of
|
||||
lookahead bits to be at least one.
|
||||
This will produce a tree that looks ahead one bit and then advances the
|
||||
stream zero bits.*/
|
||||
nbits=1;
|
||||
occupancy=2;
|
||||
got_leaves=1;
|
||||
do{
|
||||
int ti;
|
||||
if(got_leaves)best_nbits=nbits;
|
||||
nbits++;
|
||||
got_leaves=0;
|
||||
loccupancy=occupancy;
|
||||
for(occupancy=ti=0;ti<_ntokens;occupancy++){
|
||||
if(_tokens[ti][1]<_depth+nbits)ti++;
|
||||
else if(_tokens[ti][1]==_depth+nbits){
|
||||
got_leaves=1;
|
||||
ti++;
|
||||
}
|
||||
else ti+=oc_huff_subtree_tokens(_tokens+ti,_depth+nbits);
|
||||
}
|
||||
}
|
||||
while(occupancy>loccupancy&&occupancy*slush>=1<<nbits);
|
||||
return best_nbits;
|
||||
}
|
||||
|
||||
/*Computes the number of words occupied by a collapsed Huffman tree node that
   consumes _nbits bits: one header word plus one child entry for each of the
   1<<_nbits possible bit patterns.
  _nbits: The depth of the subtree.
          This must be greater than zero.
  Return: The number of words required to store the node.*/
static size_t oc_huff_node_size(int _nbits){
  int nchildren;
  nchildren=1<<_nbits;
  return 1+nchildren;
}
|
||||
|
||||
/*Produces a collapsed-tree representation of the given token list.
|
||||
_tree: The storage for the collapsed Huffman tree.
|
||||
This may be NULL to compute the required storage size instead of
|
||||
constructing the tree.
|
||||
_tokens: A list of internal tokens, in the order they are found in the
|
||||
codebook, and the lengths of their corresponding codewords.
|
||||
_ntokens: The number of tokens corresponding to this tree node.
|
||||
Return: The number of words required to store the tree.*/
|
||||
static size_t oc_huff_tree_collapse(ogg_int16_t *_tree,
|
||||
unsigned char _tokens[][2],int _ntokens){
|
||||
ogg_int16_t node[34];
|
||||
unsigned char depth[34];
|
||||
unsigned char last[34];
|
||||
size_t ntree;
|
||||
int ti;
|
||||
int l;
|
||||
depth[0]=0;
|
||||
last[0]=(unsigned char)(_ntokens-1);
|
||||
ntree=0;
|
||||
ti=0;
|
||||
l=0;
|
||||
do{
|
||||
int nbits;
|
||||
nbits=oc_huff_tree_collapse_depth(_tokens+ti,last[l]+1-ti,depth[l]);
|
||||
node[l]=(ogg_int16_t)ntree;
|
||||
ntree+=oc_huff_node_size(nbits);
|
||||
if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)nbits;
|
||||
do{
|
||||
while(ti<=last[l]&&_tokens[ti][1]<=depth[l]+nbits){
|
||||
if(_tree!=NULL){
|
||||
ogg_int16_t leaf;
|
||||
int nentries;
|
||||
nentries=1<<depth[l]+nbits-_tokens[ti][1];
|
||||
leaf=(ogg_int16_t)-(_tokens[ti][1]-depth[l]<<8|_tokens[ti][0]);
|
||||
while(nentries-->0)_tree[node[l]++]=leaf;
|
||||
}
|
||||
ti++;
|
||||
}
|
||||
if(ti<=last[l]){
|
||||
/*We need to recurse*/
|
||||
depth[l+1]=(unsigned char)(depth[l]+nbits);
|
||||
if(_tree!=NULL)_tree[node[l]++]=(ogg_int16_t)ntree;
|
||||
l++;
|
||||
last[l]=
|
||||
(unsigned char)(ti+oc_huff_subtree_tokens(_tokens+ti,depth[l])-1);
|
||||
break;
|
||||
}
|
||||
/*Pop back up a level of recursion.*/
|
||||
else if(l-->0)nbits=depth[l+1]-depth[l];
|
||||
}
|
||||
while(l>=0);
|
||||
}
|
||||
while(l>=0);
|
||||
return ntree;
|
||||
}
|
||||
|
||||
/*Unpacks a set of Huffman trees, and reduces them to a collapsed
|
||||
representation.
|
||||
_opb: The buffer to unpack the trees from.
|
||||
_nodes: The table to fill with the Huffman trees.
|
||||
Return: 0 on success, or a negative value on error.
|
||||
The caller is responsible for cleaning up any partially initialized
|
||||
_nodes on failure.*/
|
||||
int oc_huff_trees_unpack(oc_pack_buf *_opb,
|
||||
ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
|
||||
int i;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||
unsigned char tokens[256][2];
|
||||
int ntokens;
|
||||
ogg_int16_t *tree;
|
||||
size_t size;
|
||||
/*Unpack the full tree into a temporary buffer.*/
|
||||
ntokens=oc_huff_tree_unpack(_opb,tokens);
|
||||
if(ntokens<0)return ntokens;
|
||||
/*Figure out how big the collapsed tree will be and allocate space for it.*/
|
||||
size=oc_huff_tree_collapse(NULL,tokens,ntokens);
|
||||
/*This should never happen; if it does it means you set OC_HUFF_SLUSH or
|
||||
OC_ROOT_HUFF_SLUSH too large.*/
|
||||
if(size>32767)return TH_EIMPL;
|
||||
tree=(ogg_int16_t *)_ogg_malloc(size*sizeof(*tree));
|
||||
if(tree==NULL)return TH_EFAULT;
|
||||
/*Construct the collapsed the tree.*/
|
||||
oc_huff_tree_collapse(tree,tokens,ntokens);
|
||||
_nodes[i]=tree;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*Determines the size in words of a Huffman subtree.
|
||||
_tree: The complete Huffman tree.
|
||||
_node: The index of the root of the desired subtree.
|
||||
Return: The number of words required to store the tree.*/
|
||||
static size_t oc_huff_tree_size(const ogg_int16_t *_tree,int _node){
|
||||
size_t size;
|
||||
int nchildren;
|
||||
int n;
|
||||
int i;
|
||||
n=_tree[_node];
|
||||
size=oc_huff_node_size(n);
|
||||
nchildren=1<<n;
|
||||
i=0;
|
||||
do{
|
||||
int child;
|
||||
child=_tree[_node+i+1];
|
||||
if(child<=0)i+=1<<n-(-child>>8);
|
||||
else{
|
||||
size+=oc_huff_tree_size(_tree,child);
|
||||
i++;
|
||||
}
|
||||
}
|
||||
while(i<nchildren);
|
||||
return size;
|
||||
}
|
||||
|
||||
/*Makes a copy of the given set of Huffman trees.
|
||||
_dst: The array to store the copy in.
|
||||
_src: The array of trees to copy.*/
|
||||
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
|
||||
const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]){
|
||||
int total;
|
||||
int i;
|
||||
total=0;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||
size_t size;
|
||||
size=oc_huff_tree_size(_src[i],0);
|
||||
total+=size;
|
||||
_dst[i]=(ogg_int16_t *)_ogg_malloc(size*sizeof(*_dst[i]));
|
||||
if(_dst[i]==NULL){
|
||||
while(i-->0)_ogg_free(_dst[i]);
|
||||
return TH_EFAULT;
|
||||
}
|
||||
memcpy(_dst[i],_src[i],size*sizeof(*_dst[i]));
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*Frees the memory used by a set of Huffman trees.
|
||||
_nodes: The array of trees to free.*/
|
||||
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]){
|
||||
int i;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++)_ogg_free(_nodes[i]);
|
||||
}
|
||||
|
||||
|
||||
/*Unpacks a single token using the given Huffman tree.
|
||||
_opb: The buffer to unpack the token from.
|
||||
_node: The tree to unpack the token with.
|
||||
Return: The token value.*/
|
||||
int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_tree){
|
||||
const unsigned char *ptr;
|
||||
const unsigned char *stop;
|
||||
oc_pb_window window;
|
||||
int available;
|
||||
long bits;
|
||||
int node;
|
||||
int n;
|
||||
ptr=_opb->ptr;
|
||||
window=_opb->window;
|
||||
stop=_opb->stop;
|
||||
available=_opb->bits;
|
||||
node=0;
|
||||
for(;;){
|
||||
n=_tree[node];
|
||||
if(n>available){
|
||||
unsigned shift;
|
||||
shift=OC_PB_WINDOW_SIZE-available;
|
||||
do{
|
||||
/*We don't bother setting eof because we won't check for it after we've
|
||||
started decoding DCT tokens.*/
|
||||
if(ptr>=stop){
|
||||
shift=(unsigned)-OC_LOTS_OF_BITS;
|
||||
break;
|
||||
}
|
||||
shift-=8;
|
||||
window|=(oc_pb_window)*ptr++<<shift;
|
||||
}
|
||||
while(shift>=8);
|
||||
/*Note: We never request more than 24 bits, so there's no need to fill in
|
||||
the last partial byte here.*/
|
||||
available=OC_PB_WINDOW_SIZE-shift;
|
||||
}
|
||||
bits=window>>OC_PB_WINDOW_SIZE-n;
|
||||
node=_tree[node+1+bits];
|
||||
if(node<=0)break;
|
||||
window<<=n;
|
||||
available-=n;
|
||||
}
|
||||
node=-node;
|
||||
n=node>>8;
|
||||
window<<=n;
|
||||
available-=n;
|
||||
_opb->ptr=ptr;
|
||||
_opb->window=window;
|
||||
_opb->bits=available;
|
||||
return node&255;
|
||||
}
|
||||
32
engine/thirdparty/libtheora/huffdec.h
vendored
Normal file
32
engine/thirdparty/libtheora/huffdec.h
vendored
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_huffdec_H)
# define _huffdec_H (1)
# include "huffman.h"
# include "bitpack.h"



/*Unpacks TH_NHUFFMAN_TABLES Huffman trees from _opb into _nodes.
  Returns 0 on success, or a negative value on error, in which case the caller
   must clean up any partially initialized _nodes.*/
int oc_huff_trees_unpack(oc_pack_buf *_opb,
 ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);

/*Copies the set of trees in _src into newly allocated storage in _dst.
  Returns 0 on success, or TH_EFAULT if an allocation fails.*/
int oc_huff_trees_copy(ogg_int16_t *_dst[TH_NHUFFMAN_TABLES],
 const ogg_int16_t *const _src[TH_NHUFFMAN_TABLES]);

/*Frees every tree in _nodes.*/
void oc_huff_trees_clear(ogg_int16_t *_nodes[TH_NHUFFMAN_TABLES]);

/*Decodes a single token from _opb using the Huffman tree _node and returns
   the token value.*/
int oc_huff_token_decode_c(oc_pack_buf *_opb,const ogg_int16_t *_node);

#endif
|
||||
966
engine/thirdparty/libtheora/huffenc.c
vendored
Normal file
966
engine/thirdparty/libtheora/huffenc.c
vendored
Normal file
|
|
@ -0,0 +1,966 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ogg/ogg.h>
|
||||
#include "huffenc.h"
|
||||
|
||||
|
||||
|
||||
/*The default Huffman codes used for VP3.1.*/
|
||||
const th_huff_code TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]={
|
||||
{
|
||||
{0x002D, 6},{0x0026, 7},{0x0166, 9},{0x004E, 8},
|
||||
{0x02CE,10},{0x059E,11},{0x027D,11},{0x0008, 5},
|
||||
{0x04F9,12},{0x000F, 4},{0x000E, 4},{0x001B, 5},
|
||||
{0x0006, 4},{0x0008, 4},{0x0005, 4},{0x001A, 5},
|
||||
{0x0015, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
|
||||
{0x0000, 3},{0x0009, 4},{0x0017, 5},{0x0029, 6},
|
||||
{0x0028, 6},{0x00B2, 8},{0x04F8,12},{0x059F,11},
|
||||
{0x009E, 9},{0x013F,10},{0x0012, 6},{0x0058, 7}
|
||||
},
|
||||
{
|
||||
{0x0010, 5},{0x0047, 7},{0x01FF, 9},{0x008C, 8},
|
||||
{0x03FC,10},{0x046A,11},{0x0469,11},{0x0022, 6},
|
||||
{0x11A1,13},{0x000E, 4},{0x000D, 4},{0x0004, 4},
|
||||
{0x0005, 4},{0x0009, 4},{0x0006, 4},{0x001E, 5},
|
||||
{0x0016, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
|
||||
{0x0000, 3},{0x000A, 4},{0x0017, 5},{0x007D, 7},
|
||||
{0x007E, 7},{0x011B, 9},{0x08D1,12},{0x03FD,10},
|
||||
{0x046B,11},{0x11A0,13},{0x007C, 7},{0x00FE, 8}
|
||||
},
|
||||
{
|
||||
{0x0016, 5},{0x0020, 6},{0x0086, 8},{0x0087, 8},
|
||||
{0x0367,10},{0x06CC,11},{0x06CB,11},{0x006E, 7},
|
||||
{0x366D,14},{0x000F, 4},{0x000E, 4},{0x0004, 4},
|
||||
{0x0005, 4},{0x000A, 4},{0x0006, 4},{0x001A, 5},
|
||||
{0x0011, 5},{0x0007, 4},{0x000C, 4},{0x0001, 3},
|
||||
{0x0000, 3},{0x0009, 4},{0x0017, 5},{0x006F, 7},
|
||||
{0x006D, 7},{0x0364,10},{0x0D9A,12},{0x06CA,11},
|
||||
{0x1B37,13},{0x366C,14},{0x0042, 7},{0x00D8, 8}
|
||||
},
|
||||
{
|
||||
{0x0000, 4},{0x002D, 6},{0x00F7, 8},{0x0058, 7},
|
||||
{0x0167, 9},{0x02CB,10},{0x02CA,10},{0x000E, 6},
|
||||
{0x1661,13},{0x0003, 3},{0x0002, 3},{0x0008, 4},
|
||||
{0x0009, 4},{0x000D, 4},{0x0002, 4},{0x001F, 5},
|
||||
{0x0017, 5},{0x0001, 4},{0x000C, 4},{0x000E, 4},
|
||||
{0x000A, 4},{0x0006, 5},{0x0078, 7},{0x000F, 6},
|
||||
{0x007A, 7},{0x0164, 9},{0x0599,11},{0x02CD,10},
|
||||
{0x0B31,12},{0x1660,13},{0x0079, 7},{0x00F6, 8}
|
||||
},
|
||||
{
|
||||
{0x0003, 4},{0x003C, 6},{0x000F, 7},{0x007A, 7},
|
||||
{0x001D, 8},{0x0020, 9},{0x0072,10},{0x0006, 6},
|
||||
{0x0399,13},{0x0004, 3},{0x0005, 3},{0x0005, 4},
|
||||
{0x0006, 4},{0x000E, 4},{0x0004, 4},{0x0000, 4},
|
||||
{0x0019, 5},{0x0002, 4},{0x000D, 4},{0x0007, 4},
|
||||
{0x001F, 5},{0x0030, 6},{0x0011, 8},{0x0031, 6},
|
||||
{0x0005, 6},{0x0021, 9},{0x00E7,11},{0x0038, 9},
|
||||
{0x01CD,12},{0x0398,13},{0x007B, 7},{0x0009, 7}
|
||||
},
|
||||
{
|
||||
{0x0009, 4},{0x0002, 5},{0x0074, 7},{0x0007, 6},
|
||||
{0x00EC, 8},{0x00D1, 9},{0x01A6,10},{0x0006, 6},
|
||||
{0x0D21,13},{0x0005, 3},{0x0006, 3},{0x0008, 4},
|
||||
{0x0007, 4},{0x000F, 4},{0x0004, 4},{0x0000, 4},
|
||||
{0x001C, 5},{0x0002, 4},{0x0005, 4},{0x0003, 4},
|
||||
{0x000C, 5},{0x0035, 7},{0x01A7,10},{0x001B, 6},
|
||||
{0x0077, 7},{0x01A5,10},{0x0349,11},{0x00D0, 9},
|
||||
{0x0691,12},{0x0D20,13},{0x0075, 7},{0x00ED, 8}
|
||||
},
|
||||
{
|
||||
{0x000A, 4},{0x000C, 5},{0x0012, 6},{0x001B, 6},
|
||||
{0x00B7, 8},{0x016C, 9},{0x0099, 9},{0x005A, 7},
|
||||
{0x16D8,13},{0x0007, 3},{0x0006, 3},{0x0009, 4},
|
||||
{0x0008, 4},{0x0000, 3},{0x0005, 4},{0x0017, 5},
|
||||
{0x000E, 5},{0x0002, 4},{0x0003, 4},{0x000F, 5},
|
||||
{0x001A, 6},{0x004D, 8},{0x2DB3,14},{0x002C, 6},
|
||||
{0x0011, 6},{0x02DA,10},{0x05B7,11},{0x0098, 9},
|
||||
{0x0B6D,12},{0x2DB2,14},{0x0010, 6},{0x0027, 7}
|
||||
},
|
||||
{
|
||||
{0x000D, 4},{0x000F, 5},{0x001D, 6},{0x0008, 5},
|
||||
{0x0051, 7},{0x0056, 8},{0x00AF, 9},{0x002A, 7},
|
||||
{0x148A,13},{0x0007, 3},{0x0000, 2},{0x0008, 4},
|
||||
{0x0009, 4},{0x000C, 4},{0x0006, 4},{0x0017, 5},
|
||||
{0x000B, 5},{0x0016, 5},{0x0015, 5},{0x0009, 5},
|
||||
{0x0050, 7},{0x00AE, 9},{0x2917,14},{0x001C, 6},
|
||||
{0x0014, 6},{0x0290,10},{0x0523,11},{0x0149, 9},
|
||||
{0x0A44,12},{0x2916,14},{0x0053, 7},{0x00A5, 8}
|
||||
},
|
||||
{
|
||||
{0x0001, 4},{0x001D, 6},{0x00F5, 8},{0x00F4, 8},
|
||||
{0x024D,10},{0x0499,11},{0x0498,11},{0x0001, 5},
|
||||
{0x0021, 6},{0x0006, 3},{0x0005, 3},{0x0006, 4},
|
||||
{0x0005, 4},{0x0002, 4},{0x0007, 5},{0x0025, 6},
|
||||
{0x007B, 7},{0x001C, 6},{0x0020, 6},{0x000D, 6},
|
||||
{0x0048, 7},{0x0092, 8},{0x0127, 9},{0x000E, 4},
|
||||
{0x0004, 4},{0x0011, 5},{0x000C, 6},{0x003C, 6},
|
||||
{0x000F, 5},{0x0000, 5},{0x001F, 5},{0x0013, 5}
|
||||
},
|
||||
{
|
||||
{0x0005, 4},{0x003C, 6},{0x0040, 7},{0x000D, 7},
|
||||
{0x0031, 9},{0x0061,10},{0x0060,10},{0x0002, 5},
|
||||
{0x00F5, 8},{0x0006, 3},{0x0005, 3},{0x0007, 4},
|
||||
{0x0006, 4},{0x0002, 4},{0x0009, 5},{0x0025, 6},
|
||||
{0x0007, 6},{0x0021, 6},{0x0024, 6},{0x0010, 6},
|
||||
{0x0041, 7},{0x00F4, 8},{0x0019, 8},{0x000E, 4},
|
||||
{0x0003, 4},{0x0011, 5},{0x0011, 6},{0x003F, 6},
|
||||
{0x003E, 6},{0x007B, 7},{0x0000, 4},{0x0013, 5}
|
||||
},
|
||||
{
|
||||
{0x000A, 4},{0x0007, 5},{0x0001, 6},{0x0009, 6},
|
||||
{0x0131, 9},{0x0261,10},{0x0260,10},{0x0015, 6},
|
||||
{0x0001, 7},{0x0007, 3},{0x0006, 3},{0x0008, 4},
|
||||
{0x0007, 4},{0x0006, 4},{0x0012, 5},{0x002F, 6},
|
||||
{0x0014, 6},{0x0027, 6},{0x002D, 6},{0x0016, 6},
|
||||
{0x004D, 7},{0x0099, 8},{0x0000, 7},{0x0004, 4},
|
||||
{0x0001, 4},{0x0005, 5},{0x0017, 6},{0x002E, 6},
|
||||
{0x002C, 6},{0x0008, 6},{0x0006, 5},{0x0001, 5}
|
||||
},
|
||||
{
|
||||
{0x0000, 3},{0x000E, 5},{0x0017, 6},{0x002A, 6},
|
||||
{0x0010, 7},{0x00F9,10},{0x00F8,10},{0x001E, 7},
|
||||
{0x003F, 8},{0x0007, 3},{0x0006, 3},{0x0009, 4},
|
||||
{0x0008, 4},{0x0006, 4},{0x000F, 5},{0x0005, 5},
|
||||
{0x0016, 6},{0x0029, 6},{0x002B, 6},{0x0015, 6},
|
||||
{0x0050, 7},{0x0011, 7},{0x007D, 9},{0x0004, 4},
|
||||
{0x0017, 5},{0x0006, 5},{0x0014, 6},{0x002C, 6},
|
||||
{0x002D, 6},{0x000E, 6},{0x0009, 6},{0x0051, 7}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x0018, 5},{0x002F, 6},{0x000D, 5},
|
||||
{0x0053, 7},{0x0295,10},{0x0294,10},{0x00A4, 8},
|
||||
{0x007C, 8},{0x0000, 2},{0x0007, 3},{0x0009, 4},
|
||||
{0x0008, 4},{0x001B, 5},{0x000C, 5},{0x0028, 6},
|
||||
{0x006A, 7},{0x001E, 6},{0x001D, 6},{0x0069, 7},
|
||||
{0x00D7, 8},{0x007D, 8},{0x014B, 9},{0x0019, 5},
|
||||
{0x0016, 5},{0x002E, 6},{0x001C, 6},{0x002B, 6},
|
||||
{0x002A, 6},{0x0068, 7},{0x003F, 7},{0x00D6, 8}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x001B, 5},{0x000C, 5},{0x0018, 5},
|
||||
{0x0029, 6},{0x007F, 8},{0x02F0,10},{0x0198, 9},
|
||||
{0x0179, 9},{0x0000, 2},{0x0007, 3},{0x0009, 4},
|
||||
{0x0008, 4},{0x001A, 5},{0x000D, 5},{0x002A, 6},
|
||||
{0x0064, 7},{0x001E, 6},{0x0067, 7},{0x005F, 7},
|
||||
{0x00CD, 8},{0x007E, 8},{0x02F1,10},{0x0016, 5},
|
||||
{0x000E, 5},{0x002E, 6},{0x0065, 7},{0x002B, 6},
|
||||
{0x0028, 6},{0x003E, 7},{0x00BD, 8},{0x0199, 9}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x0007, 4},{0x0016, 5},{0x0006, 4},
|
||||
{0x0036, 6},{0x005C, 7},{0x015D, 9},{0x015C, 9},
|
||||
{0x02BF,10},{0x0000, 2},{0x0007, 3},{0x0009, 4},
|
||||
{0x0008, 4},{0x0018, 5},{0x0034, 6},{0x002A, 6},
|
||||
{0x005E, 7},{0x006A, 7},{0x0064, 7},{0x005D, 7},
|
||||
{0x00CB, 8},{0x00AD, 8},{0x02BE,10},{0x0014, 5},
|
||||
{0x0033, 6},{0x006E, 7},{0x005F, 7},{0x006F, 7},
|
||||
{0x006B, 7},{0x00CA, 8},{0x00AC, 8},{0x015E, 9}
|
||||
},
|
||||
{
|
||||
{0x000F, 4},{0x001D, 5},{0x0018, 5},{0x000B, 4},
|
||||
{0x0019, 5},{0x0029, 6},{0x00D6, 8},{0x0551,11},
|
||||
{0x0AA1,12},{0x0001, 2},{0x0000, 2},{0x0009, 4},
|
||||
{0x0008, 4},{0x001B, 5},{0x0038, 6},{0x0028, 6},
|
||||
{0x0057, 7},{0x006A, 7},{0x0068, 7},{0x0056, 7},
|
||||
{0x00E5, 8},{0x0155, 9},{0x0AA0,12},{0x0073, 7},
|
||||
{0x0069, 7},{0x00D7, 8},{0x00AB, 8},{0x00E4, 8},
|
||||
{0x00A9, 8},{0x0151, 9},{0x0150, 9},{0x02A9,10}
|
||||
},
|
||||
{
|
||||
{0x0008, 5},{0x0025, 7},{0x017A, 9},{0x02F7,10},
|
||||
{0x0BDB,12},{0x17B4,13},{0x2F6B,14},{0x001D, 5},
|
||||
{0x2F6A,14},{0x0008, 4},{0x0007, 4},{0x0001, 4},
|
||||
{0x0002, 4},{0x000A, 4},{0x0006, 4},{0x0000, 4},
|
||||
{0x001C, 5},{0x0009, 4},{0x000D, 4},{0x000F, 4},
|
||||
{0x000C, 4},{0x0003, 4},{0x000A, 5},{0x0016, 5},
|
||||
{0x0013, 6},{0x005D, 7},{0x0024, 7},{0x00BC, 8},
|
||||
{0x005C, 7},{0x05EC,11},{0x000B, 5},{0x005F, 7}
|
||||
},
|
||||
{
|
||||
{0x000F, 5},{0x0010, 6},{0x004B, 8},{0x00C6, 8},
|
||||
{0x031D,10},{0x0C71,12},{0x0C70,12},{0x0001, 4},
|
||||
{0x0C73,12},{0x0008, 4},{0x0009, 4},{0x0002, 4},
|
||||
{0x0003, 4},{0x000B, 4},{0x0006, 4},{0x0000, 4},
|
||||
{0x001C, 5},{0x0005, 4},{0x000D, 4},{0x000F, 4},
|
||||
{0x000A, 4},{0x0019, 5},{0x0013, 6},{0x001D, 5},
|
||||
{0x0030, 6},{0x0062, 7},{0x0024, 7},{0x004A, 8},
|
||||
{0x018F, 9},{0x0C72,12},{0x000E, 5},{0x0011, 6}
|
||||
},
|
||||
{
|
||||
{0x001B, 5},{0x0003, 6},{0x008D, 8},{0x0040, 7},
|
||||
{0x0239,10},{0x0471,11},{0x08E0,12},{0x0003, 4},
|
||||
{0x11C3,13},{0x000A, 4},{0x0009, 4},{0x0004, 4},
|
||||
{0x0005, 4},{0x000E, 4},{0x0007, 4},{0x0001, 4},
|
||||
{0x001E, 5},{0x0006, 4},{0x000C, 4},{0x000B, 4},
|
||||
{0x0002, 4},{0x0000, 5},{0x0041, 7},{0x001F, 5},
|
||||
{0x0022, 6},{0x0002, 6},{0x008F, 8},{0x008C, 8},
|
||||
{0x011D, 9},{0x11C2,13},{0x001A, 5},{0x0021, 6}
|
||||
},
|
||||
{
|
||||
{0x001F, 5},{0x0003, 6},{0x0003, 7},{0x0043, 7},
|
||||
{0x000B, 9},{0x0015,10},{0x0051,12},{0x0003, 4},
|
||||
{0x0050,12},{0x000D, 4},{0x000C, 4},{0x0004, 4},
|
||||
{0x0006, 4},{0x000E, 4},{0x000A, 4},{0x0001, 4},
|
||||
{0x001E, 5},{0x0005, 4},{0x0009, 4},{0x0007, 4},
|
||||
{0x0011, 5},{0x0002, 6},{0x0004, 8},{0x0002, 4},
|
||||
{0x002D, 6},{0x0020, 6},{0x0042, 7},{0x0001, 7},
|
||||
{0x0000, 7},{0x0029,11},{0x0017, 5},{0x002C, 6}
|
||||
},
|
||||
{
|
||||
{0x0003, 4},{0x001F, 6},{0x003A, 7},{0x005D, 7},
|
||||
{0x0173, 9},{0x02E4,10},{0x172D,13},{0x0004, 4},
|
||||
{0x172C,13},{0x000F, 4},{0x000E, 4},{0x0009, 4},
|
||||
{0x0008, 4},{0x000C, 4},{0x000A, 4},{0x0001, 4},
|
||||
{0x0016, 5},{0x0002, 4},{0x0005, 4},{0x001A, 5},
|
||||
{0x002F, 6},{0x0038, 7},{0x05CA,11},{0x0006, 4},
|
||||
{0x0037, 6},{0x001E, 6},{0x003B, 7},{0x0039, 7},
|
||||
{0x00B8, 8},{0x0B97,12},{0x0000, 4},{0x0036, 6}
|
||||
},
|
||||
{
|
||||
{0x0006, 4},{0x0037, 6},{0x005D, 7},{0x000C, 6},
|
||||
{0x00B9, 8},{0x02E3,10},{0x05C4,11},{0x0004, 4},
|
||||
{0x1715,13},{0x0000, 3},{0x000F, 4},{0x0008, 4},
|
||||
{0x0007, 4},{0x000C, 4},{0x0009, 4},{0x001D, 5},
|
||||
{0x0016, 5},{0x001C, 5},{0x001A, 5},{0x000B, 5},
|
||||
{0x005E, 7},{0x0170, 9},{0x1714,13},{0x000A, 4},
|
||||
{0x000A, 5},{0x0036, 6},{0x005F, 7},{0x001B, 7},
|
||||
{0x001A, 7},{0x0B8B,12},{0x0002, 4},{0x0007, 5}
|
||||
},
|
||||
{
|
||||
{0x000C, 4},{0x000B, 5},{0x0079, 7},{0x0022, 6},
|
||||
{0x00F0, 8},{0x0119, 9},{0x0230,10},{0x001D, 5},
|
||||
{0x08C4,12},{0x0001, 3},{0x0000, 3},{0x000A, 4},
|
||||
{0x0009, 4},{0x000B, 4},{0x0007, 4},{0x001C, 5},
|
||||
{0x003D, 6},{0x000D, 5},{0x0008, 5},{0x0015, 6},
|
||||
{0x008D, 8},{0x118B,13},{0x118A,13},{0x000D, 4},
|
||||
{0x0010, 5},{0x0009, 5},{0x0014, 6},{0x0047, 7},
|
||||
{0x00F1, 8},{0x0463,11},{0x001F, 5},{0x000C, 5}
|
||||
},
|
||||
{
|
||||
{0x0000, 3},{0x001A, 5},{0x0033, 6},{0x000C, 5},
|
||||
{0x0046, 7},{0x01E3, 9},{0x03C5,10},{0x0017, 5},
|
||||
{0x1E21,13},{0x0002, 3},{0x0001, 3},{0x0009, 4},
|
||||
{0x000A, 4},{0x0007, 4},{0x001B, 5},{0x003D, 6},
|
||||
{0x001B, 6},{0x0022, 6},{0x0079, 7},{0x00F0, 8},
|
||||
{0x1E20,13},{0x1E23,13},{0x1E22,13},{0x000E, 4},
|
||||
{0x0016, 5},{0x0018, 5},{0x0032, 6},{0x001A, 6},
|
||||
{0x0047, 7},{0x0789,11},{0x001F, 5},{0x0010, 5}
|
||||
},
|
||||
{
|
||||
{0x001D, 5},{0x0061, 7},{0x004E, 8},{0x009E, 9},
|
||||
{0x027C,11},{0x09F5,13},{0x09F4,13},{0x0003, 4},
|
||||
{0x0060, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
|
||||
{0x000A, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
|
||||
{0x0031, 6},{0x0008, 5},{0x0038, 6},{0x0012, 6},
|
||||
{0x0026, 7},{0x013F,10},{0x04FB,12},{0x000D, 4},
|
||||
{0x0002, 4},{0x000C, 5},{0x0039, 6},{0x001C, 6},
|
||||
{0x000F, 5},{0x001D, 6},{0x0008, 4},{0x0019, 5}
|
||||
},
|
||||
{
|
||||
{0x0007, 4},{0x0019, 6},{0x00AB, 8},{0x00AA, 8},
|
||||
{0x0119,10},{0x0461,12},{0x0460,12},{0x001B, 5},
|
||||
{0x0047, 8},{0x0001, 3},{0x0000, 3},{0x000C, 4},
|
||||
{0x000B, 4},{0x0009, 4},{0x0005, 4},{0x000D, 5},
|
||||
{0x0035, 6},{0x003D, 6},{0x003C, 6},{0x0018, 6},
|
||||
{0x0022, 7},{0x008D, 9},{0x0231,11},{0x000E, 4},
|
||||
{0x001F, 5},{0x0009, 5},{0x002B, 6},{0x0010, 6},
|
||||
{0x0034, 6},{0x0054, 7},{0x0008, 4},{0x0014, 5}
|
||||
},
|
||||
{
|
||||
{0x000C, 4},{0x0005, 5},{0x0008, 6},{0x005B, 7},
|
||||
{0x004D, 9},{0x0131,11},{0x0261,12},{0x001A, 5},
|
||||
{0x0012, 7},{0x0000, 3},{0x000F, 4},{0x000A, 4},
|
||||
{0x0009, 4},{0x0006, 4},{0x001B, 5},{0x0006, 5},
|
||||
{0x001C, 6},{0x002C, 6},{0x0015, 6},{0x005A, 7},
|
||||
{0x0027, 8},{0x0099,10},{0x0260,12},{0x000E, 4},
|
||||
{0x0004, 4},{0x000F, 5},{0x0007, 5},{0x001D, 6},
|
||||
{0x000B, 5},{0x0014, 6},{0x0008, 4},{0x0017, 5}
|
||||
},
|
||||
{
|
||||
{0x000F, 4},{0x0013, 5},{0x0075, 7},{0x0024, 6},
|
||||
{0x0095, 8},{0x0251,10},{0x04A0,11},{0x0010, 5},
|
||||
{0x00C8, 8},{0x0002, 3},{0x0001, 3},{0x0001, 4},
|
||||
{0x0000, 4},{0x001A, 5},{0x0011, 5},{0x002C, 6},
|
||||
{0x0065, 7},{0x0074, 7},{0x004B, 7},{0x00C9, 8},
|
||||
{0x0129, 9},{0x0943,12},{0x0942,12},{0x0003, 3},
|
||||
{0x000A, 4},{0x001C, 5},{0x0018, 5},{0x0033, 6},
|
||||
{0x0017, 5},{0x002D, 6},{0x001B, 5},{0x003B, 6}
|
||||
},
|
||||
{
|
||||
{0x0003, 3},{0x001A, 5},{0x002D, 6},{0x0038, 6},
|
||||
{0x0028, 7},{0x0395,10},{0x0E51,12},{0x0037, 6},
|
||||
{0x00E4, 8},{0x0001, 3},{0x0000, 3},{0x001F, 5},
|
||||
{0x001E, 5},{0x0017, 5},{0x003A, 6},{0x0073, 7},
|
||||
{0x002A, 7},{0x002B, 7},{0x0029, 7},{0x01CB, 9},
|
||||
{0x0729,11},{0x1CA1,13},{0x1CA0,13},{0x0004, 3},
|
||||
{0x000A, 4},{0x0004, 4},{0x0018, 5},{0x0036, 6},
|
||||
{0x000B, 5},{0x002C, 6},{0x0019, 5},{0x003B, 6}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0017, 5},
|
||||
{0x0075, 7},{0x01F5, 9},{0x07D1,11},{0x0017, 6},
|
||||
{0x01F6, 9},{0x0001, 3},{0x0000, 3},{0x001B, 5},
|
||||
{0x001A, 5},{0x000A, 5},{0x0032, 6},{0x0074, 7},
|
||||
{0x00F8, 8},{0x00F9, 8},{0x01F7, 9},{0x03E9,10},
|
||||
{0x0FA0,12},{0x1F43,13},{0x1F42,13},{0x0003, 3},
|
||||
{0x000A, 4},{0x001E, 5},{0x001C, 5},{0x003B, 6},
|
||||
{0x0018, 5},{0x0016, 6},{0x0016, 5},{0x0033, 6}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x0007, 4},{0x0018, 5},{0x001E, 5},
|
||||
{0x0036, 6},{0x0031, 7},{0x0177, 9},{0x0077, 7},
|
||||
{0x0176, 9},{0x0001, 3},{0x0000, 3},{0x001A, 5},
|
||||
{0x0019, 5},{0x003A, 6},{0x0019, 6},{0x005C, 7},
|
||||
{0x00BA, 8},{0x0061, 8},{0x00C1, 9},{0x0180,10},
|
||||
{0x0302,11},{0x0607,12},{0x0606,12},{0x0002, 3},
|
||||
{0x000A, 4},{0x001F, 5},{0x001C, 5},{0x0037, 6},
|
||||
{0x0016, 5},{0x0076, 7},{0x000D, 5},{0x002F, 6}
|
||||
},
|
||||
{
|
||||
{0x0000, 3},{0x000A, 4},{0x001A, 5},{0x000C, 4},
|
||||
{0x001D, 5},{0x0039, 6},{0x0078, 7},{0x005E, 7},
|
||||
{0x0393,11},{0x0002, 3},{0x0001, 3},{0x0016, 5},
|
||||
{0x000F, 5},{0x002E, 6},{0x005F, 7},{0x0073, 8},
|
||||
{0x00E5, 9},{0x01C8,10},{0x0E4A,13},{0x1C97,14},
|
||||
{0x1C96,14},{0x0E49,13},{0x0E48,13},{0x0004, 3},
|
||||
{0x0006, 4},{0x001F, 5},{0x001B, 5},{0x001D, 6},
|
||||
{0x0038, 6},{0x0038, 7},{0x003D, 6},{0x0079, 7}
|
||||
},
|
||||
{
|
||||
{0x000B, 5},{0x002B, 7},{0x0054, 8},{0x01B7, 9},
|
||||
{0x06D9,11},{0x0DB1,12},{0x0DB0,12},{0x0002, 4},
|
||||
{0x00AB, 9},{0x0009, 4},{0x000A, 4},{0x0007, 4},
|
||||
{0x0008, 4},{0x000F, 4},{0x000C, 4},{0x0003, 4},
|
||||
{0x001D, 5},{0x0004, 4},{0x000B, 4},{0x0006, 4},
|
||||
{0x001A, 5},{0x0003, 6},{0x00AA, 9},{0x0001, 4},
|
||||
{0x0000, 5},{0x0014, 6},{0x006C, 7},{0x00DA, 8},
|
||||
{0x0002, 6},{0x036D,10},{0x001C, 5},{0x0037, 6}
|
||||
},
|
||||
{
|
||||
{0x001D, 5},{0x0004, 6},{0x00B6, 8},{0x006A, 8},
|
||||
{0x05B9,11},{0x16E1,13},{0x16E0,13},{0x0007, 4},
|
||||
{0x016F, 9},{0x000C, 4},{0x000D, 4},{0x0009, 4},
|
||||
{0x0008, 4},{0x000F, 4},{0x000A, 4},{0x0003, 4},
|
||||
{0x0017, 5},{0x0002, 4},{0x0004, 4},{0x001C, 5},
|
||||
{0x002C, 6},{0x006B, 8},{0x0B71,12},{0x0005, 4},
|
||||
{0x0003, 5},{0x001B, 6},{0x005A, 7},{0x0034, 7},
|
||||
{0x0005, 6},{0x02DD,10},{0x0000, 4},{0x000C, 5}
|
||||
},
|
||||
{
|
||||
{0x0003, 4},{0x007F, 7},{0x00A1, 8},{0x00A0, 8},
|
||||
{0x020C,10},{0x0834,12},{0x106B,13},{0x0007, 4},
|
||||
{0x0082, 8},{0x000E, 4},{0x000D, 4},{0x000B, 4},
|
||||
{0x000C, 4},{0x0000, 3},{0x0009, 4},{0x0002, 4},
|
||||
{0x0011, 5},{0x001E, 5},{0x0015, 5},{0x003E, 6},
|
||||
{0x0040, 7},{0x041B,11},{0x106A,13},{0x0006, 4},
|
||||
{0x000A, 5},{0x0029, 6},{0x007E, 7},{0x0051, 7},
|
||||
{0x0021, 6},{0x0107, 9},{0x0004, 4},{0x000B, 5}
|
||||
},
|
||||
{
|
||||
{0x0007, 4},{0x001B, 6},{0x00F6, 8},{0x00E9, 8},
|
||||
{0x03A1,10},{0x0740,11},{0x0E82,12},{0x001F, 5},
|
||||
{0x01EF, 9},{0x0001, 3},{0x0002, 3},{0x000B, 4},
|
||||
{0x000C, 4},{0x000D, 4},{0x0008, 4},{0x001C, 5},
|
||||
{0x0003, 5},{0x0012, 5},{0x0002, 5},{0x0075, 7},
|
||||
{0x01D1, 9},{0x1D07,13},{0x1D06,13},{0x000A, 4},
|
||||
{0x0013, 5},{0x003B, 6},{0x001A, 6},{0x007A, 7},
|
||||
{0x003C, 6},{0x01EE, 9},{0x0000, 4},{0x000C, 5}
|
||||
},
|
||||
{
|
||||
{0x000D, 4},{0x003D, 6},{0x0042, 7},{0x0037, 7},
|
||||
{0x00D9, 9},{0x0362,11},{0x06C6,12},{0x001F, 5},
|
||||
{0x0086, 8},{0x0001, 3},{0x0002, 3},{0x000C, 4},
|
||||
{0x000B, 4},{0x000A, 4},{0x0001, 4},{0x000F, 5},
|
||||
{0x0025, 6},{0x003C, 6},{0x001A, 6},{0x0087, 8},
|
||||
{0x01B0,10},{0x0D8F,13},{0x0D8E,13},{0x000E, 4},
|
||||
{0x0013, 5},{0x000C, 5},{0x0024, 6},{0x0020, 6},
|
||||
{0x0011, 5},{0x006D, 8},{0x0000, 4},{0x000E, 5}
|
||||
},
|
||||
{
|
||||
{0x0000, 3},{0x0012, 5},{0x0076, 7},{0x0077, 7},
|
||||
{0x014D, 9},{0x0533,11},{0x14C9,13},{0x0013, 5},
|
||||
{0x00A5, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
|
||||
{0x000C, 4},{0x0008, 4},{0x001A, 5},{0x002B, 6},
|
||||
{0x0075, 7},{0x0074, 7},{0x00A7, 8},{0x0298,10},
|
||||
{0x14C8,13},{0x14CB,13},{0x14CA,13},{0x000F, 4},
|
||||
{0x001C, 5},{0x0007, 5},{0x002A, 6},{0x0028, 6},
|
||||
{0x001B, 5},{0x00A4, 8},{0x0002, 4},{0x0006, 5}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x001A, 5},{0x002B, 6},{0x003A, 6},
|
||||
{0x00ED, 8},{0x0283,10},{0x0A0A,12},{0x0004, 5},
|
||||
{0x00A1, 8},{0x0004, 3},{0x0003, 3},{0x000B, 4},
|
||||
{0x000C, 4},{0x001F, 5},{0x0006, 5},{0x0077, 7},
|
||||
{0x00A3, 8},{0x00A2, 8},{0x0140, 9},{0x1417,13},
|
||||
{0x1416,13},{0x0A09,12},{0x0A08,12},{0x0000, 3},
|
||||
{0x001E, 5},{0x0007, 5},{0x002A, 6},{0x0029, 6},
|
||||
{0x001C, 5},{0x00EC, 8},{0x001B, 5},{0x0005, 5}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x0002, 4},{0x0018, 5},{0x001D, 5},
|
||||
{0x0035, 6},{0x00E4, 8},{0x01CF,11},{0x001D, 7},
|
||||
{0x0072, 9},{0x0004, 3},{0x0005, 3},{0x0006, 4},
|
||||
{0x0007, 4},{0x0006, 5},{0x0073, 7},{0x0038, 8},
|
||||
{0x01CE,11},{0x039B,12},{0x0398,12},{0x0733,13},
|
||||
{0x0732,13},{0x0735,13},{0x0734,13},{0x0000, 3},
|
||||
{0x001F, 5},{0x001B, 5},{0x0034, 6},{0x000F, 6},
|
||||
{0x001E, 5},{0x00E5, 8},{0x0019, 5},{0x0038, 6}
|
||||
},
|
||||
{
|
||||
{0x0016, 5},{0x0050, 7},{0x0172, 9},{0x02E7,10},
|
||||
{0x1732,13},{0x2E67,14},{0x2E66,14},{0x0006, 4},
|
||||
{0x0051, 7},{0x0001, 3},{0x0000, 3},{0x000D, 4},
|
||||
{0x000C, 4},{0x0009, 4},{0x001C, 5},{0x0009, 5},
|
||||
{0x001C, 6},{0x001D, 6},{0x005D, 7},{0x00B8, 8},
|
||||
{0x05CD,11},{0x1731,13},{0x1730,13},{0x000F, 4},
|
||||
{0x0005, 4},{0x000F, 5},{0x0008, 5},{0x0029, 6},
|
||||
{0x001D, 5},{0x002F, 6},{0x0008, 4},{0x0015, 5}
|
||||
},
|
||||
{
|
||||
{0x0009, 4},{0x0021, 6},{0x0040, 7},{0x00AD, 8},
|
||||
{0x02B0,10},{0x1589,13},{0x1588,13},{0x001C, 5},
|
||||
{0x005F, 7},{0x0000, 3},{0x000F, 4},{0x000D, 4},
|
||||
{0x000C, 4},{0x0006, 4},{0x0011, 5},{0x002A, 6},
|
||||
{0x0057, 7},{0x005E, 7},{0x0041, 7},{0x0159, 9},
|
||||
{0x0563,11},{0x158B,13},{0x158A,13},{0x0001, 3},
|
||||
{0x0005, 4},{0x0014, 5},{0x003B, 6},{0x002E, 6},
|
||||
{0x0004, 4},{0x003A, 6},{0x0007, 4},{0x0016, 5}
|
||||
},
|
||||
{
|
||||
{0x000E, 4},{0x0007, 5},{0x0046, 7},{0x0045, 7},
|
||||
{0x0064, 9},{0x032A,12},{0x0657,13},{0x0018, 5},
|
||||
{0x000D, 6},{0x0000, 3},{0x000F, 4},{0x000A, 4},
|
||||
{0x000B, 4},{0x001A, 5},{0x0036, 6},{0x0047, 7},
|
||||
{0x0044, 7},{0x0018, 7},{0x0033, 8},{0x00CB,10},
|
||||
{0x0656,13},{0x0329,12},{0x0328,12},{0x0002, 3},
|
||||
{0x0006, 4},{0x0019, 5},{0x000E, 5},{0x0037, 6},
|
||||
{0x0009, 4},{0x000F, 5},{0x0002, 4},{0x0010, 5}
|
||||
},
|
||||
{
|
||||
{0x0003, 3},{0x0018, 5},{0x0023, 6},{0x0077, 7},
|
||||
{0x0194, 9},{0x1956,13},{0x32AF,14},{0x003A, 6},
|
||||
{0x0076, 7},{0x0002, 3},{0x0001, 3},{0x001F, 5},
|
||||
{0x001E, 5},{0x0014, 5},{0x0022, 6},{0x0064, 7},
|
||||
{0x0197, 9},{0x0196, 9},{0x032B,10},{0x0654,11},
|
||||
{0x32AE,14},{0x1955,13},{0x1954,13},{0x0000, 3},
|
||||
{0x0009, 4},{0x001C, 5},{0x0015, 5},{0x0010, 5},
|
||||
{0x000D, 4},{0x0017, 5},{0x0016, 5},{0x0033, 6}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x0006, 4},{0x003E, 6},{0x0010, 5},
|
||||
{0x0048, 7},{0x093F,12},{0x24FA,14},{0x0032, 6},
|
||||
{0x0067, 7},{0x0002, 3},{0x0001, 3},{0x001B, 5},
|
||||
{0x001E, 5},{0x0034, 6},{0x0066, 7},{0x0092, 8},
|
||||
{0x0126, 9},{0x024E,10},{0x049E,11},{0x49F7,15},
|
||||
{0x49F6,15},{0x24F9,14},{0x24F8,14},{0x0000, 3},
|
||||
{0x0007, 4},{0x0018, 5},{0x0011, 5},{0x003F, 6},
|
||||
{0x000E, 4},{0x0013, 5},{0x0035, 6},{0x0025, 6}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x0008, 4},{0x0012, 5},{0x001C, 5},
|
||||
{0x001C, 6},{0x00EA, 9},{0x1D75,14},{0x001E, 6},
|
||||
{0x0066, 7},{0x0001, 3},{0x0002, 3},{0x001B, 5},
|
||||
{0x001A, 5},{0x001F, 6},{0x003B, 7},{0x0074, 8},
|
||||
{0x01D6,10},{0x03AF,11},{0x1D74,14},{0x1D77,14},
|
||||
{0x1D76,14},{0x0EB9,13},{0x0EB8,13},{0x000F, 4},
|
||||
{0x0006, 4},{0x0013, 5},{0x003B, 6},{0x003A, 6},
|
||||
{0x0000, 3},{0x0018, 5},{0x0032, 6},{0x0067, 7}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x000A, 4},{0x001B, 5},{0x000C, 4},
|
||||
{0x000D, 5},{0x00E6, 8},{0x0684,11},{0x0072, 7},
|
||||
{0x00E7, 8},{0x0002, 3},{0x0001, 3},{0x0017, 5},
|
||||
{0x0016, 5},{0x0018, 6},{0x00D1, 8},{0x01A0, 9},
|
||||
{0x0686,11},{0x0D0F,12},{0x0D0A,12},{0x1A17,13},
|
||||
{0x1A16,13},{0x1A1D,13},{0x1A1C,13},{0x000F, 4},
|
||||
{0x001D, 5},{0x000E, 5},{0x0035, 6},{0x0038, 6},
|
||||
{0x0000, 3},{0x000F, 5},{0x0019, 6},{0x0069, 7}
|
||||
},
|
||||
{
|
||||
{0x0003, 3},{0x000C, 4},{0x001B, 5},{0x0000, 3},
|
||||
{0x0003, 4},{0x002E, 6},{0x0051, 9},{0x00BC, 8},
|
||||
{0x0053, 9},{0x0004, 3},{0x0002, 3},{0x0016, 5},
|
||||
{0x0015, 5},{0x0015, 7},{0x0050, 9},{0x00A4,10},
|
||||
{0x0294,12},{0x052B,13},{0x052A,13},{0x052D,13},
|
||||
{0x052C,13},{0x052F,13},{0x052E,13},{0x000E, 4},
|
||||
{0x001A, 5},{0x0004, 5},{0x0028, 6},{0x0029, 6},
|
||||
{0x000F, 4},{0x000B, 6},{0x005F, 7},{0x00BD, 8}
|
||||
},
|
||||
{
|
||||
{0x0003, 4},{0x0009, 6},{0x00D0, 8},{0x01A3, 9},
|
||||
{0x0344,10},{0x0D14,12},{0x1A2B,13},{0x0004, 4},
|
||||
{0x0015, 7},{0x0000, 3},{0x000F, 4},{0x000B, 4},
|
||||
{0x000C, 4},{0x000E, 4},{0x0009, 4},{0x001B, 5},
|
||||
{0x000A, 5},{0x0014, 5},{0x000D, 5},{0x002A, 6},
|
||||
{0x0014, 7},{0x068B,11},{0x1A2A,13},{0x0008, 4},
|
||||
{0x000B, 5},{0x002B, 6},{0x000B, 6},{0x0069, 7},
|
||||
{0x0035, 6},{0x0008, 6},{0x0007, 4},{0x000C, 5}
|
||||
},
|
||||
{
|
||||
{0x000A, 4},{0x003C, 6},{0x0032, 7},{0x0030, 7},
|
||||
{0x00C5, 9},{0x0621,12},{0x0620,12},{0x001F, 5},
|
||||
{0x0033, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
|
||||
{0x000D, 4},{0x000C, 4},{0x0004, 4},{0x000D, 5},
|
||||
{0x0026, 6},{0x0027, 6},{0x0014, 6},{0x0063, 8},
|
||||
{0x0189,10},{0x0623,12},{0x0622,12},{0x000B, 4},
|
||||
{0x0012, 5},{0x003D, 6},{0x0022, 6},{0x0015, 6},
|
||||
{0x000B, 5},{0x0023, 6},{0x0007, 4},{0x0010, 5}
|
||||
},
|
||||
{
|
||||
{0x000F, 4},{0x000C, 5},{0x0043, 7},{0x0010, 6},
|
||||
{0x0044, 8},{0x0114,10},{0x0455,12},{0x0018, 5},
|
||||
{0x0023, 7},{0x0001, 3},{0x0000, 3},{0x000E, 4},
|
||||
{0x000D, 4},{0x0009, 4},{0x0019, 5},{0x0009, 5},
|
||||
{0x0017, 6},{0x0016, 6},{0x0042, 7},{0x008B, 9},
|
||||
{0x0454,12},{0x0457,12},{0x0456,12},{0x000B, 4},
|
||||
{0x0015, 5},{0x000A, 5},{0x0029, 6},{0x0020, 6},
|
||||
{0x000D, 5},{0x0028, 6},{0x0007, 4},{0x0011, 5}
|
||||
},
|
||||
{
|
||||
{0x0001, 3},{0x001A, 5},{0x0029, 6},{0x002A, 6},
|
||||
{0x00A0, 8},{0x0285,10},{0x1425,13},{0x0002, 5},
|
||||
{0x0000, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
|
||||
{0x000B, 4},{0x0008, 4},{0x0012, 5},{0x0001, 6},
|
||||
{0x0051, 7},{0x0001, 7},{0x0143, 9},{0x0508,11},
|
||||
{0x1424,13},{0x1427,13},{0x1426,13},{0x000F, 4},
|
||||
{0x001C, 5},{0x0003, 5},{0x0037, 6},{0x002B, 6},
|
||||
{0x0013, 5},{0x0036, 6},{0x001D, 5},{0x0001, 5}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x001F, 5},{0x003D, 6},{0x0006, 5},
|
||||
{0x0016, 7},{0x0053, 9},{0x014A,11},{0x0034, 6},
|
||||
{0x002A, 8},{0x0002, 3},{0x0003, 3},{0x000B, 4},
|
||||
{0x000C, 4},{0x001C, 5},{0x0037, 6},{0x0017, 7},
|
||||
{0x002B, 8},{0x0028, 8},{0x00A4,10},{0x052D,13},
|
||||
{0x052C,13},{0x052F,13},{0x052E,13},{0x0000, 3},
|
||||
{0x001D, 5},{0x0007, 5},{0x0004, 5},{0x0035, 6},
|
||||
{0x0014, 5},{0x0036, 6},{0x0015, 5},{0x003C, 6}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x000A, 4},{0x0007, 5},{0x001D, 5},
|
||||
{0x0009, 6},{0x01F3, 9},{0x07C7,11},{0x0008, 6},
|
||||
{0x01F0, 9},{0x0003, 3},{0x0002, 3},{0x000D, 4},
|
||||
{0x000C, 4},{0x0017, 5},{0x007D, 7},{0x01F2, 9},
|
||||
{0x07C6,11},{0x07C5,11},{0x1F12,13},{0x3E27,14},
|
||||
{0x3E26,14},{0x1F11,13},{0x1F10,13},{0x0000, 3},
|
||||
{0x001E, 5},{0x0006, 5},{0x0039, 6},{0x0038, 6},
|
||||
{0x003F, 6},{0x002C, 6},{0x0005, 5},{0x002D, 6}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x0007, 4},{0x0018, 5},{0x0003, 4},
|
||||
{0x0005, 5},{0x0035, 7},{0x004F, 9},{0x0012, 7},
|
||||
{0x04E5,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
|
||||
{0x000E, 4},{0x0033, 6},{0x0026, 8},{0x009D,10},
|
||||
{0x04E4,13},{0x04E7,13},{0x04E6,13},{0x04E1,13},
|
||||
{0x04E0,13},{0x04E3,13},{0x04E2,13},{0x0000, 3},
|
||||
{0x001F, 5},{0x000C, 5},{0x003D, 6},{0x003C, 6},
|
||||
{0x0032, 6},{0x0034, 7},{0x001B, 6},{0x0008, 6}
|
||||
},
|
||||
{
|
||||
{0x0000, 3},{0x0004, 4},{0x001C, 5},{0x000F, 4},
|
||||
{0x0002, 4},{0x0007, 5},{0x0075, 7},{0x00E8, 8},
|
||||
{0x1D2A,13},{0x0005, 3},{0x0004, 3},{0x000D, 4},
|
||||
{0x000C, 4},{0x0077, 7},{0x0E96,12},{0x3A57,14},
|
||||
{0x3A56,14},{0x3A5D,14},{0x3A5C,14},{0x3A5F,14},
|
||||
{0x3A5E,14},{0x1D29,13},{0x1D28,13},{0x0003, 3},
|
||||
{0x0006, 5},{0x000A, 5},{0x002C, 7},{0x0017, 6},
|
||||
{0x0076, 7},{0x01D3, 9},{0x03A4,10},{0x002D, 7}
|
||||
},
|
||||
{
|
||||
{0x000A, 4},{0x0024, 6},{0x00BF, 8},{0x0085, 8},
|
||||
{0x0211,10},{0x0842,12},{0x1087,13},{0x0018, 5},
|
||||
{0x0020, 6},{0x0001, 3},{0x0002, 3},{0x000E, 4},
|
||||
{0x000D, 4},{0x0007, 4},{0x0013, 5},{0x0025, 6},
|
||||
{0x005E, 7},{0x0043, 7},{0x00BE, 8},{0x0109, 9},
|
||||
{0x1086,13},{0x0841,12},{0x0840,12},{0x000F, 4},
|
||||
{0x0001, 4},{0x0011, 5},{0x0000, 5},{0x002E, 6},
|
||||
{0x0019, 5},{0x0001, 5},{0x0006, 4},{0x0016, 5}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x000F, 5},{0x006F, 7},{0x0061, 7},
|
||||
{0x0374,10},{0x1BA8,13},{0x3753,14},{0x0012, 5},
|
||||
{0x0036, 6},{0x0000, 3},{0x0001, 3},{0x000A, 4},
|
||||
{0x000B, 4},{0x001A, 5},{0x0031, 6},{0x0060, 7},
|
||||
{0x00DC, 8},{0x01BB, 9},{0x06EB,11},{0x1BAB,13},
|
||||
{0x3752,14},{0x3755,14},{0x3754,14},{0x000E, 4},
|
||||
{0x0006, 4},{0x0013, 5},{0x000E, 5},{0x003E, 6},
|
||||
{0x0008, 4},{0x001E, 5},{0x0019, 5},{0x003F, 6}
|
||||
},
|
||||
{
|
||||
{0x0003, 3},{0x001C, 5},{0x0025, 6},{0x0024, 6},
|
||||
{0x01DA, 9},{0x1DBD,13},{0x3B7C,14},{0x003C, 6},
|
||||
{0x003D, 6},{0x0000, 3},{0x0001, 3},{0x000B, 4},
|
||||
{0x000A, 4},{0x000B, 5},{0x0077, 7},{0x00EC, 8},
|
||||
{0x03B6,10},{0x076E,11},{0x1DBF,13},{0x76FB,15},
|
||||
{0x76FA,15},{0x3B79,14},{0x3B78,14},{0x000D, 4},
|
||||
{0x001F, 5},{0x0013, 5},{0x000A, 5},{0x0008, 5},
|
||||
{0x000C, 4},{0x0008, 4},{0x0009, 5},{0x003A, 6}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x0003, 4},{0x0004, 5},{0x0010, 5},
|
||||
{0x008F, 8},{0x0475,11},{0x11D1,13},{0x0079, 7},
|
||||
{0x0027, 6},{0x0002, 3},{0x0003, 3},{0x0001, 4},
|
||||
{0x0000, 4},{0x0026, 6},{0x0046, 7},{0x011C, 9},
|
||||
{0x0477,11},{0x08ED,12},{0x11D0,13},{0x11D3,13},
|
||||
{0x11D2,13},{0x11D9,13},{0x11D8,13},{0x000D, 4},
|
||||
{0x001F, 5},{0x0012, 5},{0x0005, 5},{0x003D, 6},
|
||||
{0x000C, 4},{0x000E, 4},{0x0022, 6},{0x0078, 7}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x000C, 4},{0x001B, 5},{0x0000, 4},
|
||||
{0x0006, 6},{0x03E2,10},{0x3E3D,14},{0x000F, 7},
|
||||
{0x0034, 6},{0x0003, 3},{0x0002, 3},{0x001E, 5},
|
||||
{0x001D, 5},{0x007D, 7},{0x01F0, 9},{0x07C6,11},
|
||||
{0x3E3C,14},{0x3E3F,14},{0x3E3E,14},{0x3E39,14},
|
||||
{0x3E38,14},{0x3E3B,14},{0x3E3A,14},{0x0008, 4},
|
||||
{0x001C, 5},{0x0002, 5},{0x003F, 6},{0x0035, 6},
|
||||
{0x0009, 4},{0x0001, 3},{0x000E, 7},{0x00F9, 8}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x000B, 4},{0x0001, 4},{0x000A, 4},
|
||||
{0x001E, 6},{0x00E0, 9},{0x0E1E,13},{0x0071, 8},
|
||||
{0x0039, 7},{0x0007, 3},{0x0006, 3},{0x000D, 5},
|
||||
{0x000C, 5},{0x0020, 7},{0x01C2,10},{0x1C3F,14},
|
||||
{0x1C3E,14},{0x0E19,13},{0x0E18,13},{0x0E1B,13},
|
||||
{0x0E1A,13},{0x0E1D,13},{0x0E1C,13},{0x0000, 4},
|
||||
{0x0009, 5},{0x001D, 6},{0x001F, 6},{0x0011, 6},
|
||||
{0x0005, 4},{0x0001, 3},{0x0043, 8},{0x0042, 8}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x000D, 4},{0x0007, 4},{0x0002, 3},
|
||||
{0x0014, 5},{0x016C, 9},{0x16D1,13},{0x02DF,10},
|
||||
{0x016E, 9},{0x0000, 2},{0x0007, 3},{0x002C, 6},
|
||||
{0x002B, 6},{0x02DE,10},{0x16D0,13},{0x16D3,13},
|
||||
{0x16D2,13},{0x2DB5,14},{0x2DB4,14},{0x2DB7,14},
|
||||
{0x2DB6,14},{0x16D9,13},{0x16D8,13},{0x000C, 5},
|
||||
{0x002A, 6},{0x005A, 7},{0x001B, 6},{0x001A, 6},
|
||||
{0x0017, 5},{0x000C, 4},{0x05B7,11},{0x05B5,11}
|
||||
},
|
||||
{
|
||||
{0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
|
||||
{0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
|
||||
{0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
|
||||
{0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
|
||||
{0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
|
||||
{0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
|
||||
{0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
|
||||
{0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
|
||||
},
|
||||
{
|
||||
{0x0000, 3},{0x0010, 5},{0x0072, 7},{0x0071, 7},
|
||||
{0x0154, 9},{0x0AAB,12},{0x0AA8,12},{0x0014, 5},
|
||||
{0x0070, 7},{0x0002, 3},{0x0003, 3},{0x000C, 4},
|
||||
{0x000B, 4},{0x0003, 4},{0x0011, 5},{0x0073, 7},
|
||||
{0x0054, 7},{0x00AB, 8},{0x02AB,10},{0x1553,13},
|
||||
{0x1552,13},{0x1555,13},{0x1554,13},{0x000D, 4},
|
||||
{0x001E, 5},{0x0012, 5},{0x003E, 6},{0x002B, 6},
|
||||
{0x0002, 4},{0x003F, 6},{0x001D, 5},{0x0013, 5}
|
||||
},
|
||||
{
|
||||
{0x0003, 3},{0x001F, 5},{0x0029, 6},{0x003D, 6},
|
||||
{0x000C, 7},{0x0069,10},{0x0345,13},{0x0002, 5},
|
||||
{0x0028, 6},{0x0002, 3},{0x0001, 3},{0x000E, 4},
|
||||
{0x000C, 4},{0x0015, 5},{0x0007, 6},{0x001B, 8},
|
||||
{0x006B,10},{0x006A,10},{0x0344,13},{0x0347,13},
|
||||
{0x0346,13},{0x01A1,12},{0x01A0,12},{0x000B, 4},
|
||||
{0x001A, 5},{0x0012, 5},{0x0000, 5},{0x003C, 6},
|
||||
{0x0008, 4},{0x001B, 5},{0x0013, 5},{0x0001, 5}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x0004, 4},{0x003F, 6},{0x0014, 5},
|
||||
{0x0056, 7},{0x015C, 9},{0x15D5,13},{0x003C, 6},
|
||||
{0x002A, 6},{0x0000, 3},{0x0001, 3},{0x000E, 4},
|
||||
{0x000D, 4},{0x000C, 5},{0x00AF, 8},{0x02BB,10},
|
||||
{0x15D4,13},{0x15D7,13},{0x15D6,13},{0x15D1,13},
|
||||
{0x15D0,13},{0x15D3,13},{0x15D2,13},{0x000B, 4},
|
||||
{0x0019, 5},{0x000D, 5},{0x003E, 6},{0x0031, 6},
|
||||
{0x0007, 4},{0x0005, 4},{0x003D, 6},{0x0030, 6}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x0008, 4},{0x001A, 5},{0x0000, 4},
|
||||
{0x0036, 6},{0x0011, 8},{0x0106,12},{0x000A, 7},
|
||||
{0x006E, 7},{0x0002, 3},{0x0003, 3},{0x0003, 4},
|
||||
{0x0002, 4},{0x006F, 7},{0x0021, 9},{0x020F,13},
|
||||
{0x020E,13},{0x0101,12},{0x0100,12},{0x0103,12},
|
||||
{0x0102,12},{0x0105,12},{0x0104,12},{0x000C, 4},
|
||||
{0x001E, 5},{0x0003, 5},{0x003E, 6},{0x003F, 6},
|
||||
{0x0009, 4},{0x000E, 4},{0x000B, 7},{0x0009, 7}
|
||||
},
|
||||
{
|
||||
{0x0002, 3},{0x000E, 4},{0x001E, 5},{0x000C, 4},
|
||||
{0x001F, 5},{0x006E, 7},{0x00AD,10},{0x00AF,10},
|
||||
{0x0014, 7},{0x0004, 3},{0x0003, 3},{0x001A, 5},
|
||||
{0x0017, 5},{0x002A, 8},{0x0576,13},{0x0AEF,14},
|
||||
{0x0AEE,14},{0x0571,13},{0x0570,13},{0x0573,13},
|
||||
{0x0572,13},{0x0575,13},{0x0574,13},{0x0003, 4},
|
||||
{0x0016, 5},{0x0004, 5},{0x0036, 6},{0x000B, 6},
|
||||
{0x000A, 4},{0x0000, 3},{0x006F, 7},{0x00AC,10}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
|
||||
{0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
|
||||
{0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
|
||||
{0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
|
||||
{0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
|
||||
{0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
|
||||
{0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
|
||||
{0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
|
||||
{0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
|
||||
{0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
|
||||
{0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
|
||||
{0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
|
||||
{0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
|
||||
{0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
|
||||
{0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x0005, 4},{0x0003, 3},{0x0001, 3},
|
||||
{0x0004, 4},{0x002F, 6},{0x0526,11},{0x1495,13},
|
||||
{0x00A6, 8},{0x0007, 3},{0x0006, 3},{0x002D, 6},
|
||||
{0x002C, 6},{0x1494,13},{0x1497,13},{0x1496,13},
|
||||
{0x1491,13},{0x1490,13},{0x1493,13},{0x1492,13},
|
||||
{0x293D,14},{0x293C,14},{0x293F,14},{0x0000, 3},
|
||||
{0x0028, 6},{0x00A5, 8},{0x0148, 9},{0x00A7, 8},
|
||||
{0x002E, 6},{0x0015, 5},{0x0A4E,12},{0x293E,14}
|
||||
},
|
||||
{
|
||||
{0x0003, 3},{0x0011, 5},{0x0020, 6},{0x0074, 7},
|
||||
{0x010D, 9},{0x0863,12},{0x0860,12},{0x000A, 5},
|
||||
{0x0075, 7},{0x0001, 3},{0x0000, 3},{0x000B, 4},
|
||||
{0x000A, 4},{0x0018, 5},{0x0038, 6},{0x0042, 7},
|
||||
{0x010F, 9},{0x010E, 9},{0x0219,10},{0x10C3,13},
|
||||
{0x10C2,13},{0x10C5,13},{0x10C4,13},{0x000F, 4},
|
||||
{0x0004, 4},{0x0019, 5},{0x000B, 5},{0x0039, 6},
|
||||
{0x0009, 4},{0x001B, 5},{0x001A, 5},{0x003B, 6}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x0001, 4},{0x003E, 6},{0x0001, 5},
|
||||
{0x00E2, 8},{0x1C6F,13},{0x38D9,14},{0x0039, 6},
|
||||
{0x001F, 6},{0x0002, 3},{0x0001, 3},{0x0009, 4},
|
||||
{0x0008, 4},{0x0000, 5},{0x0070, 7},{0x01C7, 9},
|
||||
{0x038C,10},{0x071A,11},{0x38D8,14},{0x38DB,14},
|
||||
{0x38DA,14},{0x38DD,14},{0x38DC,14},{0x000D, 4},
|
||||
{0x001D, 5},{0x000E, 5},{0x003F, 6},{0x003C, 6},
|
||||
{0x000C, 4},{0x0006, 4},{0x003D, 6},{0x001E, 6}
|
||||
},
|
||||
{
|
||||
{0x0006, 3},{0x000B, 4},{0x0011, 5},{0x001E, 5},
|
||||
{0x0074, 7},{0x03AA,10},{0x1D5C,13},{0x0001, 6},
|
||||
{0x0021, 6},{0x0001, 3},{0x0002, 3},{0x0007, 4},
|
||||
{0x0006, 4},{0x003E, 6},{0x00EB, 8},{0x01D4, 9},
|
||||
{0x0EAF,12},{0x3ABB,14},{0x3ABA,14},{0x1D59,13},
|
||||
{0x1D58,13},{0x1D5B,13},{0x1D5A,13},{0x000A, 4},
|
||||
{0x001C, 5},{0x0001, 5},{0x003F, 6},{0x003B, 6},
|
||||
{0x0001, 4},{0x0009, 4},{0x0020, 6},{0x0000, 6}
|
||||
},
|
||||
{
|
||||
{0x0004, 3},{0x000A, 4},{0x0017, 5},{0x0004, 4},
|
||||
{0x0016, 6},{0x016A, 9},{0x16B1,13},{0x0017, 7},
|
||||
{0x005B, 7},{0x0006, 3},{0x0007, 3},{0x0001, 4},
|
||||
{0x0000, 4},{0x000A, 6},{0x02D7,10},{0x0B5A,12},
|
||||
{0x16B0,13},{0x16B3,13},{0x16B2,13},{0x2D6D,14},
|
||||
{0x2D6C,14},{0x2D6F,14},{0x2D6E,14},{0x0006, 4},
|
||||
{0x000A, 5},{0x0004, 5},{0x002C, 6},{0x0017, 6},
|
||||
{0x0003, 4},{0x0007, 4},{0x0016, 7},{0x00B4, 8}
|
||||
},
|
||||
{
|
||||
{0x0005, 3},{0x000D, 4},{0x0005, 4},{0x0009, 4},
|
||||
{0x0033, 6},{0x0193, 9},{0x192C,13},{0x0061, 8},
|
||||
{0x0031, 7},{0x0000, 2},{0x0007, 3},{0x0010, 5},
|
||||
{0x0011, 5},{0x00C8, 8},{0x192F,13},{0x325B,14},
|
||||
{0x325A,14},{0x1929,13},{0x1928,13},{0x192B,13},
|
||||
{0x192A,13},{0x325D,14},{0x325C,14},{0x0018, 5},
|
||||
{0x001A, 6},{0x001B, 6},{0x0065, 7},{0x0019, 6},
|
||||
{0x0004, 4},{0x0007, 4},{0x0060, 8},{0x0324,10}
|
||||
},
|
||||
{
|
||||
{0x0006, 3},{0x0000, 3},{0x0002, 4},{0x000F, 4},
|
||||
{0x0039, 6},{0x01D9, 9},{0x1D82,13},{0x0761,11},
|
||||
{0x03BE,10},{0x0001, 2},{0x0002, 2},{0x000F, 6},
|
||||
{0x000E, 6},{0x0762,11},{0x3B07,14},{0x3B06,14},
|
||||
{0x3B1D,14},{0x3B1C,14},{0x3B1F,14},{0x3B1E,14},
|
||||
{0x3B19,14},{0x3B18,14},{0x3B1B,14},{0x0038, 6},
|
||||
{0x01DE, 9},{0x00ED, 8},{0x03BF,10},{0x00EE, 8},
|
||||
{0x003A, 6},{0x0006, 5},{0x0EC0,12},{0x3B1A,14}
|
||||
},
|
||||
{
|
||||
{0x0000, 2},{0x0002, 3},{0x000F, 5},{0x0006, 4},
|
||||
{0x001C, 6},{0x01D0,10},{0x0E8C,13},{0x1D1B,14},
|
||||
{0x1D1A,14},{0x0003, 2},{0x0002, 2},{0x00EA, 9},
|
||||
{0x00E9, 9},{0x0E89,13},{0x0E88,13},{0x0E8B,13},
|
||||
{0x0E8A,13},{0x1D65,14},{0x1D64,14},{0x1D67,14},
|
||||
{0x1D66,14},{0x1D61,14},{0x1D60,14},{0x03AD,11},
|
||||
{0x1D63,14},{0x1D62,14},{0x1D1D,14},{0x1D1C,14},
|
||||
{0x003B, 7},{0x01D7,10},{0x1D1F,14},{0x1D1E,14}
|
||||
},
|
||||
{
|
||||
{0x0002, 2},{0x000F, 4},{0x001C, 5},{0x000C, 4},
|
||||
{0x003B, 6},{0x01AC, 9},{0x1AD8,13},{0x35B3,14},
|
||||
{0x35B2,14},{0x0001, 2},{0x0000, 2},{0x0069, 7},
|
||||
{0x0068, 7},{0x35BD,14},{0x35BC,14},{0x35BF,14},
|
||||
{0x35BE,14},{0x35B9,14},{0x35B8,14},{0x35BB,14},
|
||||
{0x35BA,14},{0x35B5,14},{0x35B4,14},{0x01A9, 9},
|
||||
{0x01A8, 9},{0x035A,10},{0x00D7, 8},{0x00D5, 8},
|
||||
{0x003A, 6},{0x001B, 5},{0x35B7,14},{0x35B6,14}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*A description of a Huffman code value used when encoding the tree.*/
|
||||
typedef struct{
|
||||
/*The bit pattern, left-shifted so that the MSB of all patterns is
|
||||
aligned.*/
|
||||
ogg_uint32_t pattern;
|
||||
/*The amount the bit pattern was shifted.*/
|
||||
int shift;
|
||||
/*The token this bit pattern represents.*/
|
||||
int token;
|
||||
}oc_huff_entry;
|
||||
|
||||
|
||||
|
||||
/*Compares two oc_huff_entry structures by their bit patterns.
|
||||
_c1: The first entry to compare.
|
||||
_c2: The second entry to compare.
|
||||
Return: <0 if _c1<_c2, >0 if _c1>_c2.*/
|
||||
static int huff_entry_cmp(const void *_c1,const void *_c2){
|
||||
ogg_uint32_t b1;
|
||||
ogg_uint32_t b2;
|
||||
b1=((const oc_huff_entry *)_c1)->pattern;
|
||||
b2=((const oc_huff_entry *)_c2)->pattern;
|
||||
return b1<b2?-1:b1>b2?1:0;
|
||||
}
|
||||
|
||||
/*Encodes a description of the given Huffman tables.
|
||||
Although the codes are stored in the encoder as flat arrays, in the bit
|
||||
stream and in the decoder they are structured as a tree.
|
||||
This function recovers the tree structure from the flat array and then
|
||||
writes it out.
|
||||
Note that the codes MUST form a Huffman code, and not merely a prefix-free
|
||||
code, since the binary tree is assumed to be full.
|
||||
_opb: The buffer to store the tree in.
|
||||
_codes: The Huffman tables to pack.
|
||||
Return: 0 on success, or a negative value if one of the given Huffman tables
|
||||
does not form a full, prefix-free code.*/
|
||||
int oc_huff_codes_pack(oggpack_buffer *_opb,
|
||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
|
||||
int i;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||
oc_huff_entry entries[TH_NDCT_TOKENS];
|
||||
int bpos;
|
||||
int maxlen;
|
||||
int mask;
|
||||
int j;
|
||||
/*First, find the maximum code length so we can align all the bit
|
||||
patterns.*/
|
||||
maxlen=_codes[i][0].nbits;
|
||||
for(j=1;j<TH_NDCT_TOKENS;j++)maxlen=OC_MAXI(_codes[i][j].nbits,maxlen);
|
||||
/*It's improbable that a code with more than 32 bits could pass the
|
||||
validation below, but abort early in any case.*/
|
||||
if(maxlen>32)return TH_EINVAL;
|
||||
mask=(1<<(maxlen>>1)<<(maxlen+1>>1))-1;
|
||||
/*Copy over the codes into our temporary workspace.
|
||||
The bit patterns are aligned, and the original entry each code is from
|
||||
is stored as well.*/
|
||||
for(j=0;j<TH_NDCT_TOKENS;j++){
|
||||
entries[j].shift=maxlen-_codes[i][j].nbits;
|
||||
entries[j].pattern=_codes[i][j].pattern<<entries[j].shift&mask;
|
||||
entries[j].token=j;
|
||||
}
|
||||
/*Sort the codes into ascending order.
|
||||
This is the order the leaves of the tree will be traversed.*/
|
||||
qsort(entries,TH_NDCT_TOKENS,sizeof(entries[0]),huff_entry_cmp);
|
||||
/*For each leaf of the tree:*/
|
||||
bpos=maxlen;
|
||||
for(j=0;j<TH_NDCT_TOKENS;j++){
|
||||
ogg_uint32_t bit;
|
||||
/*Fail if this code has no bits at all.
|
||||
Technically a codebook with a single 0-bit entry is legal, but the
|
||||
encoder currently does not support codebooks which do not contain all
|
||||
the tokens.*/
|
||||
if(entries[j].shift>=maxlen)return TH_EINVAL;
|
||||
/*Descend into the tree, writing a bit for each branch.*/
|
||||
for(;bpos>entries[j].shift;bpos--)oggpackB_write(_opb,0,1);
|
||||
/*Mark this as a leaf node, and write its value.*/
|
||||
oggpackB_write(_opb,1,1);
|
||||
oggpackB_write(_opb,entries[j].token,5);
|
||||
/*For each 1 branch we've descended, back up the tree until we reach a
|
||||
0 branch.*/
|
||||
bit=(ogg_uint32_t)1<<bpos;
|
||||
for(;entries[j].pattern&bit;bpos++)bit<<=1;
|
||||
/*Validate the code.*/
|
||||
if(j+1<TH_NDCT_TOKENS){
|
||||
mask=~(bit-1)<<1;
|
||||
/*The next entry should have a 1 bit where we had a 0, and should
|
||||
match our code above that bit.
|
||||
This verifies both fullness and prefix-freeness simultaneously.*/
|
||||
if(!(entries[j+1].pattern&bit)||
|
||||
(entries[j].pattern&mask)!=(entries[j+1].pattern&mask)){
|
||||
return TH_EINVAL;
|
||||
}
|
||||
}
|
||||
/*If there are no more codes, we should have ascended back to the top
|
||||
of the tree.*/
|
||||
else if(bpos<maxlen)return TH_EINVAL;
|
||||
}
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*This is used to copy the configuration of an existing setup header for use by
|
||||
the encoder.
|
||||
The decoder uses a completely different data structure for the Huffman
|
||||
codebooks.*/
|
||||
int oc_huff_codes_unpack(oc_pack_buf *_opb,
|
||||
th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]){
|
||||
int i;
|
||||
for(i=0;i<TH_NHUFFMAN_TABLES;i++){
|
||||
ogg_uint32_t code;
|
||||
int len;
|
||||
int nleaves;
|
||||
code=0;
|
||||
len=nleaves=0;
|
||||
memset(_codes[i],0,TH_NDCT_TOKENS*sizeof(*_codes[i]));
|
||||
for(;;){
|
||||
long bits;
|
||||
bits=oc_pack_read1(_opb);
|
||||
/*Only process nodes so long as there's more bits in the buffer.*/
|
||||
if(oc_pack_bytes_left(_opb)<0)return TH_EBADHEADER;
|
||||
/*Read an internal node:*/
|
||||
if(!bits){
|
||||
len++;
|
||||
/*Don't allow codewords longer than 32 bits.*/
|
||||
if(len>32)return TH_EBADHEADER;
|
||||
}
|
||||
/*Read a leaf node:*/
|
||||
else{
|
||||
ogg_uint32_t code_bit;
|
||||
/*Don't allow more than 32 tokens per codebook.*/
|
||||
if(++nleaves>32)return TH_EBADHEADER;
|
||||
bits=oc_pack_read(_opb,OC_NDCT_TOKEN_BITS);
|
||||
/*The current encoder does not support codebooks that do not contain
|
||||
all of the tokens.*/
|
||||
if(_codes[i][bits].nbits>0)return TH_EINVAL;
|
||||
_codes[i][bits].pattern=code>>32-len;
|
||||
_codes[i][bits].nbits=len;
|
||||
code_bit=0x80000000U>>len-1;
|
||||
while(len>0&&(code&code_bit)){
|
||||
code^=code_bit;
|
||||
code_bit<<=1;
|
||||
len--;
|
||||
}
|
||||
if(len<=0)break;
|
||||
code|=code_bit;
|
||||
}
|
||||
}
|
||||
/*The current encoder does not support codebooks that do not contain all of
|
||||
the tokens.*/
|
||||
if(nleaves<32)return TH_EINVAL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
22
engine/thirdparty/libtheora/huffenc.h
vendored
Normal file
22
engine/thirdparty/libtheora/huffenc.h
vendored
Normal file
|
|
@ -0,0 +1,22 @@
|
|||
#if !defined(_huffenc_H)
|
||||
# define _huffenc_H (1)
|
||||
# include "huffman.h"
|
||||
# include "bitpack.h"
|
||||
|
||||
|
||||
|
||||
typedef th_huff_code th_huff_table[TH_NDCT_TOKENS];
|
||||
|
||||
|
||||
|
||||
extern const th_huff_code
|
||||
TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||
|
||||
|
||||
|
||||
int oc_huff_codes_pack(oggpack_buffer *_opb,
|
||||
const th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
|
||||
int oc_huff_codes_unpack(oc_pack_buf *_opb,
|
||||
th_huff_code _codes[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS]);
|
||||
|
||||
#endif
|
||||
70
engine/thirdparty/libtheora/huffman.h
vendored
Normal file
70
engine/thirdparty/libtheora/huffman.h
vendored
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_huffman_H)
|
||||
# define _huffman_H (1)
|
||||
# include "theora/codec.h"
|
||||
# include "ocintrin.h"
|
||||
|
||||
/*The range of valid quantized DCT coefficient values.
|
||||
VP3 used 511 in the encoder, but the bitstream is capable of 580.*/
|
||||
#define OC_DCT_VAL_RANGE (580)
|
||||
|
||||
#define OC_NDCT_TOKEN_BITS (5)
|
||||
|
||||
#define OC_DCT_EOB1_TOKEN (0)
|
||||
#define OC_DCT_EOB2_TOKEN (1)
|
||||
#define OC_DCT_EOB3_TOKEN (2)
|
||||
#define OC_DCT_REPEAT_RUN0_TOKEN (3)
|
||||
#define OC_DCT_REPEAT_RUN1_TOKEN (4)
|
||||
#define OC_DCT_REPEAT_RUN2_TOKEN (5)
|
||||
#define OC_DCT_REPEAT_RUN3_TOKEN (6)
|
||||
|
||||
#define OC_DCT_SHORT_ZRL_TOKEN (7)
|
||||
#define OC_DCT_ZRL_TOKEN (8)
|
||||
|
||||
#define OC_ONE_TOKEN (9)
|
||||
#define OC_MINUS_ONE_TOKEN (10)
|
||||
#define OC_TWO_TOKEN (11)
|
||||
#define OC_MINUS_TWO_TOKEN (12)
|
||||
|
||||
#define OC_DCT_VAL_CAT2 (13)
|
||||
#define OC_DCT_VAL_CAT3 (17)
|
||||
#define OC_DCT_VAL_CAT4 (18)
|
||||
#define OC_DCT_VAL_CAT5 (19)
|
||||
#define OC_DCT_VAL_CAT6 (20)
|
||||
#define OC_DCT_VAL_CAT7 (21)
|
||||
#define OC_DCT_VAL_CAT8 (22)
|
||||
|
||||
#define OC_DCT_RUN_CAT1A (23)
|
||||
#define OC_DCT_RUN_CAT1B (28)
|
||||
#define OC_DCT_RUN_CAT1C (29)
|
||||
#define OC_DCT_RUN_CAT2A (30)
|
||||
#define OC_DCT_RUN_CAT2B (31)
|
||||
|
||||
#define OC_NDCT_EOB_TOKEN_MAX (7)
|
||||
#define OC_NDCT_ZRL_TOKEN_MAX (9)
|
||||
#define OC_NDCT_VAL_MAX (23)
|
||||
#define OC_NDCT_VAL_CAT1_MAX (13)
|
||||
#define OC_NDCT_VAL_CAT2_MAX (17)
|
||||
#define OC_NDCT_VAL_CAT2_SIZE (OC_NDCT_VAL_CAT2_MAX-OC_DCT_VAL_CAT2)
|
||||
#define OC_NDCT_RUN_MAX (32)
|
||||
#define OC_NDCT_RUN_CAT1A_MAX (28)
|
||||
|
||||
extern const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS];
|
||||
|
||||
#endif
|
||||
330
engine/thirdparty/libtheora/idct.c
vendored
Normal file
330
engine/thirdparty/libtheora/idct.c
vendored
Normal file
|
|
@ -0,0 +1,330 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <string.h>
|
||||
#include "internal.h"
|
||||
#include "dct.h"
|
||||
|
||||
/*Performs an inverse 8 point Type-II DCT transform.
|
||||
The output is scaled by a factor of 2 relative to the orthonormal version of
|
||||
the transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed in every 8th entry (e.g., in a column of an 8x8
|
||||
block).
|
||||
_x: The input coefficients.
|
||||
The first 8 entries are used (e.g., from a row of an 8x8 block).*/
|
||||
static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){
|
||||
ogg_int32_t t[8];
|
||||
ogg_int32_t r;
|
||||
/*Stage 1:*/
|
||||
/*0-1 butterfly.*/
|
||||
t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
|
||||
t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
|
||||
/*2-3 rotation by 6pi/16.*/
|
||||
t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
|
||||
t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
|
||||
/*4-7 rotation by 7pi/16.*/
|
||||
t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16);
|
||||
/*5-6 rotation by 3pi/16.*/
|
||||
t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16);
|
||||
t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16);
|
||||
t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16);
|
||||
/*Stage 2:*/
|
||||
/*4-5 butterfly.*/
|
||||
r=t[4]+t[5];
|
||||
t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
|
||||
t[4]=r;
|
||||
/*7-6 butterfly.*/
|
||||
r=t[7]+t[6];
|
||||
t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
|
||||
t[7]=r;
|
||||
/*Stage 3:*/
|
||||
/*0-3 butterfly.*/
|
||||
r=t[0]+t[3];
|
||||
t[3]=t[0]-t[3];
|
||||
t[0]=r;
|
||||
/*1-2 butterfly.*/
|
||||
r=t[1]+t[2];
|
||||
t[2]=t[1]-t[2];
|
||||
t[1]=r;
|
||||
/*6-5 butterfly.*/
|
||||
r=t[6]+t[5];
|
||||
t[5]=t[6]-t[5];
|
||||
t[6]=r;
|
||||
/*Stage 4:*/
|
||||
/*0-7 butterfly.*/
|
||||
_y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
|
||||
/*1-6 butterfly.*/
|
||||
_y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
|
||||
/*2-5 butterfly.*/
|
||||
_y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
|
||||
/*3-4 butterfly.*/
|
||||
_y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
|
||||
_y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
|
||||
_y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
|
||||
_y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
|
||||
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8 point Type-II DCT transform.
|
||||
The output is scaled by a factor of 2 relative to the orthonormal version of
|
||||
the transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed in every 8th entry (e.g., in a column of an 8x8
|
||||
block).
|
||||
_x: The input coefficients.
|
||||
Only the first 4 entries are used.
|
||||
The other 4 are assumed to be 0.*/
|
||||
static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){
|
||||
ogg_int32_t t[8];
|
||||
ogg_int32_t r;
|
||||
/*Stage 1:*/
|
||||
t[0]=OC_C4S4*_x[0]>>16;
|
||||
t[2]=OC_C6S2*_x[2]>>16;
|
||||
t[3]=OC_C2S6*_x[2]>>16;
|
||||
t[4]=OC_C7S1*_x[1]>>16;
|
||||
t[5]=-(OC_C5S3*_x[3]>>16);
|
||||
t[6]=OC_C3S5*_x[3]>>16;
|
||||
t[7]=OC_C1S7*_x[1]>>16;
|
||||
/*Stage 2:*/
|
||||
r=t[4]+t[5];
|
||||
t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
|
||||
t[4]=r;
|
||||
r=t[7]+t[6];
|
||||
t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
|
||||
t[7]=r;
|
||||
/*Stage 3:*/
|
||||
t[1]=t[0]+t[2];
|
||||
t[2]=t[0]-t[2];
|
||||
r=t[0]+t[3];
|
||||
t[3]=t[0]-t[3];
|
||||
t[0]=r;
|
||||
r=t[6]+t[5];
|
||||
t[5]=t[6]-t[5];
|
||||
t[6]=r;
|
||||
/*Stage 4:*/
|
||||
_y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
|
||||
_y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
|
||||
_y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
|
||||
_y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
|
||||
_y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
|
||||
_y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
|
||||
_y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
|
||||
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8 point Type-II DCT transform.
|
||||
The output is scaled by a factor of 2 relative to the orthonormal version of
|
||||
the transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed in every 8th entry (e.g., in a column of an 8x8
|
||||
block).
|
||||
_x: The input coefficients.
|
||||
Only the first 3 entries are used.
|
||||
The other 5 are assumed to be 0.*/
|
||||
static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){
|
||||
ogg_int32_t t[8];
|
||||
ogg_int32_t r;
|
||||
/*Stage 1:*/
|
||||
t[0]=OC_C4S4*_x[0]>>16;
|
||||
t[2]=OC_C6S2*_x[2]>>16;
|
||||
t[3]=OC_C2S6*_x[2]>>16;
|
||||
t[4]=OC_C7S1*_x[1]>>16;
|
||||
t[7]=OC_C1S7*_x[1]>>16;
|
||||
/*Stage 2:*/
|
||||
t[5]=OC_C4S4*t[4]>>16;
|
||||
t[6]=OC_C4S4*t[7]>>16;
|
||||
/*Stage 3:*/
|
||||
t[1]=t[0]+t[2];
|
||||
t[2]=t[0]-t[2];
|
||||
r=t[0]+t[3];
|
||||
t[3]=t[0]-t[3];
|
||||
t[0]=r;
|
||||
r=t[6]+t[5];
|
||||
t[5]=t[6]-t[5];
|
||||
t[6]=r;
|
||||
/*Stage 4:*/
|
||||
_y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
|
||||
_y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
|
||||
_y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
|
||||
_y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
|
||||
_y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
|
||||
_y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
|
||||
_y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
|
||||
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8 point Type-II DCT transform.
|
||||
The output is scaled by a factor of 2 relative to the orthonormal version of
|
||||
the transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed in every 8th entry (e.g., in a column of an 8x8
|
||||
block).
|
||||
_x: The input coefficients.
|
||||
Only the first 2 entries are used.
|
||||
The other 6 are assumed to be 0.*/
|
||||
static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){
|
||||
ogg_int32_t t[8];
|
||||
ogg_int32_t r;
|
||||
/*Stage 1:*/
|
||||
t[0]=OC_C4S4*_x[0]>>16;
|
||||
t[4]=OC_C7S1*_x[1]>>16;
|
||||
t[7]=OC_C1S7*_x[1]>>16;
|
||||
/*Stage 2:*/
|
||||
t[5]=OC_C4S4*t[4]>>16;
|
||||
t[6]=OC_C4S4*t[7]>>16;
|
||||
/*Stage 3:*/
|
||||
r=t[6]+t[5];
|
||||
t[5]=t[6]-t[5];
|
||||
t[6]=r;
|
||||
/*Stage 4:*/
|
||||
_y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
|
||||
_y[1<<3]=(ogg_int16_t)(t[0]+t[6]);
|
||||
_y[2<<3]=(ogg_int16_t)(t[0]+t[5]);
|
||||
_y[3<<3]=(ogg_int16_t)(t[0]+t[4]);
|
||||
_y[4<<3]=(ogg_int16_t)(t[0]-t[4]);
|
||||
_y[5<<3]=(ogg_int16_t)(t[0]-t[5]);
|
||||
_y[6<<3]=(ogg_int16_t)(t[0]-t[6]);
|
||||
_y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8 point Type-II DCT transform.
|
||||
The output is scaled by a factor of 2 relative to the orthonormal version of
|
||||
the transform.
|
||||
_y: The buffer to store the result in.
|
||||
Data will be placed in every 8th entry (e.g., in a column of an 8x8
|
||||
block).
|
||||
_x: The input coefficients.
|
||||
Only the first entry is used.
|
||||
The other 7 are assumed to be 0.*/
|
||||
static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
|
||||
_y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]=
|
||||
_y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.
|
||||
All coefficients but the first 3 in zig-zag scan order are assumed to be 0:
|
||||
x x 0 0 0 0 0 0
|
||||
x 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients.*/
|
||||
static void oc_idct8x8_3(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Transform rows of x into columns of w.*/
|
||||
idct8_2(w,_x);
|
||||
idct8_1(w+1,_x+8);
|
||||
/*Transform rows of w into columns of y.*/
|
||||
for(i=0;i<8;i++)idct8_2(_y+i,w+i*8);
|
||||
/*Adjust for the scale factor.*/
|
||||
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||
/*Clear input data for next block.*/
|
||||
_x[0]=_x[1]=_x[8]=0;
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.
|
||||
All coefficients but the first 10 in zig-zag scan order are assumed to be 0:
|
||||
x x x x 0 0 0 0
|
||||
x x x 0 0 0 0 0
|
||||
x x 0 0 0 0 0 0
|
||||
x 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
0 0 0 0 0 0 0 0
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients.*/
|
||||
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Transform rows of x into columns of w.*/
|
||||
idct8_4(w,_x);
|
||||
idct8_3(w+1,_x+8);
|
||||
idct8_2(w+2,_x+16);
|
||||
idct8_1(w+3,_x+24);
|
||||
/*Transform rows of w into columns of y.*/
|
||||
for(i=0;i<8;i++)idct8_4(_y+i,w+i*8);
|
||||
/*Adjust for the scale factor.*/
|
||||
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||
/*Clear input data for next block.*/
|
||||
_x[0]=_x[1]=_x[2]=_x[3]=_x[8]=_x[9]=_x[10]=_x[16]=_x[17]=_x[24]=0;
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.
|
||||
_y: The buffer to store the result in.
|
||||
This may be the same as _x.
|
||||
_x: The input coefficients.*/
|
||||
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
ogg_int16_t w[64];
|
||||
int i;
|
||||
/*Transform rows of x into columns of w.*/
|
||||
for(i=0;i<8;i++)idct8(w+i,_x+i*8);
|
||||
/*Transform rows of w into columns of y.*/
|
||||
for(i=0;i<8;i++)idct8(_y+i,w+i*8);
|
||||
/*Adjust for the scale factor.*/
|
||||
for(i=0;i<64;i++)_y[i]=(ogg_int16_t)(_y[i]+8>>4);
|
||||
/*Clear input data for next block.*/
|
||||
for(i=0;i<64;i++)_x[i]=0;
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
decoded.
|
||||
In most cases this is an EOB token (the continuation of an EOB run from a
|
||||
previous block counts), and so this is the same as the coefficient count.
|
||||
However, in the case that the last token was NOT an EOB token, but filled
|
||||
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
|
||||
Provided the last token was not a pure zero run, the minimum value it can
|
||||
be is 46, and so that doesn't affect any of the cases in this routine.
|
||||
However, if the last token WAS a pure zero run of length 63, then _last_zzi
|
||||
will be 1 while the number of coefficients decoded is 64.
|
||||
Thus, we will trigger the following special case, where the real
|
||||
coefficient count would not.
|
||||
Note also that a zero run of length 64 will give _last_zzi a value of 0,
|
||||
but we still process the DC coefficient, which might have a non-zero value
|
||||
due to DC prediction.
|
||||
Although convoluted, this is arguably the correct behavior: it allows us to
|
||||
use a smaller transform when the block ends with a long zero run instead
|
||||
of a normal EOB token.
|
||||
It could be smarter... multiple separate zero runs at the end of a block
|
||||
will fool it, but an encoder that generates these really deserves what it
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Then perform the iDCT.*/
|
||||
if(_last_zzi<=3)oc_idct8x8_3(_y,_x);
|
||||
else if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
|
||||
else oc_idct8x8_slow(_y,_x);
|
||||
}
|
||||
131
engine/thirdparty/libtheora/info.c
vendored
Normal file
131
engine/thirdparty/libtheora/info.c
vendored
Normal file
|
|
@ -0,0 +1,131 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <string.h>
|
||||
#include "internal.h"
|
||||
|
||||
|
||||
|
||||
/*This is more or less the same as strncasecmp, but that doesn't exist
   everywhere, and this is a fairly trivial function, so we include it.
  Note: We take advantage of the fact that we know _n is less than or equal to
   the length of at least one of the strings.
  _s1: The "TAG=value" comment string to test.
  _s2: The tag name to look for.
  _n:  The number of characters of the tag name to compare.
  Return: 0 if the first _n characters match, ignoring case, and _s1 continues
           with '='; non-zero otherwise.*/
static int oc_tagcompare(const char *_s1,const char *_s2,int _n){
  int c;
  for(c=0;c<_n;c++){
    /*The <ctype.h> functions are only defined for EOF and values that fit in
       an unsigned char.
      Plain char may be signed, so bytes above 0x7F must be converted first.*/
    if(toupper((unsigned char)_s1[c])!=toupper((unsigned char)_s2[c])){
      return !0;
    }
  }
  /*The tag only matches if it is the whole name, i.e., '=' comes next.*/
  return _s1[c]!='=';
}
|
||||
|
||||
|
||||
|
||||
/*Puts a th_info structure into its initial state.
  Every field is zeroed, then the version fields are set to this library's
   bitstream version (TH_VERSION_MAJOR, TH_VERSION_MINOR, TH_VERSION_SUB) and
   keyframe_granule_shift is set to 6.
  _info: The structure to initialize.*/
void th_info_init(th_info *_info){
|
||||
memset(_info,0,sizeof(*_info));
|
||||
_info->version_major=TH_VERSION_MAJOR;
|
||||
_info->version_minor=TH_VERSION_MINOR;
|
||||
_info->version_subminor=TH_VERSION_SUB;
|
||||
_info->keyframe_granule_shift=6;
|
||||
}
|
||||
|
||||
/*Resets a th_info structure by overwriting all of it with zero bytes.
  Nothing is freed here.
  _info: The structure to clear.*/
void th_info_clear(th_info *_info){
|
||||
memset(_info,0,sizeof(*_info));
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*Puts a th_comment structure into its initial, empty state by overwriting all
   of it with zero bytes.
  _tc: The structure to initialize.*/
void th_comment_init(th_comment *_tc){
|
||||
memset(_tc,0,sizeof(*_tc));
|
||||
}
|
||||
|
||||
/*Appends a copy of a comment string to a comment list.
  Both arrays are grown to hold one more entry plus a trailing slot, and the
   user_comments array is kept NULL-terminated.
  If any allocation fails the function returns without adding the comment;
   arrays that were already grown stay grown.
  _tc:      The comment list to add to.
  _comment: The NUL-terminated string to copy, usually of the form
             "TAG=value".*/
void th_comment_add(th_comment *_tc,const char *_comment){
  char **strings;
  int   *lengths;
  char  *copy;
  int    len;
  int    slot;
  slot=_tc->comments;
  /*Make room for the new entry and the terminating NULL.*/
  strings=_ogg_realloc(_tc->user_comments,
   (slot+2)*sizeof(*_tc->user_comments));
  if(strings==NULL)return;
  _tc->user_comments=strings;
  lengths=_ogg_realloc(_tc->comment_lengths,
   (slot+2)*sizeof(*_tc->comment_lengths));
  if(lengths==NULL)return;
  _tc->comment_lengths=lengths;
  len=strlen(_comment);
  lengths[slot]=len;
  /*Copy the string, including its terminating NUL.*/
  copy=_ogg_malloc(len+1);
  strings[slot]=copy;
  if(copy==NULL)return;
  memcpy(copy,_comment,len+1);
  /*Only now does the new entry count as part of the list.*/
  _tc->comments=slot+1;
  _tc->user_comments[slot+1]=NULL;
}
|
||||
|
||||
/*Adds a comment of the form "TAG=value" to a comment list.
  A temporary "TAG=value" string is built, handed to th_comment_add() (which
   stores its own copy), and then freed.
  If the combined length cannot be represented or the temporary allocation
   fails, the function returns without adding anything.
  _tc:  The comment list to add to.
  _tag: The NUL-terminated tag name.
  _val: The NUL-terminated value.*/
void th_comment_add_tag(th_comment *_tc,const char *_tag,const char *_val){
  char   *comment;
  size_t  tag_len;
  size_t  val_len;
  /*Keep the lengths as size_t: narrowing them to int and adding them could
     overflow and under-allocate the buffer filled in below.*/
  tag_len=strlen(_tag);
  val_len=strlen(_val);
  /*+2 for '=' and '\0'.*/
  if(tag_len>~(size_t)0-2||val_len>~(size_t)0-2-tag_len)return;
  comment=_ogg_malloc(tag_len+val_len+2);
  if(comment==NULL)return;
  memcpy(comment,_tag,tag_len);
  comment[tag_len]='=';
  /*The +1 brings the value's terminating NUL along.*/
  memcpy(comment+tag_len+1,_val,val_len+1);
  th_comment_add(_tc,comment);
  _ogg_free(comment);
}
|
||||
|
||||
/*Looks up the value of the _count'th comment carrying a given tag.
  Tag names are compared without regard to case.
  _tc:    The comment list to search.
  _tag:   The NUL-terminated tag name to look for.
  _count: Which match to return, counting from 0.
  Return: A pointer to the text following "TAG=" inside the stored comment
           (not a copy), or NULL if there are not that many matches.*/
char *th_comment_query(th_comment *_tc,const char *_tag,int _count){
  long ci;
  int  nmatches;
  int  tag_len;
  tag_len=strlen(_tag);
  nmatches=0;
  for(ci=0;ci<_tc->comments;ci++){
    char *entry;
    entry=_tc->user_comments[ci];
    if(oc_tagcompare(entry,_tag,tag_len))continue;
    /*We return a pointer to the data, not a copy.*/
    if(nmatches==_count)return entry+tag_len+1;
    nmatches++;
  }
  /*Didn't find anything.*/
  return NULL;
}
|
||||
|
||||
/*Counts how many comments in a list carry a given tag.
  Tag names are compared without regard to case.
  _tc:  The comment list to search.
  _tag: The NUL-terminated tag name to look for.
  Return: The number of matching comments.*/
int th_comment_query_count(th_comment *_tc,const char *_tag){
  long ci;
  int  tag_len;
  int  nmatches;
  tag_len=strlen(_tag);
  nmatches=0;
  ci=0;
  while(ci<_tc->comments){
    /*oc_tagcompare() returns 0 on a match.*/
    if(oc_tagcompare(_tc->user_comments[ci],_tag,tag_len)==0){
      nmatches++;
    }
    ci++;
  }
  return nmatches;
}
|
||||
|
||||
/*Releases everything owned by a comment list and resets it to all zeros.
  Each stored comment string, both arrays, and the vendor string are freed.
  _tc: The comment list to clear.
       NULL is accepted and ignored.*/
void th_comment_clear(th_comment *_tc){
  long ci;
  if(_tc==NULL)return;
  for(ci=0;ci<_tc->comments;ci++){
    _ogg_free(_tc->user_comments[ci]);
  }
  _ogg_free(_tc->user_comments);
  _ogg_free(_tc->comment_lengths);
  _ogg_free(_tc->vendor);
  /*Leave no dangling pointers or stale counts behind.*/
  memset(_tc,0,sizeof(*_tc));
}
|
||||
210
engine/thirdparty/libtheora/internal.c
vendored
Normal file
210
engine/thirdparty/libtheora/internal.c
vendored
Normal file
|
|
@ -0,0 +1,210 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include "internal.h"
|
||||
|
||||
|
||||
|
||||
/*A map from the index in the zig zag scan to the coefficient number in a
|
||||
block.
|
||||
All zig zag indices beyond 63 are sent to coefficient 64, so that zero runs
|
||||
past the end of a block in bogus streams get mapped to a known location.*/
|
||||
const unsigned char OC_FZIG_ZAG[128]={
|
||||
0, 1, 8,16, 9, 2, 3,10,
|
||||
17,24,32,25,18,11, 4, 5,
|
||||
12,19,26,33,40,48,41,34,
|
||||
27,20,13, 6, 7,14,21,28,
|
||||
35,42,49,56,57,50,43,36,
|
||||
29,22,15,23,30,37,44,51,
|
||||
58,59,52,45,38,31,39,46,
|
||||
53,60,61,54,47,55,62,63,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64
|
||||
};
|
||||
|
||||
/*A map from the coefficient number in a block to its index in the zig zag
|
||||
scan.*/
|
||||
const unsigned char OC_IZIG_ZAG[64]={
|
||||
0, 1, 5, 6,14,15,27,28,
|
||||
2, 4, 7,13,16,26,29,42,
|
||||
3, 8,12,17,25,30,41,43,
|
||||
9,11,18,24,31,40,44,53,
|
||||
10,19,23,32,39,45,52,54,
|
||||
20,22,33,38,46,51,55,60,
|
||||
21,34,37,47,50,56,59,61,
|
||||
35,36,48,49,57,58,62,63
|
||||
};
|
||||
|
||||
/*A map from physical macro block ordering to bitstream macro block
|
||||
ordering within a super block.*/
|
||||
const unsigned char OC_MB_MAP[2][2]={{0,3},{1,2}};
|
||||
|
||||
/*A list of the indices in the oc_mb_map array that can be valid for each of
|
||||
the various chroma decimation types.*/
|
||||
const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12]={
|
||||
{0,1,2,3,4,8},
|
||||
{0,1,2,3,4,5,8,9},
|
||||
{0,1,2,3,4,6,8,10},
|
||||
{0,1,2,3,4,5,6,7,8,9,10,11}
|
||||
};
|
||||
|
||||
/*The number of indices in the oc_mb_map array that can be valid for each of
|
||||
the various chroma decimation types.*/
|
||||
const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS]={6,8,8,12};
|
||||
|
||||
/*The number of extra bits that are coded with each of the DCT tokens.
|
||||
Each DCT token has some fixed number of additional bits (possibly 0) stored
|
||||
after the token itself, containing, for example, coefficient magnitude,
|
||||
sign bits, etc.*/
|
||||
const unsigned char OC_DCT_TOKEN_EXTRA_BITS[TH_NDCT_TOKENS]={
|
||||
0,0,0,2,3,4,12,3,6,
|
||||
0,0,0,0,
|
||||
1,1,1,1,2,3,4,5,6,10,
|
||||
1,1,1,1,1,3,4,
|
||||
2,3
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Counts the bits needed to represent a value.
  _v: The value to measure.
  Return: The 1-based position of the highest set bit of _v, or 0 if _v is 0.*/
int oc_ilog(unsigned _v){
  int nbits;
  nbits=0;
  /*Shift the value out one bit at a time until nothing is left.*/
  while(_v!=0){
    _v>>=1;
    nbits++;
  }
  return nbits;
}
|
||||
|
||||
|
||||
|
||||
/*Allocates a block of memory whose address is a multiple of _align.
  _align extra bytes are requested from _ogg_malloc().
  The returned pointer is advanced past 1 to _align of them, and the byte just
   before it records how far, so that oc_aligned_free() can recover the
   original pointer.
  _sz:    The number of usable bytes required.
  _align: The required alignment.
          This must be a power of two no larger than UCHAR_MAX+1.
  Return: The aligned block, or NULL if _align is invalid, _sz+_align would
           overflow, or the allocation fails.*/
void *oc_aligned_malloc(size_t _sz,size_t _align){
  unsigned char *p;
  /*The offset must fit in the single byte stored before the block, _align must
     be a power of two for the mask below to work, and _sz+_align must not
     wrap around.*/
  if(_align-1>UCHAR_MAX||(_align&_align-1)||_sz>~(size_t)0-_align)return NULL;
  p=(unsigned char *)_ogg_malloc(_sz+_align);
  if(p!=NULL){
    int offs;
    /*We need p+offs+1 to be a multiple of _align, i.e.,
       offs=(-addr-1) mod _align, which is ~addr&(_align-1).
      The previous form, (addr-1)&(_align-1), only satisfied this when addr
       was already a multiple of _align/2, and obtained addr by subtracting a
       null pointer, which is undefined.
      Only the low bits of the address matter, so converting the pointer to
       size_t is enough.*/
    offs=(int)(~(size_t)p&(_align-1));
    p[offs]=(unsigned char)offs;
    p+=offs+1;
  }
  return p;
}
|
||||
|
||||
/*Frees a block obtained from oc_aligned_malloc().
  The byte just before the block holds the distance back to the pointer that
   was originally returned by the allocator, minus one.
  _ptr: The block to free.
        NULL is accepted and ignored.*/
void oc_aligned_free(void *_ptr){
  unsigned char *tag;
  if(_ptr==NULL)return;
  /*Step back onto the offset byte, then back over the padding before it.*/
  tag=(unsigned char *)_ptr-1;
  _ogg_free(tag-*tag);
}
|
||||
|
||||
|
||||
/*Allocates a 2D array of _height rows by _width elements of _sz bytes each.
  A single block is allocated: _height row pointers come first, followed by
   the row data, and each pointer is set to the start of its row.
  The element storage is not initialized.
  _height: The number of rows.
  _width:  The number of elements in each row.
  _sz:     The size of one element, in bytes.
  Return: The table of row pointers, to be released with oc_free_2d(), or NULL
           if the total size does not fit in a size_t or the allocation
           fails.*/
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz){
  size_t  rowsz;
  size_t  colsz;
  size_t  datsz;
  char   *ret;
  /*Refuse dimensions whose byte counts would wrap around, which would
     otherwise yield a block too small for the row pointers set up below.*/
  if(_height>~(size_t)0/sizeof(void *))return NULL;
  colsz=_height*sizeof(void *);
  if(_width>0&&_sz>~(size_t)0/_width)return NULL;
  rowsz=_sz*_width;
  if(_height>0&&rowsz>~(size_t)0/_height)return NULL;
  datsz=rowsz*_height;
  if(datsz>~(size_t)0-colsz)return NULL;
  /*Alloc array and row pointers.*/
  ret=(char *)_ogg_malloc(datsz+colsz);
  /*Initialize the array.*/
  if(ret!=NULL){
    size_t   i;
    void   **p;
    char    *datptr;
    p=(void **)ret;
    i=_height;
    /*The row data starts right after the table of row pointers.*/
    for(datptr=ret+colsz;i-->0;p++,datptr+=rowsz)*p=(void *)datptr;
  }
  return (void **)ret;
}
|
||||
|
||||
/*Allocates a zero-filled 2D array of _height rows by _width elements of _sz
   bytes each.
  A single block is allocated: _height row pointers come first, followed by
   the row data, and each pointer is set to the start of its row.
  _height: The number of rows.
  _width:  The number of elements in each row.
  _sz:     The size of one element, in bytes.
  Return: The table of row pointers, to be released with oc_free_2d(), or NULL
           if the total size does not fit in a size_t or the allocation
           fails.*/
void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz){
  size_t  colsz;
  size_t  rowsz;
  size_t  datsz;
  char   *ret;
  /*Refuse dimensions whose byte counts would wrap around, which would
     otherwise yield a block too small for the row pointers set up below.*/
  if(_height>~(size_t)0/sizeof(void *))return NULL;
  colsz=_height*sizeof(void *);
  if(_width>0&&_sz>~(size_t)0/_width)return NULL;
  rowsz=_sz*_width;
  if(_height>0&&rowsz>~(size_t)0/_height)return NULL;
  datsz=rowsz*_height;
  if(datsz>~(size_t)0-colsz)return NULL;
  /*Alloc array and row pointers.*/
  ret=(char *)_ogg_calloc(datsz+colsz,1);
  /*Initialize the array.*/
  if(ret!=NULL){
    size_t   i;
    void   **p;
    char    *datptr;
    p=(void **)ret;
    i=_height;
    /*The row data starts right after the table of row pointers.*/
    for(datptr=ret+colsz;i-->0;p++,datptr+=rowsz)*p=(void *)datptr;
  }
  return (void **)ret;
}
|
||||
|
||||
/*Frees an array obtained from oc_malloc_2d() or oc_calloc_2d().
  Those functions place the row pointers and the row data in one allocation,
   so a single free of the row pointer table releases everything.
  _ptr: The row pointer table to free.*/
void oc_free_2d(void *_ptr){
|
||||
_ogg_free(_ptr);
|
||||
}
|
||||
|
||||
/*Fills in a Y'CbCr buffer with a pointer to the image data in the first
|
||||
buffer, but with the opposite vertical orientation.
|
||||
_dst: The destination buffer.
|
||||
This can be the same as _src.
|
||||
_src: The source buffer.*/
|
||||
void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
|
||||
const th_ycbcr_buffer _src){
|
||||
int pli;
|
||||
for(pli=0;pli<3;pli++){
|
||||
_dst[pli].width=_src[pli].width;
|
||||
_dst[pli].height=_src[pli].height;
|
||||
_dst[pli].stride=-_src[pli].stride;
|
||||
_dst[pli].data=_src[pli].data
|
||||
+(1-_dst[pli].height)*(ptrdiff_t)_dst[pli].stride;
|
||||
}
|
||||
}
|
||||
|
||||
/*Returns this library's vendor string.
  Return: The OC_VENDOR_STRING string literal.*/
const char *th_version_string(void){
|
||||
return OC_VENDOR_STRING;
|
||||
}
|
||||
|
||||
/*Returns the bitstream version packed into a single integer.
  Return: TH_VERSION_MAJOR shifted left 16 bits, plus TH_VERSION_MINOR shifted
           left 8 bits, plus TH_VERSION_SUB.*/
ogg_uint32_t th_version_number(void){
|
||||
return (TH_VERSION_MAJOR<<16)+(TH_VERSION_MINOR<<8)+TH_VERSION_SUB;
|
||||
}
|
||||
|
||||
/*Determines the packet type.
|
||||
Note that this correctly interprets a 0-byte packet as a video data packet.
|
||||
Return: 1 for a header packet, 0 for a data packet.*/
|
||||
int th_packet_isheader(ogg_packet *_op){
|
||||
return _op->bytes>0?_op->packet[0]>>7:0;
|
||||
}
|
||||
|
||||
/*Determines the frame type of a video data packet.
|
||||
Note that this correctly interprets a 0-byte packet as a delta frame.
|
||||
Return: 1 for a key frame, 0 for a delta frame, and -1 for a header
|
||||
packet.*/
|
||||
int th_packet_iskeyframe(ogg_packet *_op){
|
||||
return _op->bytes<=0?0:_op->packet[0]&0x80?-1:!(_op->packet[0]&0x40);
|
||||
}
|
||||
116
engine/thirdparty/libtheora/internal.h
vendored
Normal file
116
engine/thirdparty/libtheora/internal.h
vendored
Normal file
|
|
@ -0,0 +1,116 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_internal_H)
|
||||
# define _internal_H (1)
|
||||
# include <stdlib.h>
|
||||
# include <limits.h>
|
||||
# if defined(HAVE_CONFIG_H)
|
||||
# include "config.h"
|
||||
# endif
|
||||
# include "theora/codec.h"
|
||||
# include "theora/theora.h"
|
||||
# include "ocintrin.h"
|
||||
|
||||
# if !defined(__GNUC_PREREQ)
|
||||
# if defined(__GNUC__)&&defined(__GNUC_MINOR__)
|
||||
# define __GNUC_PREREQ(_maj,_min) \
|
||||
((__GNUC__<<16)+__GNUC_MINOR__>=((_maj)<<16)+(_min))
|
||||
# else
|
||||
# define __GNUC_PREREQ(_maj,_min) 0
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# if defined(_MSC_VER)
|
||||
/*Disable missing EMMS warnings.*/
|
||||
# pragma warning(disable:4799)
|
||||
/*Thank you Microsoft, I know the order of operations.*/
|
||||
# pragma warning(disable:4554)
|
||||
# endif
|
||||
/*You, too, gcc.*/
|
||||
# if __GNUC_PREREQ(4,2)
|
||||
# pragma GCC diagnostic ignored "-Wparentheses"
|
||||
# endif
|
||||
|
||||
/*Some assembly constructs require aligned operands.
|
||||
The following macros are _only_ intended for structure member declarations.
|
||||
Although they will sometimes work on stack variables, gcc will often silently
|
||||
ignore them.
|
||||
A separate set of macros could be made for manual stack alignment, but we
|
||||
don't actually require it anywhere.*/
|
||||
# if defined(OC_X86_ASM)||defined(OC_ARM_ASM)
|
||||
# if defined(__GNUC__)
|
||||
# define OC_ALIGN8(expr) expr __attribute__((aligned(8)))
|
||||
# define OC_ALIGN16(expr) expr __attribute__((aligned(16)))
|
||||
# elif defined(_MSC_VER)
|
||||
# define OC_ALIGN8(expr) __declspec (align(8)) expr
|
||||
# define OC_ALIGN16(expr) __declspec (align(16)) expr
|
||||
# else
|
||||
# error "Alignment macros required for this platform."
|
||||
# endif
|
||||
# endif
|
||||
# if !defined(OC_ALIGN8)
|
||||
# define OC_ALIGN8(expr) expr
|
||||
# endif
|
||||
# if !defined(OC_ALIGN16)
|
||||
# define OC_ALIGN16(expr) expr
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
/*This library's version.*/
|
||||
# define OC_VENDOR_STRING "Xiph.Org libtheora 1.2.0alpha 20100924 (Ptalarbvorm)"
|
||||
|
||||
/*Theora bitstream version.*/
|
||||
# define TH_VERSION_MAJOR (3)
|
||||
# define TH_VERSION_MINOR (2)
|
||||
# define TH_VERSION_SUB (1)
|
||||
# define TH_VERSION_CHECK(_info,_maj,_min,_sub) \
|
||||
((_info)->version_major>(_maj)||(_info)->version_major==(_maj)&& \
|
||||
((_info)->version_minor>(_min)||(_info)->version_minor==(_min)&& \
|
||||
(_info)->version_subminor>=(_sub)))
|
||||
|
||||
|
||||
|
||||
/*A map from the index in the zig zag scan to the coefficient number in a
|
||||
block.*/
|
||||
extern const unsigned char OC_FZIG_ZAG[128];
|
||||
/*A map from the coefficient number in a block to its index in the zig zag
|
||||
scan.*/
|
||||
extern const unsigned char OC_IZIG_ZAG[64];
|
||||
/*A map from physical macro block ordering to bitstream macro block
|
||||
ordering within a super block.*/
|
||||
extern const unsigned char OC_MB_MAP[2][2];
|
||||
/*A list of the indices in the oc_mb_map array that can be valid for each of
|
||||
the various chroma decimation types.*/
|
||||
extern const unsigned char OC_MB_MAP_IDXS[TH_PF_NFORMATS][12];
|
||||
/*The number of indices in the oc_mb_map array that can be valid for each of
|
||||
the various chroma decimation types.*/
|
||||
extern const unsigned char OC_MB_MAP_NIDXS[TH_PF_NFORMATS];
|
||||
|
||||
|
||||
|
||||
int oc_ilog(unsigned _v);
|
||||
void *oc_aligned_malloc(size_t _sz,size_t _align);
|
||||
void oc_aligned_free(void *_ptr);
|
||||
void **oc_malloc_2d(size_t _height,size_t _width,size_t _sz);
|
||||
void **oc_calloc_2d(size_t _height,size_t _width,size_t _sz);
|
||||
void oc_free_2d(void *_ptr);
|
||||
|
||||
void oc_ycbcr_buffer_flip(th_ycbcr_buffer _dst,
|
||||
const th_ycbcr_buffer _src);
|
||||
|
||||
#endif
|
||||
314
engine/thirdparty/libtheora/mathops.c
vendored
Normal file
314
engine/thirdparty/libtheora/mathops.c
vendored
Normal file
|
|
@ -0,0 +1,314 @@
|
|||
#include "internal.h"
|
||||
#include "mathops.h"
|
||||
|
||||
/*The fastest fallback strategy for platforms with fast multiplication appears
|
||||
to be based on de Bruijn sequences~\cite{LP98}.
|
||||
Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
|
||||
multiplication or table lookups are too expensive.
|
||||
|
||||
@UNPUBLISHED{LP98,
|
||||
author="Charles E. Leiserson and Harald Prokop",
|
||||
title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
|
||||
month=Jun,
|
||||
year=1998,
|
||||
note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
|
||||
}*/
|
||||
/*Lookup table for the 32-bit de Bruijn multiply below: it is indexed by the
   top five bits of (power of two)*0x77CB531 and is only needed when neither a
   count-leading-zeros primitive nor the multiplication-free fallback is in
   use.*/
#if !(defined(OC_ILOG_NODEBRUIJN)||defined(OC_CLZ32))
static const unsigned char OC_DEBRUIJN_IDX32[32]={
   0, 1,28, 2,29,14,24, 3,
  30,22,20,15,25,17, 4, 8,
  31,27,13,23,21,19,16, 7,
  26,12,18, 6,11, 5,10, 9
};
#endif
|
||||
|
||||
/*Integer binary logarithm of a 32-bit value.
  _v: The value to measure.
  Return: The number of significant bits in _v (floor(log2(_v))+1), or 0 if
           _v is zero.*/
int oc_ilog32(ogg_uint32_t _v){
#if defined(OC_CLZ32)
  /*Use the count-leading-zeros primitive; the mask forces a zero result for
     a zero input.*/
  return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v;
#elif defined(OC_ILOG_NODEBRUIJN)
  /*On a Pentium M, this branchless version tested as the fastest version
     without multiplications on 1,000,000,000 random 32-bit integers, edging
     out a similar version with branches, and a 256-entry LUT version.*/
  int nbits;
  int shift;
  nbits=_v>0;
  /*Binary search for the top set bit: test the upper half, shift it down if
     it is occupied, and record the shift amount in the result.*/
  shift=(_v>0xFFFFU)<<4;
  _v>>=shift;
  nbits|=shift;
  shift=(_v>0xFFU)<<3;
  _v>>=shift;
  nbits|=shift;
  shift=(_v>0xFU)<<2;
  _v>>=shift;
  nbits|=shift;
  shift=(_v>3)<<1;
  _v>>=shift;
  nbits|=shift;
  nbits+=_v>1;
  return nbits;
#else
  /*This de Bruijn sequence version is faster if you have a fast multiplier.*/
  int low_bit;
  /*Copy the highest set bit into every lower position.*/
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  /*low_bit is 1 for any non-zero input.*/
  low_bit=_v&1;
  /*Reduce the all-ones pattern to a single bit (0 for a zero input) and look
     up its position.*/
  _v=(_v>>1)+1;
  return low_bit+OC_DEBRUIJN_IDX32[((_v*0x77CB531U)>>27)&0x1F];
#endif
}
|
||||
|
||||
/*Integer binary logarithm of a 64-bit value.
  _v: The value to measure.
  Return: The number of significant bits in _v (floor(log2(_v))+1), or 0 if
           _v is zero.*/
int oc_ilog64(ogg_int64_t _v){
#if defined(OC_CLZ64)
  /*Use the count-leading-zeros primitive; the mask forces a zero result for
     a zero input.*/
  return OC_CLZ64_OFFS-OC_CLZ64(_v)&-!!_v;
#else
/*If we don't have a fast 64-bit word implementation, split it into two 32-bit
   halves.*/
# if defined(OC_ILOG_NODEBRUIJN)|| \
 defined(OC_CLZ32)||LONG_MAX<9223372036854775807LL
  ogg_uint32_t v;
  int          ret;
  int          m;
  /*m is 32 if anything is set above the low word, else 0; v is whichever
     32-bit half holds the top set bit.*/
  m=(_v>0xFFFFFFFFU)<<5;
  v=(ogg_uint32_t)(_v>>m);
#  if defined(OC_CLZ32)
  ret=m+OC_CLZ32_OFFS-OC_CLZ32(v)&-!!v;
#  elif defined(OC_ILOG_NODEBRUIJN)
  /*Branchless binary search, as in oc_ilog32().*/
  ret=v>0|m;
  m=(v>0xFFFFU)<<4;
  v>>=m;
  ret|=m;
  m=(v>0xFFU)<<3;
  v>>=m;
  ret|=m;
  m=(v>0xFU)<<2;
  v>>=m;
  ret|=m;
  m=(v>3)<<1;
  v>>=m;
  ret|=m;
  ret+=v>1;
#  else
  /*32-bit de Bruijn lookup on the selected half.*/
  v|=v>>1;
  v|=v>>2;
  v|=v>>4;
  v|=v>>8;
  v|=v>>16;
  ret=v&1|m;
  v=(v>>1)+1;
  ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
#  endif
  return ret;
/*Otherwise do it in one 64-bit multiply.*/
# else
  static const unsigned char OC_DEBRUIJN_IDX64[64]={
     0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
     5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
    63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
    62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
  };
  int ret;
  _v|=_v>>1;
  _v|=_v>>2;
  _v|=_v>>4;
  _v|=_v>>8;
  _v|=_v>>16;
  _v|=_v>>32;
  ret=(int)_v&1;
  _v=(_v>>1)+1;
  /*The product deliberately wraps around, so it must be formed in an unsigned
     type: a signed overflow here would be undefined behavior.
    This branch is only compiled when LONG_MAX is at least 2**63-1, so
     unsigned long holds at least 64 bits, and the 0x3F mask keeps exactly
     bits 58...63 of the product even if it is wider.*/
  ret+=OC_DEBRUIJN_IDX64[(unsigned long)_v*0x218A392CD3D5DBFUL>>58&0x3F];
  return ret;
# endif
#endif
}
|
||||
|
||||
/*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/
|
||||
static const ogg_int64_t OC_ATANH_LOG2[32]={
|
||||
0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
|
||||
0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
|
||||
0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
|
||||
0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
|
||||
0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
|
||||
0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
|
||||
0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
|
||||
0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
|
||||
0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
|
||||
0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
|
||||
0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
|
||||
};
|
||||
|
||||
/*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
|
||||
ogg_int64_t oc_bexp64(ogg_int64_t _z){
|
||||
ogg_int64_t w;
|
||||
ogg_int64_t z;
|
||||
int ipart;
|
||||
ipart=(int)(_z>>57);
|
||||
if(ipart<0)return 0;
|
||||
if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
|
||||
z=_z-OC_Q57(ipart);
|
||||
if(z){
|
||||
ogg_int64_t mask;
|
||||
long wlo;
|
||||
int i;
|
||||
/*C doesn't give us 64x64->128 muls, so we use CORDIC.
|
||||
This is not particularly fast, but it's not being used in time-critical
|
||||
code; it is very accurate.*/
|
||||
/*z is the fractional part of the log in Q62 format.
|
||||
We need 1 bit of headroom since the magnitude can get larger than 1
|
||||
during the iteration, and a sign bit.*/
|
||||
z<<=5;
|
||||
/*w is the exponential in Q61 format (since it also needs headroom and can
|
||||
get as large as 2.0); we could get another bit if we dropped the sign,
|
||||
but we'll recover that bit later anyway.
|
||||
Ideally this should start out as
|
||||
\lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
|
||||
but in order to guarantee convergence we have to repeat iterations 4,
|
||||
13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
|
||||
w=0x26A3D0E401DD846DLL;
|
||||
for(i=0;;i++){
|
||||
mask=-(z<0);
|
||||
w+=(w>>i+1)+mask^mask;
|
||||
z-=OC_ATANH_LOG2[i]+mask^mask;
|
||||
/*Repeat iteration 4.*/
|
||||
if(i>=3)break;
|
||||
z<<=1;
|
||||
}
|
||||
for(;;i++){
|
||||
mask=-(z<0);
|
||||
w+=(w>>i+1)+mask^mask;
|
||||
z-=OC_ATANH_LOG2[i]+mask^mask;
|
||||
/*Repeat iteration 13.*/
|
||||
if(i>=12)break;
|
||||
z<<=1;
|
||||
}
|
||||
for(;i<32;i++){
|
||||
mask=-(z<0);
|
||||
w+=(w>>i+1)+mask^mask;
|
||||
z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
|
||||
}
|
||||
wlo=0;
|
||||
/*Skip the remaining iterations unless we really require that much
|
||||
precision.
|
||||
We could have bailed out earlier for smaller iparts, but that would
|
||||
require initializing w from a table, as the limit doesn't converge to
|
||||
61-bit precision until n=30.*/
|
||||
if(ipart>30){
|
||||
/*For these iterations, we just update the low bits, as the high bits
|
||||
can't possibly be affected.
|
||||
OC_ATANH_LOG2 has also converged (it actually did so one iteration
|
||||
earlier, but that's no reason for an extra special case).*/
|
||||
for(;;i++){
|
||||
mask=-(z<0);
|
||||
wlo+=(w>>i)+mask^mask;
|
||||
z-=OC_ATANH_LOG2[31]+mask^mask;
|
||||
/*Repeat iteration 40.*/
|
||||
if(i>=39)break;
|
||||
z<<=1;
|
||||
}
|
||||
for(;i<61;i++){
|
||||
mask=-(z<0);
|
||||
wlo+=(w>>i)+mask^mask;
|
||||
z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
|
||||
}
|
||||
}
|
||||
w=(w<<1)+wlo;
|
||||
}
|
||||
else w=(ogg_int64_t)1<<62;
|
||||
if(ipart<62)w=(w>>61-ipart)+1>>1;
|
||||
return w;
|
||||
}
|
||||
|
||||
/*Computes the binary logarithm of _w, returned in Q57 format.*/
|
||||
ogg_int64_t oc_blog64(ogg_int64_t _w){
|
||||
ogg_int64_t z;
|
||||
int ipart;
|
||||
if(_w<=0)return -1;
|
||||
ipart=OC_ILOGNZ_64(_w)-1;
|
||||
if(ipart>61)_w>>=ipart-61;
|
||||
else _w<<=61-ipart;
|
||||
z=0;
|
||||
if(_w&_w-1){
|
||||
ogg_int64_t x;
|
||||
ogg_int64_t y;
|
||||
ogg_int64_t u;
|
||||
ogg_int64_t mask;
|
||||
int i;
|
||||
/*C doesn't give us 64x64->128 muls, so we use CORDIC.
|
||||
This is not particularly fast, but it's not being used in time-critical
|
||||
code; it is very accurate.*/
|
||||
/*z is the fractional part of the log in Q61 format.*/
|
||||
/*x and y are the cosh() and sinh(), respectively, in Q61 format.
|
||||
We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
|
||||
x=_w+((ogg_int64_t)1<<61);
|
||||
y=_w-((ogg_int64_t)1<<61);
|
||||
for(i=0;i<4;i++){
|
||||
mask=-(y<0);
|
||||
z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
|
||||
u=x>>i+1;
|
||||
x-=(y>>i+1)+mask^mask;
|
||||
y-=u+mask^mask;
|
||||
}
|
||||
/*Repeat iteration 4.*/
|
||||
for(i--;i<13;i++){
|
||||
mask=-(y<0);
|
||||
z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
|
||||
u=x>>i+1;
|
||||
x-=(y>>i+1)+mask^mask;
|
||||
y-=u+mask^mask;
|
||||
}
|
||||
/*Repeat iteration 13.*/
|
||||
for(i--;i<32;i++){
|
||||
mask=-(y<0);
|
||||
z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
|
||||
u=x>>i+1;
|
||||
x-=(y>>i+1)+mask^mask;
|
||||
y-=u+mask^mask;
|
||||
}
|
||||
/*OC_ATANH_LOG2 has converged.*/
|
||||
for(;i<40;i++){
|
||||
mask=-(y<0);
|
||||
z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
|
||||
u=x>>i+1;
|
||||
x-=(y>>i+1)+mask^mask;
|
||||
y-=u+mask^mask;
|
||||
}
|
||||
/*Repeat iteration 40.*/
|
||||
for(i--;i<62;i++){
|
||||
mask=-(y<0);
|
||||
z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
|
||||
u=x>>i+1;
|
||||
x-=(y>>i+1)+mask^mask;
|
||||
y-=u+mask^mask;
|
||||
}
|
||||
z=z+8>>4;
|
||||
}
|
||||
return OC_Q57(ipart)+z;
|
||||
}
|
||||
|
||||
/*Polynomial approximation of a binary exponential.
|
||||
Q10 input, Q0 output.*/
|
||||
ogg_uint32_t oc_bexp32_q10(int _z){
|
||||
unsigned n;
|
||||
int ipart;
|
||||
ipart=_z>>10;
|
||||
n=(_z&(1<<10)-1)<<4;
|
||||
n=(n*((n*((n*((n*3548>>15)+6817)>>15)+15823)>>15)+22708)>>15)+16384;
|
||||
return 14-ipart>0?n+(1<<13-ipart)>>14-ipart:n<<ipart-14;
|
||||
}
|
||||
|
||||
/*Polynomial approximation of a binary logarithm.
|
||||
Q0 input, Q10 output.*/
|
||||
int oc_blog32_q10(ogg_uint32_t _w){
|
||||
int n;
|
||||
int ipart;
|
||||
int fpart;
|
||||
if(_w<=0)return -1;
|
||||
ipart=OC_ILOGNZ_32(_w);
|
||||
n=(ipart-16>0?_w>>ipart-16:_w<<16-ipart)-32768-16384;
|
||||
fpart=(n*((n*((n*((n*-1402>>15)+2546)>>15)-5216)>>15)+15745)>>15)-6793;
|
||||
return (ipart<<10)+(fpart>>4);
|
||||
}
|
||||
143
engine/thirdparty/libtheora/mathops.h
vendored
Normal file
143
engine/thirdparty/libtheora/mathops.h
vendored
Normal file
|
|
@ -0,0 +1,143 @@
|
|||
#if !defined(_mathops_H)
# define _mathops_H (1)
# include <ogg/ogg.h>

/*Select count-leading-zeros builtins when the compiler is GCC 3.4 or later.
  NOTE(review): __GNUC_PREREQ is not defined anywhere in this header, so it is
   presumably supplied by a header that is included first (mathops.c includes
   internal.h before this file) -- confirm before including this on its own.*/
# if __GNUC_PREREQ(3,4)
#  include <limits.h>
/*Note the casts to (int) below: this prevents OC_CLZ{32|64}_OFFS from
   "upgrading" the type of an entire expression to an (unsigned) size_t.*/
#  if INT_MAX>=2147483647
#   define OC_CLZ32_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
#   define OC_CLZ32(_x) (__builtin_clz(_x))
#  elif LONG_MAX>=2147483647L
#   define OC_CLZ32_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
#   define OC_CLZ32(_x) (__builtin_clzl(_x))
#  endif
#  if INT_MAX>=9223372036854775807LL
#   define OC_CLZ64_OFFS ((int)sizeof(unsigned)*CHAR_BIT)
#   define OC_CLZ64(_x) (__builtin_clz(_x))
#  elif LONG_MAX>=9223372036854775807LL
#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long)*CHAR_BIT)
#   define OC_CLZ64(_x) (__builtin_clzl(_x))
#  elif LLONG_MAX>=9223372036854775807LL|| \
 __LONG_LONG_MAX__>=9223372036854775807LL
#   define OC_CLZ64_OFFS ((int)sizeof(unsigned long long)*CHAR_BIT)
#   define OC_CLZ64(_x) (__builtin_clzll(_x))
#  endif
# endif



/**
 * oc_ilog32 - Integer binary logarithm of a 32-bit value.
 * @_v: A 32-bit value.
 * Returns floor(log2(_v))+1, or 0 if _v==0.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 * The OC_ILOG_32() or OC_ILOGNZ_32() macros may be able to use a builtin
 *  function instead, which should be faster.
 */
int oc_ilog32(ogg_uint32_t _v);
/**
 * oc_ilog64 - Integer binary logarithm of a 64-bit value.
 * @_v: A 64-bit value.
 * Returns floor(log2(_v))+1, or 0 if _v==0.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 * The OC_ILOG_64() or OC_ILOGNZ_64() macros may be able to use a builtin
 *  function instead, which should be faster.
 */
int oc_ilog64(ogg_int64_t _v);


# if defined(OC_CLZ32)
/**
 * OC_ILOGNZ_32 - Integer binary logarithm of a non-zero 32-bit value.
 * @_v: A non-zero 32-bit value.
 * Returns floor(log2(_v))+1.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 * If _v is zero, the return value is undefined; use OC_ILOG_32() instead.
 */
#  define OC_ILOGNZ_32(_v) (OC_CLZ32_OFFS-OC_CLZ32(_v))
/**
 * OC_ILOG_32 - Integer binary logarithm of a 32-bit value.
 * @_v: A 32-bit value.
 * Returns floor(log2(_v))+1, or 0 if _v==0.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 */
#  define OC_ILOG_32(_v)   (OC_ILOGNZ_32(_v)&-!!(_v))
# else
#  define OC_ILOGNZ_32(_v) (oc_ilog32(_v))
#  define OC_ILOG_32(_v)   (oc_ilog32(_v))
# endif

/*This must test OC_CLZ64 and use OC_CLZ64_OFFS, the names defined above.
  The unprefixed names CLZ64 and CLZ64_OFFS are not defined in this header, so
   testing for them meant the builtin was never used for these macros.*/
# if defined(OC_CLZ64)
/**
 * OC_ILOGNZ_64 - Integer binary logarithm of a non-zero 64-bit value.
 * @_v: A non-zero 64-bit value.
 * Returns floor(log2(_v))+1.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 * If _v is zero, the return value is undefined; use OC_ILOG_64() instead.
 */
#  define OC_ILOGNZ_64(_v) (OC_CLZ64_OFFS-OC_CLZ64(_v))
/**
 * OC_ILOG_64 - Integer binary logarithm of a 64-bit value.
 * @_v: A 64-bit value.
 * Returns floor(log2(_v))+1, or 0 if _v==0.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 */
#  define OC_ILOG_64(_v)   (OC_ILOGNZ_64(_v)&-!!(_v))
# else
#  define OC_ILOGNZ_64(_v) (oc_ilog64(_v))
#  define OC_ILOG_64(_v)   (oc_ilog64(_v))
# endif

/*Helpers for the OC_STATIC_ILOG_* macros below: OC_STATIC_ILOGn handles a
   value of up to 2**n bits by testing its upper half and recursing into
   whichever half holds the top set bit.*/
# define OC_STATIC_ILOG0(_v) (!!(_v))
# define OC_STATIC_ILOG1(_v) (((_v)&0x2)?2:OC_STATIC_ILOG0(_v))
# define OC_STATIC_ILOG2(_v) \
 (((_v)&0xC)?2+OC_STATIC_ILOG1((_v)>>2):OC_STATIC_ILOG1(_v))
# define OC_STATIC_ILOG3(_v) \
 (((_v)&0xF0)?4+OC_STATIC_ILOG2((_v)>>4):OC_STATIC_ILOG2(_v))
# define OC_STATIC_ILOG4(_v) \
 (((_v)&0xFF00)?8+OC_STATIC_ILOG3((_v)>>8):OC_STATIC_ILOG3(_v))
# define OC_STATIC_ILOG5(_v) \
 (((_v)&0xFFFF0000)?16+OC_STATIC_ILOG4((_v)>>16):OC_STATIC_ILOG4(_v))
# define OC_STATIC_ILOG6(_v) \
 (((_v)&0xFFFFFFFF00000000ULL)?32+OC_STATIC_ILOG5((_v)>>32):OC_STATIC_ILOG5(_v))
/**
 * OC_STATIC_ILOG_32 - The integer logarithm of an (unsigned, 32-bit) constant.
 * @_v: A non-negative 32-bit constant.
 * Returns floor(log2(_v))+1, or 0 if _v==0.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 * This macro is suitable for evaluation at compile time, but it should not be
 *  used on values that can change at runtime, as it operates via exhaustive
 *  search.
 */
# define OC_STATIC_ILOG_32(_v) (OC_STATIC_ILOG5((ogg_uint32_t)(_v)))
/**
 * OC_STATIC_ILOG_64 - The integer logarithm of an (unsigned, 64-bit) constant.
 * @_v: A non-negative 64-bit constant.
 * Returns floor(log2(_v))+1, or 0 if _v==0.
 * This is the number of bits that would be required to represent _v in two's
 *  complement notation with all of the leading zeros stripped.
 * This macro is suitable for evaluation at compile time, but it should not be
 *  used on values that can change at runtime, as it operates via exhaustive
 *  search.
 */
# define OC_STATIC_ILOG_64(_v) (OC_STATIC_ILOG6((ogg_int64_t)(_v)))

/*Converts the integer _v to Q57 fixed point (widened to ogg_int64_t).*/
#define OC_Q57(_v) ((ogg_int64_t)(_v)<<57)
/*Converts the integer _v to Q10 fixed point.*/
#define OC_Q10(_v) ((_v)<<10)

/*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
ogg_int64_t oc_bexp64(ogg_int64_t _z);
/*Computes the binary logarithm of _w, returned in Q57 format.*/
ogg_int64_t oc_blog64(ogg_int64_t _w);

/*Polynomial approximation of a binary exponential: Q10 input, Q0 output.*/
ogg_uint32_t oc_bexp32_q10(int _z);
/*Polynomial approximation of a binary logarithm: Q0 input, Q10 output.*/
int oc_blog32_q10(ogg_uint32_t _w);

#endif
|
||||
792
engine/thirdparty/libtheora/mcenc.c
vendored
Normal file
792
engine/thirdparty/libtheora/mcenc.c
vendored
Normal file
|
|
@ -0,0 +1,792 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <string.h>
|
||||
#include "encint.h"
|
||||
|
||||
|
||||
|
||||
/*Temporary state used for motion estimation.*/
typedef struct oc_mcenc_ctx{
  /*The candidate motion vectors, stored as (x,y) pairs.*/
  int candidates[13][2];
  /*The index in candidates[] where the Set B candidates start.*/
  int setb0;
  /*The total number of candidates.*/
  int ncandidates;
}oc_mcenc_ctx;
|
||||
|
||||
|
||||
|
||||
/*Thresholds used to end the motion search early.*/
/*The median predictor is accepted outright when its Y plane SAD does not
   exceed this value.*/
#define OC_YSAD_THRESH1            (256)
/*The second Y plane SAD threshold is inflated by adding the error shifted
   right by this many bits...*/
#define OC_YSAD_THRESH2_SCALE_BITS (4)
/*...and then adding this constant offset.*/
#define OC_YSAD_THRESH2_OFFSET     (64)
|
||||
|
||||
/*The vector offsets in the X direction for each search site in the square
   pattern, listed row by row (site 4 is the center).*/
static const int OC_SQUARE_DX[9]={
  -1, 0, 1,
  -1, 0, 1,
  -1, 0, 1
};
/*The vector offsets in the Y direction for each search site in the square
   pattern, listed row by row (site 4 is the center).*/
static const int OC_SQUARE_DY[9]={
  -1,-1,-1,
   0, 0, 0,
   1, 1, 1
};
/*The number of sites to search for each boundary condition in the square
   pattern.
  Bit flags for the boundary conditions are as follows:
  1: -16==dx
  2:      dx==15(.5)
  4: -16==dy
  8:      dy==15(.5)
  Combinations with both flags of one axis set (3 and 7) have no sites.*/
static const int OC_SQUARE_NSITES[11]={
  8,5,5,0,
  5,3,3,0,
  5,3,3
};
/*The list of sites to search for each boundary condition in the square
   pattern; only the first OC_SQUARE_NSITES[b] entries of row b are used.*/
static const int OC_SQUARE_SITES[11][8]={
  /* 0: -15.5<dx<15(.5),   -15.5<dy<15(.5)*/
  {0,1,2,3,5,6,7,8},
  /* 1: -15.5==dx,         -15.5<dy<15(.5)*/
  {1,2,5,7,8},
  /* 2:        dx==15(.5), -15.5<dy<15(.5)*/
  {0,1,3,6,7},
  /* 3: -15.5==dx==15(.5), -15.5<dy<15(.5)*/
  {-1},
  /* 4: -15.5<dx<15(.5),   -15.5==dy*/
  {3,5,6,7,8},
  /* 5: -15.5==dx,         -15.5==dy*/
  {5,7,8},
  /* 6:        dx==15(.5), -15.5==dy*/
  {3,6,7},
  /* 7: -15.5==dx==15(.5), -15.5==dy*/
  {-1},
  /* 8: -15.5<dx<15(.5),          dy==15(.5)*/
  {0,1,2,3,5},
  /* 9: -15.5==dx,                dy==15(.5)*/
  {1,2,5},
  /*10:        dx==15(.5),        dy==15(.5)*/
  {0,1,3}
};
|
||||
|
||||
|
||||
static void oc_mcenc_find_candidates_a(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
||||
oc_mv _accum,int _mbi,int _frame){
|
||||
oc_mb_enc_info *embs;
|
||||
int accum_x;
|
||||
int accum_y;
|
||||
int a[3][2];
|
||||
int ncandidates;
|
||||
unsigned nmbi;
|
||||
int i;
|
||||
embs=_enc->mb_info;
|
||||
/*Skip a position to store the median predictor in.*/
|
||||
ncandidates=1;
|
||||
if(embs[_mbi].ncneighbors>0){
|
||||
/*Fill in the first part of set A: the vectors from adjacent blocks.*/
|
||||
for(i=0;i<embs[_mbi].ncneighbors;i++){
|
||||
nmbi=embs[_mbi].cneighbors[i];
|
||||
_mcenc->candidates[ncandidates][0]=
|
||||
OC_MV_X(embs[nmbi].analysis_mv[0][_frame]);
|
||||
_mcenc->candidates[ncandidates][1]=
|
||||
OC_MV_Y(embs[nmbi].analysis_mv[0][_frame]);
|
||||
ncandidates++;
|
||||
}
|
||||
}
|
||||
accum_x=OC_MV_X(_accum);
|
||||
accum_y=OC_MV_Y(_accum);
|
||||
/*Add a few additional vectors to set A: the vectors used in the previous
|
||||
frames and the (0,0) vector.*/
|
||||
_mcenc->candidates[ncandidates][0]=accum_x;
|
||||
_mcenc->candidates[ncandidates][1]=accum_y;
|
||||
ncandidates++;
|
||||
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
|
||||
OC_MV_X(embs[_mbi].analysis_mv[1][_frame])+accum_x,31);
|
||||
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
|
||||
OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])+accum_y,31);
|
||||
ncandidates++;
|
||||
_mcenc->candidates[ncandidates][0]=0;
|
||||
_mcenc->candidates[ncandidates][1]=0;
|
||||
ncandidates++;
|
||||
/*Use the first three vectors of set A to find our best predictor: their
|
||||
median.*/
|
||||
memcpy(a,_mcenc->candidates+1,sizeof(a));
|
||||
OC_SORT2I(a[0][0],a[1][0]);
|
||||
OC_SORT2I(a[0][1],a[1][1]);
|
||||
OC_SORT2I(a[1][0],a[2][0]);
|
||||
OC_SORT2I(a[1][1],a[2][1]);
|
||||
OC_SORT2I(a[0][0],a[1][0]);
|
||||
OC_SORT2I(a[0][1],a[1][1]);
|
||||
_mcenc->candidates[0][0]=a[1][0];
|
||||
_mcenc->candidates[0][1]=a[1][1];
|
||||
_mcenc->setb0=ncandidates;
|
||||
}
|
||||
|
||||
static void oc_mcenc_find_candidates_b(oc_enc_ctx *_enc,oc_mcenc_ctx *_mcenc,
|
||||
oc_mv _accum,int _mbi,int _frame){
|
||||
oc_mb_enc_info *embs;
|
||||
int accum_x;
|
||||
int accum_y;
|
||||
int ncandidates;
|
||||
embs=_enc->mb_info;
|
||||
accum_x=OC_MV_X(_accum);
|
||||
accum_y=OC_MV_Y(_accum);
|
||||
/*Fill in set B: accelerated predictors for this and adjacent macro blocks.*/
|
||||
ncandidates=_mcenc->setb0;
|
||||
/*Use only the current block. Using more did not appear to be helpful
|
||||
with the current selection logic due to escaping the local search too
|
||||
quickly.*/
|
||||
_mcenc->candidates[ncandidates][0]=OC_CLAMPI(-31,
|
||||
2*OC_MV_X(embs[_mbi].analysis_mv[1][_frame])
|
||||
-OC_MV_X(embs[_mbi].analysis_mv[2][_frame])+accum_x,31);
|
||||
_mcenc->candidates[ncandidates][1]=OC_CLAMPI(-31,
|
||||
2*OC_MV_Y(embs[_mbi].analysis_mv[1][_frame])
|
||||
-OC_MV_Y(embs[_mbi].analysis_mv[2][_frame])+accum_y,31);
|
||||
ncandidates++;
|
||||
_mcenc->ncandidates=ncandidates;
|
||||
}
|
||||
|
||||
/*Sums oc_enc_frag_sad2_thresh() over the four blocks listed in _fragis,
   comparing each source block against the pair of reference blocks at
   offsets _mvoffset0 and _mvoffset1.
  Each block is given the budget left over from _best_err as its threshold.
  Return: The accumulated error.*/
static unsigned oc_sad16_halfpel(const oc_enc_ctx *_enc,
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
 int _mvoffset0,int _mvoffset1,const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _best_err){
  unsigned total;
  int      blk;
  total=0;
  for(blk=0;blk<4;blk++){
    const unsigned char *ref_blk;
    ptrdiff_t            offs;
    offs=_frag_buf_offs[_fragis[blk]];
    ref_blk=_ref+offs;
    total+=oc_enc_frag_sad2_thresh(_enc,_src+offs,ref_blk+_mvoffset0,
     ref_blk+_mvoffset1,_ystride,_best_err-total);
  }
  return total;
}
|
||||
|
||||
/*Sums oc_enc_frag_satd2() plus the magnitude of the DC value it reports over
   the four blocks listed in _fragis, comparing each source block against the
   pair of reference blocks at offsets _mvoffset0 and _mvoffset1.
  _best_err is accepted for symmetry with oc_sad16_halfpel() but is not used.
  Return: The accumulated error.*/
static unsigned oc_satd16_halfpel(const oc_enc_ctx *_enc,
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],
 int _mvoffset0,int _mvoffset1,const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _best_err){
  unsigned total;
  int      dc;
  int      blk;
  total=0;
  for(blk=0;blk<4;blk++){
    const unsigned char *ref_blk;
    ptrdiff_t            offs;
    offs=_frag_buf_offs[_fragis[blk]];
    ref_blk=_ref+offs;
    total+=oc_enc_frag_satd2(_enc,&dc,_src+offs,
     ref_blk+_mvoffset0,ref_blk+_mvoffset1,_ystride);
    /*dc was written by the call above; add its magnitude separately.*/
    total+=abs(dc);
  }
  return total;
}
|
||||
|
||||
/*Evaluates a whole-pixel candidate vector (_dx,_dy) for a macro block.
  The SAD of each of the four blocks listed in _fragis is stored in
   _block_err[], and their sum is returned.*/
static unsigned oc_mcenc_ysad_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
 const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
 const unsigned char *_src,const unsigned char *_ref,int _ystride,
 unsigned _block_err[4]){
  unsigned total;
  int      mvoffset;
  int      blk;
  /*The displacement of the candidate within the reference plane.*/
  mvoffset=_dx+_dy*_ystride;
  total=0;
  for(blk=0;blk<4;blk++){
    ptrdiff_t offs;
    unsigned  sad;
    offs=_frag_buf_offs[_fragis[blk]];
    sad=oc_enc_frag_sad(_enc,_src+offs,_ref+offs+mvoffset,_ystride);
    _block_err[blk]=sad;
    total+=sad;
  }
  return total;
}
|
||||
|
||||
static int oc_mcenc_ysatd_check_mbcandidate_fullpel(const oc_enc_ctx *_enc,
|
||||
const ptrdiff_t *_frag_buf_offs,const ptrdiff_t _fragis[4],int _dx,int _dy,
|
||||
const unsigned char *_src,const unsigned char *_ref,int _ystride){
|
||||
int mvoffset;
|
||||
int err;
|
||||
int bi;
|
||||
mvoffset=_dx+_dy*_ystride;
|
||||
err=0;
|
||||
for(bi=0;bi<4;bi++){
|
||||
ptrdiff_t frag_offs;
|
||||
int dc;
|
||||
frag_offs=_frag_buf_offs[_fragis[bi]];
|
||||
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
|
||||
err+=oc_enc_frag_satd(_enc,&dc,
|
||||
_src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
|
||||
err+=abs(dc);
|
||||
}
|
||||
else{
|
||||
err+=oc_enc_frag_sad(_enc,
|
||||
_src+frag_offs,_ref+frag_offs+mvoffset,_ystride);
|
||||
}
|
||||
}
|
||||
return err;
|
||||
}
|
||||
|
||||
/*Evaluates a whole-pixel candidate vector (_dx,_dy) for the single block at
   offset _frag_offs.
  Return: The value of oc_enc_frag_satd() plus the magnitude of the DC value
           it reports.*/
static unsigned oc_mcenc_ysatd_check_bcandidate_fullpel(const oc_enc_ctx *_enc,
 ptrdiff_t _frag_offs,int _dx,int _dy,
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  const unsigned char *src_blk;
  const unsigned char *ref_blk;
  unsigned             satd;
  int                  dc;
  src_blk=_src+_frag_offs;
  ref_blk=_ref+_frag_offs+_dx+_dy*_ystride;
  satd=oc_enc_frag_satd(_enc,&dc,src_blk,ref_blk,_ystride);
  return satd+abs(dc);
}
|
||||
|
||||
/*Perform a motion vector search for this macro block against a single
|
||||
reference frame.
|
||||
As a bonus, individual block motion vectors are computed as well, as much of
|
||||
the work can be shared.
|
||||
The actual motion vector is stored in the appropriate place in the
|
||||
oc_mb_enc_info structure.
|
||||
_accum: Drop frame/golden MV accumulators.
|
||||
_mbi: The macro block index.
|
||||
_frame: The frame to use for SATD calculations and refinement,
|
||||
either OC_FRAME_PREV or OC_FRAME_GOLD.
|
||||
_frame_full: The frame to perform the 1px search on, one of OC_FRAME_PREV,
|
||||
OC_FRAME_GOLD, OC_FRAME_PREV_ORIG, or OC_FRAME_GOLD_ORIG.*/
|
||||
void oc_mcenc_search_frame(oc_enc_ctx *_enc,oc_mv _accum,int _mbi,int _frame,
|
||||
int _frame_full){
|
||||
/*Note: Traditionally this search is done using a rate-distortion objective
|
||||
function of the form D+lambda*R.
|
||||
However, xiphmont tested this and found it produced a small degradation,
|
||||
while requiring extra computation.
|
||||
This is most likely due to Theora's peculiar MV encoding scheme: MVs are
|
||||
not coded relative to a predictor, and the only truly cheap way to use a
|
||||
MV is in the LAST or LAST2 MB modes, which are not being considered here.
|
||||
Therefore if we use the MV found here, it's only because both LAST and
|
||||
LAST2 performed poorly, and therefore the MB is not likely to be uniform
|
||||
or suffer from the aperture problem.
|
||||
Furthermore we would like to re-use the MV found here for as many MBs as
|
||||
possible, so picking a slightly sub-optimal vector to save a bit or two
|
||||
may cause increased degradation in many blocks to come.
|
||||
We could artificially reduce lambda to compensate, but it's faster to just
|
||||
disable it entirely, and use D (the distortion) as the sole criterion.*/
|
||||
oc_mcenc_ctx mcenc;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
const ptrdiff_t *fragis;
|
||||
const unsigned char *src;
|
||||
const unsigned char *ref;
|
||||
const unsigned char *satd_ref;
|
||||
int ystride;
|
||||
oc_mb_enc_info *embs;
|
||||
ogg_int32_t hit_cache[31];
|
||||
ogg_int32_t hitbit;
|
||||
unsigned best_block_err[4];
|
||||
unsigned block_err[4];
|
||||
unsigned best_err;
|
||||
int best_vec[2];
|
||||
int best_block_vec[4][2];
|
||||
int candx;
|
||||
int candy;
|
||||
int bi;
|
||||
embs=_enc->mb_info;
|
||||
/*Find some candidate motion vectors.*/
|
||||
oc_mcenc_find_candidates_a(_enc,&mcenc,_accum,_mbi,_frame);
|
||||
/*Clear the cache of locations we've examined.*/
|
||||
memset(hit_cache,0,sizeof(hit_cache));
|
||||
/*Start with the median predictor.*/
|
||||
candx=OC_DIV2(mcenc.candidates[0][0]);
|
||||
candy=OC_DIV2(mcenc.candidates[0][1]);
|
||||
hit_cache[candy+15]|=(ogg_int32_t)1<<candx+15;
|
||||
frag_buf_offs=_enc->state.frag_buf_offs;
|
||||
fragis=_enc->state.mb_maps[_mbi][0];
|
||||
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||
ref=_enc->state.ref_frame_data[_frame_full];
|
||||
satd_ref=_enc->state.ref_frame_data[_frame];
|
||||
ystride=_enc->state.ref_ystride[0];
|
||||
/*TODO: customize error function for speed/(quality+size) tradeoff.*/
|
||||
best_err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
||||
frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
|
||||
best_vec[0]=candx;
|
||||
best_vec[1]=candy;
|
||||
if(_frame==OC_FRAME_PREV){
|
||||
for(bi=0;bi<4;bi++){
|
||||
best_block_err[bi]=block_err[bi];
|
||||
best_block_vec[bi][0]=candx;
|
||||
best_block_vec[bi][1]=candy;
|
||||
}
|
||||
}
|
||||
/*If this predictor fails, move on to set A.*/
|
||||
if(best_err>OC_YSAD_THRESH1){
|
||||
unsigned err;
|
||||
unsigned t2;
|
||||
int ncs;
|
||||
int ci;
|
||||
/*Compute the early termination threshold for set A.*/
|
||||
t2=embs[_mbi].error[_frame];
|
||||
ncs=OC_MINI(3,embs[_mbi].ncneighbors);
|
||||
for(ci=0;ci<ncs;ci++){
|
||||
t2=OC_MAXI(t2,embs[embs[_mbi].cneighbors[ci]].error[_frame]);
|
||||
}
|
||||
t2+=(t2>>OC_YSAD_THRESH2_SCALE_BITS)+OC_YSAD_THRESH2_OFFSET;
|
||||
/*Examine the candidates in set A.*/
|
||||
for(ci=1;ci<mcenc.setb0;ci++){
|
||||
candx=OC_DIV2(mcenc.candidates[ci][0]);
|
||||
candy=OC_DIV2(mcenc.candidates[ci][1]);
|
||||
/*If we've already examined this vector, then we would be using it if it
|
||||
was better than what we are using.*/
|
||||
hitbit=(ogg_int32_t)1<<candx+15;
|
||||
if(hit_cache[candy+15]&hitbit)continue;
|
||||
hit_cache[candy+15]|=hitbit;
|
||||
err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
||||
frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
|
||||
if(err<best_err){
|
||||
best_err=err;
|
||||
best_vec[0]=candx;
|
||||
best_vec[1]=candy;
|
||||
}
|
||||
if(_frame==OC_FRAME_PREV){
|
||||
for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
|
||||
best_block_err[bi]=block_err[bi];
|
||||
best_block_vec[bi][0]=candx;
|
||||
best_block_vec[bi][1]=candy;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(best_err>t2){
|
||||
oc_mcenc_find_candidates_b(_enc,&mcenc,_accum,_mbi,_frame);
|
||||
/*Examine the candidates in set B.*/
|
||||
for(;ci<mcenc.ncandidates;ci++){
|
||||
candx=OC_DIV2(mcenc.candidates[ci][0]);
|
||||
candy=OC_DIV2(mcenc.candidates[ci][1]);
|
||||
hitbit=(ogg_int32_t)1<<candx+15;
|
||||
if(hit_cache[candy+15]&hitbit)continue;
|
||||
hit_cache[candy+15]|=hitbit;
|
||||
err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
||||
frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
|
||||
if(err<best_err){
|
||||
best_err=err;
|
||||
best_vec[0]=candx;
|
||||
best_vec[1]=candy;
|
||||
}
|
||||
if(_frame==OC_FRAME_PREV){
|
||||
for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
|
||||
best_block_err[bi]=block_err[bi];
|
||||
best_block_vec[bi][0]=candx;
|
||||
best_block_vec[bi][1]=candy;
|
||||
}
|
||||
}
|
||||
}
|
||||
/*Use the same threshold for set B as in set A.*/
|
||||
if(best_err>t2){
|
||||
int best_site;
|
||||
int nsites;
|
||||
int sitei;
|
||||
int site;
|
||||
int b;
|
||||
/*Square pattern search.*/
|
||||
for(;;){
|
||||
best_site=4;
|
||||
/*Compose the bit flags for boundary conditions.*/
|
||||
b=OC_DIV16(-best_vec[0]+1)|OC_DIV16(best_vec[0]+1)<<1|
|
||||
OC_DIV16(-best_vec[1]+1)<<2|OC_DIV16(best_vec[1]+1)<<3;
|
||||
nsites=OC_SQUARE_NSITES[b];
|
||||
for(sitei=0;sitei<nsites;sitei++){
|
||||
site=OC_SQUARE_SITES[b][sitei];
|
||||
candx=best_vec[0]+OC_SQUARE_DX[site];
|
||||
candy=best_vec[1]+OC_SQUARE_DY[site];
|
||||
hitbit=(ogg_int32_t)1<<candx+15;
|
||||
if(hit_cache[candy+15]&hitbit)continue;
|
||||
hit_cache[candy+15]|=hitbit;
|
||||
err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
||||
frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
|
||||
if(err<best_err){
|
||||
best_err=err;
|
||||
best_site=site;
|
||||
}
|
||||
if(_frame==OC_FRAME_PREV){
|
||||
for(bi=0;bi<4;bi++)if(block_err[bi]<best_block_err[bi]){
|
||||
best_block_err[bi]=block_err[bi];
|
||||
best_block_vec[bi][0]=candx;
|
||||
best_block_vec[bi][1]=candy;
|
||||
}
|
||||
}
|
||||
}
|
||||
if(best_site==4)break;
|
||||
best_vec[0]+=OC_SQUARE_DX[best_site];
|
||||
best_vec[1]+=OC_SQUARE_DY[best_site];
|
||||
}
|
||||
/*Final 4-MV search.*/
|
||||
/*Simply use 1/4 of the macro block set A and B threshold as the
|
||||
individual block threshold.*/
|
||||
if(_frame==OC_FRAME_PREV){
|
||||
t2>>=2;
|
||||
for(bi=0;bi<4;bi++){
|
||||
if(best_block_err[bi]>t2){
|
||||
/*Square pattern search.
|
||||
We do this in a slightly interesting manner.
|
||||
We continue to check the SAD of all four blocks in the
|
||||
macro block.
|
||||
This gives us two things:
|
||||
1) We can continue to use the hit_cache to avoid duplicate
|
||||
checks.
|
||||
Otherwise we could continue to read it, but not write to it
|
||||
without saving and restoring it for each block.
|
||||
Note that we could still eliminate a large number of
|
||||
duplicate checks by taking into account the site we came
|
||||
from when choosing the site list.
|
||||
We can still do that to avoid extra hit_cache queries, and
|
||||
it might even be a speed win.
|
||||
2) It gives us a slightly better chance of escaping local
|
||||
minima.
|
||||
We would not be here if we weren't doing a fairly bad job
|
||||
in finding a good vector, and checking these vectors can
|
||||
save us from 100 to several thousand points off our SAD 1
|
||||
in 15 times.
|
||||
TODO: Is this a good idea?
|
||||
Who knows.
|
||||
It needs more testing.*/
|
||||
for(;;){
|
||||
int bestx;
|
||||
int besty;
|
||||
int bj;
|
||||
bestx=best_block_vec[bi][0];
|
||||
besty=best_block_vec[bi][1];
|
||||
/*Compose the bit flags for boundary conditions.*/
|
||||
b=OC_DIV16(-bestx+1)|OC_DIV16(bestx+1)<<1|
|
||||
OC_DIV16(-besty+1)<<2|OC_DIV16(besty+1)<<3;
|
||||
nsites=OC_SQUARE_NSITES[b];
|
||||
for(sitei=0;sitei<nsites;sitei++){
|
||||
site=OC_SQUARE_SITES[b][sitei];
|
||||
candx=bestx+OC_SQUARE_DX[site];
|
||||
candy=besty+OC_SQUARE_DY[site];
|
||||
hitbit=(ogg_int32_t)1<<candx+15;
|
||||
if(hit_cache[candy+15]&hitbit)continue;
|
||||
hit_cache[candy+15]|=hitbit;
|
||||
err=oc_mcenc_ysad_check_mbcandidate_fullpel(_enc,
|
||||
frag_buf_offs,fragis,candx,candy,src,ref,ystride,block_err);
|
||||
if(err<best_err){
|
||||
best_err=err;
|
||||
best_vec[0]=candx;
|
||||
best_vec[1]=candy;
|
||||
}
|
||||
for(bj=0;bj<4;bj++)if(block_err[bj]<best_block_err[bj]){
|
||||
best_block_err[bj]=block_err[bj];
|
||||
best_block_vec[bj][0]=candx;
|
||||
best_block_vec[bj][1]=candy;
|
||||
}
|
||||
}
|
||||
if(best_block_vec[bi][0]==bestx&&best_block_vec[bi][1]==besty){
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
embs[_mbi].error[_frame]=(ogg_uint16_t)best_err;
|
||||
candx=best_vec[0];
|
||||
candy=best_vec[1];
|
||||
embs[_mbi].satd[_frame]=oc_mcenc_ysatd_check_mbcandidate_fullpel(_enc,
|
||||
frag_buf_offs,fragis,candx,candy,src,satd_ref,ystride);
|
||||
embs[_mbi].analysis_mv[0][_frame]=OC_MV(candx<<1,candy<<1);
|
||||
if(_frame==OC_FRAME_PREV&&_enc->sp_level<OC_SP_LEVEL_FAST_ANALYSIS){
|
||||
for(bi=0;bi<4;bi++){
|
||||
candx=best_block_vec[bi][0];
|
||||
candy=best_block_vec[bi][1];
|
||||
embs[_mbi].block_satd[bi]=oc_mcenc_ysatd_check_bcandidate_fullpel(_enc,
|
||||
frag_buf_offs[fragis[bi]],candx,candy,src,satd_ref,ystride);
|
||||
embs[_mbi].block_mv[bi]=OC_MV(candx<<1,candy<<1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*Runs the full-pel motion search for one macro block against both the
   previous frame and the golden frame, ageing the three-deep history of
   analysis motion vectors before and around the two searches.
  _enc: The encoder context.
  _mbi: The index of the macro block to search.*/
void oc_mcenc_search(oc_enc_ctx *_enc,int _mbi){
  oc_mv2 *hist;
  oc_mv   prev_accum;
  oc_mv   gold_accum;
  oc_mv   oldest_prev;
  hist=_enc->mb_info[_mbi].analysis_mv;
  /*The PREV accumulator starts from the newest PREV vector only when the
     previous frame was dropped; otherwise it starts from zero.*/
  prev_accum=_enc->prevframe_dropped?hist[0][OC_FRAME_PREV]:0;
  /*The GOLDEN accumulator is the oldest GOLDEN vector, captured before the
     history is shifted below.*/
  gold_accum=hist[2][OC_FRAME_GOLD];
  /*Move the PREV predictors back a frame.
    The middle slot becomes the newest vector minus the oldest one, which is
     about to be overwritten.*/
  oldest_prev=hist[2][OC_FRAME_PREV];
  hist[2][OC_FRAME_PREV]=hist[1][OC_FRAME_PREV];
  hist[1][OC_FRAME_PREV]=OC_MV_SUB(hist[0][OC_FRAME_PREV],oldest_prev);
  /*Move the GOLDEN predictors back a frame.*/
  hist[2][OC_FRAME_GOLD]=hist[1][OC_FRAME_GOLD];
  hist[1][OC_FRAME_GOLD]=hist[0][OC_FRAME_GOLD];
  /*Search the last frame.*/
  oc_mcenc_search_frame(_enc,prev_accum,_mbi,
   OC_FRAME_PREV,OC_FRAME_PREV_ORIG);
  hist[2][OC_FRAME_PREV]=prev_accum;
  /*GOLDEN MVs are different from PREV MVs in that they're each absolute
     offsets from some frame in the past rather than relative offsets from the
     frame before.
    For predictor calculation to make sense, we need them to be in the same
     form as PREV MVs, so convert them to relative form for the search.*/
  hist[1][OC_FRAME_GOLD]=
   OC_MV_SUB(hist[1][OC_FRAME_GOLD],hist[2][OC_FRAME_GOLD]);
  hist[2][OC_FRAME_GOLD]=OC_MV_SUB(hist[2][OC_FRAME_GOLD],gold_accum);
  /*Search the golden frame.*/
  oc_mcenc_search_frame(_enc,gold_accum,_mbi,
   OC_FRAME_GOLD,OC_FRAME_GOLD_ORIG);
  /*Put GOLDEN MVs back into absolute offset form, undoing the two
     subtractions above in reverse order.
    The newest MV is already an absolute offset.*/
  hist[2][OC_FRAME_GOLD]=OC_MV_ADD(hist[2][OC_FRAME_GOLD],gold_accum);
  hist[1][OC_FRAME_GOLD]=
   OC_MV_ADD(hist[1][OC_FRAME_GOLD],hist[2][OC_FRAME_GOLD]);
}
|
||||
|
||||
#if 0
/*Disabled SAD-only variant of the half-pel macro block refinement.
  Tests the eight sites listed in OC_SQUARE_SITES[0] around a full-pel vector
   using oc_sad16_halfpel() and keeps the best one.
  _enc:      The encoder context.
  _mbi:      The macro block index.
  _vec:      On input, the full-pel vector; on output, the chosen vector in
              half-pel units.
  _best_err: The error of the full-pel vector, i.e. the score to beat.
  _frame:    The index of the reference frame to search.
  Return: The lowest error found (_best_err if no site improved on it).*/
static int oc_mcenc_ysad_halfpel_mbrefine(const oc_enc_ctx *_enc,int _mbi,
 int _vec[2],int _best_err,int _frame){
  const unsigned char *src;
  const unsigned char *ref;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  int                  offset_y[9];
  int                  ystride;
  int                  mvoffset_base;
  int                  best_site;
  int                  sitei;
  int                  err;
  src=_enc->state.ref_frame_data[OC_FRAME_IO];
  /*This used to read the undeclared name _framei; the parameter is _frame.*/
  ref=_enc->state.ref_frame_data[_frame];
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  ystride=_enc->state.ref_ystride[0];
  mvoffset_base=_vec[0]+_vec[1]*ystride;
  /*Vertical step for each site: up one row, none, or down one row.
    Entry 4 is not set; the loop only reads the entries named by
     OC_SQUARE_SITES[0].*/
  offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
  offset_y[3]=offset_y[5]=0;
  offset_y[6]=offset_y[7]=offset_y[8]=ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    int site;
    int xmask;
    int ymask;
    int dx;
    int dy;
    int mvoffset0;
    int mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.*/
    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
    err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
     mvoffset0,mvoffset1,src,ref,ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  /*Convert to half-pel units and apply the winning site's offset (site 4
     means no site beat the full-pel vector).*/
  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
|
||||
|
||||
static unsigned oc_mcenc_ysatd_halfpel_mbrefine(const oc_enc_ctx *_enc,
|
||||
int _mbi,int _vec[2],unsigned _best_err,int _frame){
|
||||
const unsigned char *src;
|
||||
const unsigned char *ref;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
const ptrdiff_t *fragis;
|
||||
int offset_y[9];
|
||||
int ystride;
|
||||
int mvoffset_base;
|
||||
int best_site;
|
||||
int sitei;
|
||||
int err;
|
||||
src=_enc->state.ref_frame_data[OC_FRAME_IO];
|
||||
ref=_enc->state.ref_frame_data[_frame];
|
||||
frag_buf_offs=_enc->state.frag_buf_offs;
|
||||
fragis=_enc->state.mb_maps[_mbi][0];
|
||||
ystride=_enc->state.ref_ystride[0];
|
||||
mvoffset_base=_vec[0]+_vec[1]*ystride;
|
||||
offset_y[0]=offset_y[1]=offset_y[2]=-ystride;
|
||||
offset_y[3]=offset_y[5]=0;
|
||||
offset_y[6]=offset_y[7]=offset_y[8]=ystride;
|
||||
best_site=4;
|
||||
for(sitei=0;sitei<8;sitei++){
|
||||
int site;
|
||||
int xmask;
|
||||
int ymask;
|
||||
int dx;
|
||||
int dy;
|
||||
int mvoffset0;
|
||||
int mvoffset1;
|
||||
site=OC_SQUARE_SITES[0][sitei];
|
||||
dx=OC_SQUARE_DX[site];
|
||||
dy=OC_SQUARE_DY[site];
|
||||
/*The following code SHOULD be equivalent to
|
||||
oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
|
||||
(_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
|
||||
However, it should also be much faster, as it involves no multiplies and
|
||||
doesn't have to handle chroma vectors.*/
|
||||
xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
|
||||
ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
|
||||
mvoffset0=mvoffset_base+(dx&xmask)+(offset_y[site]&ymask);
|
||||
mvoffset1=mvoffset_base+(dx&~xmask)+(offset_y[site]&~ymask);
|
||||
if(_enc->sp_level<OC_SP_LEVEL_NOSATD){
|
||||
err=oc_satd16_halfpel(_enc,frag_buf_offs,fragis,
|
||||
mvoffset0,mvoffset1,src,ref,ystride,_best_err);
|
||||
}
|
||||
else{
|
||||
err=oc_sad16_halfpel(_enc,frag_buf_offs,fragis,
|
||||
mvoffset0,mvoffset1,src,ref,ystride,_best_err);
|
||||
}
|
||||
if(err<_best_err){
|
||||
_best_err=err;
|
||||
best_site=site;
|
||||
}
|
||||
}
|
||||
_vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
|
||||
_vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
|
||||
return _best_err;
|
||||
}
|
||||
|
||||
/*Refines the stored analysis motion vector of a macro block to half-pel
   precision for one reference frame, updating both the vector and its error.
  _enc:   The encoder context.
  _mbi:   The macro block index.
  _frame: The index of the reference frame whose vector is refined.*/
void oc_mcenc_refine1mv(oc_enc_ctx *_enc,int _mbi,int _frame){
  oc_mb_enc_info *mb;
  int             mv[2];
  mb=_enc->mb_info+_mbi;
  /*The stored vector is halved to obtain the full-pel starting point.*/
  mv[0]=OC_DIV2(OC_MV_X(mb->analysis_mv[0][_frame]));
  mv[1]=OC_DIV2(OC_MV_Y(mb->analysis_mv[0][_frame]));
  /*The refinement rewrites mv[] in half-pel units and returns the best
     error, starting from the error already on record.*/
  mb->satd[_frame]=oc_mcenc_ysatd_halfpel_mbrefine(_enc,
   _mbi,mv,mb->satd[_frame],_frame);
  mb->analysis_mv[0][_frame]=OC_MV(mv[0],mv[1]);
}
|
||||
|
||||
#if 0
/*Disabled SAD-only variant of the half-pel single-block refinement.
  Tests the eight sites listed in OC_SQUARE_SITES[0] around a full-pel vector
   using oc_enc_frag_sad2_thresh() and keeps the best one.
  _enc:      The encoder context.
  _vec:      On input, the full-pel vector; on output, the chosen vector in
              half-pel units.
  _src:      The source block.
  _ref:      The co-located block in the reference frame.
  _ystride:  The row stride shared by _src and _ref.
  _offset_y: The vertical step for each of the nine sites.
  _best_err: The error of the full-pel vector, i.e. the score to beat.
  Return: The lowest error found (_best_err if no site improved on it).*/
static int oc_mcenc_ysad_halfpel_brefine(const oc_enc_ctx *_enc,
 int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
 int _offset_y[9],unsigned _best_err){
  int mvoffset_base;
  int best_site;
  int sitei;
  mvoffset_base=_vec[0]+_vec[1]*_ystride;
  best_site=4;
  for(sitei=0;sitei<8;sitei++){
    unsigned err;
    int      site;
    int      xmask;
    int      ymask;
    int      dx;
    int      dy;
    int      mvoffset0;
    int      mvoffset1;
    site=OC_SQUARE_SITES[0][sitei];
    dx=OC_SQUARE_DX[site];
    dy=OC_SQUARE_DY[site];
    /*The following code SHOULD be equivalent to
        oc_state_get_mv_offsets(&_mcenc->enc.state,&mvoffset0,&mvoffset1,
         (_vec[0]<<1)+dx,(_vec[1]<<1)+dy,ref_ystride,0);
      However, it should also be much faster, as it involves no multiplies and
       doesn't have to handle chroma vectors.*/
    xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
    ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
    mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
    mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
    /*This used to pass the undeclared name ystride; the parameter is
       _ystride.*/
    err=oc_enc_frag_sad2_thresh(_enc,_src,
     _ref+mvoffset0,_ref+mvoffset1,_ystride,_best_err);
    if(err<_best_err){
      _best_err=err;
      best_site=site;
    }
  }
  /*Convert to half-pel units and apply the winning site's offset (site 4
     means no site beat the full-pel vector).*/
  _vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
  _vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
  return _best_err;
}
#endif
|
||||
|
||||
static unsigned oc_mcenc_ysatd_halfpel_brefine(const oc_enc_ctx *_enc,
|
||||
int _vec[2],const unsigned char *_src,const unsigned char *_ref,int _ystride,
|
||||
int _offset_y[9],unsigned _best_err){
|
||||
int mvoffset_base;
|
||||
int best_site;
|
||||
int sitei;
|
||||
mvoffset_base=_vec[0]+_vec[1]*_ystride;
|
||||
best_site=4;
|
||||
for(sitei=0;sitei<8;sitei++){
|
||||
unsigned err;
|
||||
int dc;
|
||||
int site;
|
||||
int xmask;
|
||||
int ymask;
|
||||
int dx;
|
||||
int dy;
|
||||
int mvoffset0;
|
||||
int mvoffset1;
|
||||
site=OC_SQUARE_SITES[0][sitei];
|
||||
dx=OC_SQUARE_DX[site];
|
||||
dy=OC_SQUARE_DY[site];
|
||||
/*The following code SHOULD be equivalent to
|
||||
oc_state_get_mv_offsets(&_enc->state,&mvoffsets,0,
|
||||
(_vec[0]<<1)+dx,(_vec[1]<<1)+dy);
|
||||
However, it should also be much faster, as it involves no multiplies and
|
||||
doesn't have to handle chroma vectors.*/
|
||||
xmask=OC_SIGNMASK(((_vec[0]<<1)+dx)^dx);
|
||||
ymask=OC_SIGNMASK(((_vec[1]<<1)+dy)^dy);
|
||||
mvoffset0=mvoffset_base+(dx&xmask)+(_offset_y[site]&ymask);
|
||||
mvoffset1=mvoffset_base+(dx&~xmask)+(_offset_y[site]&~ymask);
|
||||
err=oc_enc_frag_satd2(_enc,&dc,_src,
|
||||
_ref+mvoffset0,_ref+mvoffset1,_ystride);
|
||||
err+=abs(dc);
|
||||
if(err<_best_err){
|
||||
_best_err=err;
|
||||
best_site=site;
|
||||
}
|
||||
}
|
||||
_vec[0]=(_vec[0]<<1)+OC_SQUARE_DX[best_site];
|
||||
_vec[1]=(_vec[1]<<1)+OC_SQUARE_DY[best_site];
|
||||
return _best_err;
|
||||
}
|
||||
|
||||
/*Refines the four per-block motion vectors of a macro block to half-pel
   precision against the previous frame.
  For each block, the stored block_mv is halved to get the full-pel start,
   refined, and the result is written to ref_mv and block_satd.
  _enc: The encoder context.
  _mbi: The macro block index.*/
void oc_mcenc_refine4mv(oc_enc_ctx *_enc,int _mbi){
  oc_mb_enc_info      *mb;
  const ptrdiff_t     *frag_buf_offs;
  const ptrdiff_t     *fragis;
  const unsigned char *cur;
  const unsigned char *ref;
  int                  row_offs[9];
  int                  ystride;
  int                  bi;
  mb=_enc->mb_info+_mbi;
  frag_buf_offs=_enc->state.frag_buf_offs;
  fragis=_enc->state.mb_maps[_mbi][0];
  cur=_enc->state.ref_frame_data[OC_FRAME_IO];
  ref=_enc->state.ref_frame_data[OC_FRAME_PREV];
  ystride=_enc->state.ref_ystride[0];
  /*Vertical step for each site: up one row for sites 0-2, none for sites 3
     and 5, down one row for sites 6-8.
    Entry 4 is not set here.*/
  row_offs[0]=-ystride;
  row_offs[1]=-ystride;
  row_offs[2]=-ystride;
  row_offs[3]=0;
  row_offs[5]=0;
  row_offs[6]=ystride;
  row_offs[7]=ystride;
  row_offs[8]=ystride;
  for(bi=0;bi<4;bi++){
    ptrdiff_t frag_offs;
    int       mv[2];
    frag_offs=frag_buf_offs[fragis[bi]];
    mv[0]=OC_DIV2(OC_MV_X(mb->block_mv[bi]));
    mv[1]=OC_DIV2(OC_MV_Y(mb->block_mv[bi]));
    /*The refinement rewrites mv[] in half-pel units and returns the best
       error, starting from the error already on record for this block.*/
    mb->block_satd[bi]=oc_mcenc_ysatd_halfpel_brefine(_enc,mv,
     cur+frag_offs,ref+frag_offs,ystride,row_offs,mb->block_satd[bi]);
    mb->ref_mv[bi]=OC_MV(mv[0],mv[1]);
  }
}
|
||||
1030
engine/thirdparty/libtheora/modedec.h
vendored
Normal file
1030
engine/thirdparty/libtheora/modedec.h
vendored
Normal file
File diff suppressed because it is too large
Load diff
128
engine/thirdparty/libtheora/ocintrin.h
vendored
Normal file
128
engine/thirdparty/libtheora/ocintrin.h
vendored
Normal file
|
|
@ -0,0 +1,128 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*Some common macros for potential platform-specific optimization.*/
|
||||
#include <math.h>
#if !defined(_ocintrin_H)
# define _ocintrin_H (1)

/*Some specific platforms may have optimized intrinsic or inline assembly
   versions of these functions which can substantially improve performance.
  We define macros for them to allow easy incorporation of these non-ANSI
   features.
  Unless noted otherwise, these macros evaluate their arguments more than
   once, so arguments must not have side effects.*/

/*Note that we do not provide a macro for abs(), because it is provided as a
   library function, which we assume is translated into an intrinsic to avoid
   the function call overhead and then implemented in the smartest way for the
   target platform.
  With modern gcc (4.x), this is true: it uses cmov instructions if the
   architecture supports it and branchless bit-twiddling if it does not (the
   speed difference between the two approaches is not measurable).
  Interestingly, the bit-twiddling method was patented in 2000 (US 6,073,150)
   by Sun Microsystems, despite prior art dating back to at least 1996:
   http://web.archive.org/web/19961201174141/www.x86.org/ftp/articles/pentopt/PENTOPT.TXT
  On gcc 3.x, however, our assumption is not true, as abs() is translated to a
   conditional jump, which is horrible on deeply pipelined architectures (e.g.,
   all consumer architectures for the past decade or more).
  Also be warned that -C*abs(x) where C is a constant is mis-optimized as
   abs(C*x) on every gcc release before 4.2.3.
  See bug http://gcc.gnu.org/bugzilla/show_bug.cgi?id=34130 */

/*Modern gcc (4.x) can compile the naive versions of min and max with cmov if
   given an appropriate architecture, but the branchless bit-twiddling versions
   are just as fast, and do not require any special target architecture.
  Earlier gcc versions (3.x) compiled both versions to the same assembly
   instructions, because of the way they represented ((_b)>(_a)) internally.
  The comparison yields 0 or 1; negating it gives a mask of all zeros or all
   ones that selects whether the difference is applied.*/
#define OC_MAXI(_a,_b)      ((_a)-(((_a)-(_b))&-((_b)>(_a))))
#define OC_MINI(_a,_b)      ((_a)+(((_b)-(_a))&-((_b)<(_a))))
/*Clamps an integer into the given range.
  If _a>_c, then the lower bound _a is respected over the upper bound _c (this
   behavior is required to meet our documented API behavior).
  _a: The lower bound.
  _b: The value to clamp.
  _c: The upper bound.*/
#define OC_CLAMPI(_a,_b,_c) (OC_MAXI(_a,OC_MINI(_b,_c)))
/*Clamps an integer to the range 0...255 and converts it to unsigned char.*/
#define OC_CLAMP255(_x)     ((unsigned char)((((_x)<0)-1)&((_x)|-((_x)>255))))
/*This has a chance of compiling branchless, and is just as fast as the
   bit-twiddling method, which is slightly less portable, since it relies on a
   sign-extended rightshift, which is not guaranteed by ANSI (but present on
   every relevant platform).*/
#define OC_SIGNI(_a)        (((_a)>0)-((_a)<0))
/*Slightly more portable than relying on a sign-extended right-shift (which is
   not guaranteed by ANSI), and just as fast, since gcc (3.x and 4.x both)
   compile it into the right-shift anyway.
  Evaluates to -1 for negative values and 0 otherwise.*/
#define OC_SIGNMASK(_a)     (-((_a)<0))
/*Divides an integer by a power of two, truncating towards 0.
  _dividend: The integer to divide.
  _shift:    The non-negative power of two to divide by.
  _rmask:    (1<<_shift)-1*/
#define OC_DIV_POW2(_dividend,_shift,_rmask)\
  (((_dividend)+(OC_SIGNMASK(_dividend)&(_rmask)))>>(_shift))
/*Divides _x by 65536, truncating towards 0.*/
#define OC_DIV2_16(_x) OC_DIV_POW2(_x,16,0xFFFF)
/*Divides _x by 2, truncating towards 0.*/
#define OC_DIV2(_x) OC_DIV_POW2(_x,1,0x1)
/*Divides _x by 8, truncating towards 0.*/
#define OC_DIV8(_x) OC_DIV_POW2(_x,3,0x7)
/*Divides _x by 16, truncating towards 0.*/
#define OC_DIV16(_x) OC_DIV_POW2(_x,4,0xF)
/*Right shifts _dividend by _shift, adding _rval, and subtracting one for
   negative dividends first.
  When _rval is (1<<_shift-1), this is equivalent to division with rounding
   ties away from zero.*/
#define OC_DIV_ROUND_POW2(_dividend,_shift,_rval)\
  (((_dividend)+OC_SIGNMASK(_dividend)+(_rval))>>(_shift))
/*Divides a _x by 2, rounding towards even numbers.*/
#define OC_DIV2_RE(_x) (((_x)+(((_x)>>1)&1))>>1)
/*Divides a _x by (1<<(_shift)), rounding towards even numbers.*/
#define OC_DIV_POW2_RE(_x,_shift) \
  (((_x)+(((_x)>>(_shift))&1)+(((1<<(_shift))-1)>>1))>>(_shift))
/*Swaps two integers _a and _b if _a>_b.*/
#define OC_SORT2I(_a,_b) \
  do{ \
    int t__; \
    t__=((_a)^(_b))&-((_b)<(_a)); \
    (_a)^=t__; \
    (_b)^=t__; \
  } \
  while(0)

/*Accesses one of four (signed) bytes given an index.
  This can be used to avoid small lookup tables.
  The table is packed in unsigned arithmetic: shifting a signed int into its
   sign bit (as <<24 does for byte values of 0x80 and above) is undefined
   behavior in C.
  The final conversion to signed char keeps only the low 8 bits, so the result
   is the same as with the former signed packing.*/
#define OC_BYTE_TABLE32(_a,_b,_c,_d,_i) \
  ((signed char) \
   ((((_a)&0xFFU)|((_b)&0xFFU)<<8|((_c)&0xFFU)<<16|((_d)&0xFFU)<<24)>> \
   ((_i)*8)))
/*Accesses one of eight (unsigned) nibbles given an index.
  This can be used to avoid small lookup tables.
  As above, the table is packed in unsigned arithmetic to avoid shifting into
   the sign bit (<<28 with a nibble value of 8 or more); the result is
   converted back to int so the macro's type is unchanged.*/
#define OC_UNIBBLE_TABLE32(_a,_b,_c,_d,_e,_f,_g,_h,_i) \
  ((int)(((((_a)&0xFU)|((_b)&0xFU)<<4|((_c)&0xFU)<<8|((_d)&0xFU)<<12| \
   ((_e)&0xFU)<<16|((_f)&0xFU)<<20|((_g)&0xFU)<<24|((_h)&0xFU)<<28)>> \
   ((_i)*4))&0xFU))



/*All of these macros should expect floats as arguments.*/
#define OC_MAXF(_a,_b)      ((_a)<(_b)?(_b):(_a))
#define OC_MINF(_a,_b)      ((_a)>(_b)?(_b):(_a))
/*NOTE(review): As written this evaluates to min(_a,max(_b,_c)), i.e., _a acts
   as the upper bound and _c as the lower bound, the opposite argument order
   from OC_CLAMPI.
  Left unchanged; confirm against callers before relying on either order.*/
#define OC_CLAMPF(_a,_b,_c) (OC_MINF(_a,OC_MAXF(_b,_c)))
#define OC_FABSF(_f)        ((float)fabs(_f))
#define OC_SQRTF(_f)        ((float)sqrt(_f))
#define OC_POWF(_b,_e)      ((float)pow(_b,_e))
#define OC_LOGF(_f)         ((float)log(_f))
#define OC_IFLOORF(_f)      ((int)floor(_f))
#define OC_ICEILF(_f)       ((int)ceil(_f))

#endif
|
||||
127
engine/thirdparty/libtheora/quant.c
vendored
Normal file
127
engine/thirdparty/libtheora/quant.c
vendored
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
#include <ogg/ogg.h>
|
||||
#include "quant.h"
|
||||
#include "decint.h"
|
||||
|
||||
/*The maximum output of the DCT with +/- 255 inputs is +/- 8157.
  These minimum quantizers ensure the result after quantization (and after
   prediction for DC) will be no more than +/- 510.
  The tokenization system can handle values up to +/- 580, so there is no need
   to do any coefficient clamping.
  I would rather have allowed smaller quantizers and had to clamp, but these
   minimums were required when constructing the original VP3 matrices and have
   been formalized in the spec.
  Both tables are indexed by the coding mode index (qti), and their entries
   carry the same <<2 scaling that is applied to the quantizers they bound.*/
static const unsigned OC_DC_QUANT_MIN[2]={
  4<<2,
  8<<2
};
static const unsigned OC_AC_QUANT_MIN[2]={
  2<<2,
  4<<2
};
|
||||
|
||||
/*Initializes the dequantization tables from a set of quantizer info.
|
||||
Currently the dequantizer (and elsewhere enquantizer) tables are expected to
|
||||
be initialized as pointing to the storage reserved for them in the
|
||||
oc_theora_state (resp. oc_enc_ctx) structure.
|
||||
If some tables are duplicates of others, the pointers will be adjusted to
|
||||
point to a single copy of the tables, but the storage for them will not be
|
||||
freed.
|
||||
If you're concerned about the memory footprint, the obvious thing to do is
|
||||
to move the storage out of its fixed place in the structures and allocate
|
||||
it on demand.
|
||||
However, a much, much better option is to only store the quantization
|
||||
matrices being used for the current frame, and to recalculate these as the
|
||||
qi values change between frames (this is what VP3 did).*/
|
||||
void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2],
|
||||
int _pp_dc_scale[64],const th_quant_info *_qinfo){
|
||||
/*Coding mode: intra or inter.*/
|
||||
int qti;
|
||||
/*Y', C_b, C_r*/
|
||||
int pli;
|
||||
for(qti=0;qti<2;qti++)for(pli=0;pli<3;pli++){
|
||||
/*Quality index.*/
|
||||
int qi;
|
||||
/*Range iterator.*/
|
||||
int qri;
|
||||
for(qi=0,qri=0;qri<=_qinfo->qi_ranges[qti][pli].nranges;qri++){
|
||||
th_quant_base base;
|
||||
ogg_uint32_t q;
|
||||
int qi_start;
|
||||
int qi_end;
|
||||
memcpy(base,_qinfo->qi_ranges[qti][pli].base_matrices[qri],
|
||||
sizeof(base));
|
||||
qi_start=qi;
|
||||
if(qri==_qinfo->qi_ranges[qti][pli].nranges)qi_end=qi+1;
|
||||
else qi_end=qi+_qinfo->qi_ranges[qti][pli].sizes[qri];
|
||||
/*Iterate over quality indicies in this range.*/
|
||||
for(;;){
|
||||
ogg_uint32_t qfac;
|
||||
int zzi;
|
||||
int ci;
|
||||
/*In the original VP3.2 code, the rounding offset and the size of the
|
||||
dead zone around 0 were controlled by a "sharpness" parameter.
|
||||
The size of our dead zone is now controlled by the per-coefficient
|
||||
quality thresholds returned by our HVS module.
|
||||
We round down from a more accurate value when the quality of the
|
||||
reconstruction does not fall below our threshold and it saves bits.
|
||||
Hence, all of that VP3.2 code is gone from here, and the remaining
|
||||
floating point code has been implemented as equivalent integer code
|
||||
with exact precision.*/
|
||||
qfac=(ogg_uint32_t)_qinfo->dc_scale[qi]*base[0];
|
||||
/*For postprocessing, not dequantization.*/
|
||||
if(_pp_dc_scale!=NULL)_pp_dc_scale[qi]=(int)(qfac/160);
|
||||
/*Scale DC the coefficient from the proper table.*/
|
||||
q=(qfac/100)<<2;
|
||||
q=OC_CLAMPI(OC_DC_QUANT_MIN[qti],q,OC_QUANT_MAX);
|
||||
_dequant[qi][pli][qti][0]=(ogg_uint16_t)q;
|
||||
/*Now scale AC coefficients from the proper table.*/
|
||||
for(zzi=1;zzi<64;zzi++){
|
||||
q=((ogg_uint32_t)_qinfo->ac_scale[qi]*base[OC_FZIG_ZAG[zzi]]/100)<<2;
|
||||
q=OC_CLAMPI(OC_AC_QUANT_MIN[qti],q,OC_QUANT_MAX);
|
||||
_dequant[qi][pli][qti][zzi]=(ogg_uint16_t)q;
|
||||
}
|
||||
/*If this is a duplicate of a previous matrix, use that instead.
|
||||
This simple check helps us improve cache coherency later.*/
|
||||
{
|
||||
int dupe;
|
||||
int qtj;
|
||||
int plj;
|
||||
dupe=0;
|
||||
for(qtj=0;qtj<=qti;qtj++){
|
||||
for(plj=0;plj<(qtj<qti?3:pli);plj++){
|
||||
if(!memcmp(_dequant[qi][pli][qti],_dequant[qi][plj][qtj],
|
||||
sizeof(oc_quant_table))){
|
||||
dupe=1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if(dupe)break;
|
||||
}
|
||||
if(dupe)_dequant[qi][pli][qti]=_dequant[qi][plj][qtj];
|
||||
}
|
||||
if(++qi>=qi_end)break;
|
||||
/*Interpolate the next base matrix.*/
|
||||
for(ci=0;ci<64;ci++){
|
||||
base[ci]=(unsigned char)(
|
||||
(2*((qi_end-qi)*_qinfo->qi_ranges[qti][pli].base_matrices[qri][ci]+
|
||||
(qi-qi_start)*_qinfo->qi_ranges[qti][pli].base_matrices[qri+1][ci])
|
||||
+_qinfo->qi_ranges[qti][pli].sizes[qri])/
|
||||
(2*_qinfo->qi_ranges[qti][pli].sizes[qri]));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
33
engine/thirdparty/libtheora/quant.h
vendored
Normal file
33
engine/thirdparty/libtheora/quant.h
vendored
Normal file
|
|
@ -0,0 +1,33 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_quant_H)
# define _quant_H (1)
# include "theora/codec.h"
# include "ocintrin.h"

/*A quantization table: 64 16-bit entries.*/
typedef ogg_uint16_t   oc_quant_table[64];


/*Maximum scaled quantizer value.*/
#define OC_QUANT_MAX          (1024<<2)


/*Fills in the dequantization tables described by _qinfo.
  _dequant:     The table pointers to fill in.
  _pp_dc_scale: An optional array of 64 ints; see the definition for how it is
                 used.
  _qinfo:       The quantizer info to build the tables from.*/
void oc_dequant_tables_init(ogg_uint16_t *_dequant[64][3][2],
 int _pp_dc_scale[64],const th_quant_info *_qinfo);

#endif
|
||||
1147
engine/thirdparty/libtheora/rate.c
vendored
Normal file
1147
engine/thirdparty/libtheora/rate.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
1267
engine/thirdparty/libtheora/state.c
vendored
Normal file
1267
engine/thirdparty/libtheora/state.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
552
engine/thirdparty/libtheora/state.h
vendored
Normal file
552
engine/thirdparty/libtheora/state.h
vendored
Normal file
|
|
@ -0,0 +1,552 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: internal.h 17337 2010-07-19 16:08:54Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#if !defined(_state_H)
|
||||
# define _state_H (1)
|
||||
# include "internal.h"
|
||||
# include "huffman.h"
|
||||
# include "quant.h"
|
||||
|
||||
|
||||
|
||||
/*A single quadrant of the map from a super block to fragment numbers.*/
|
||||
typedef ptrdiff_t oc_sb_map_quad[4];
|
||||
/*A map from a super block to fragment numbers.*/
|
||||
typedef oc_sb_map_quad oc_sb_map[4];
|
||||
/*A single plane of the map from a macro block to fragment numbers.*/
|
||||
typedef ptrdiff_t oc_mb_map_plane[4];
|
||||
/*A map from a macro block to fragment numbers.*/
|
||||
typedef oc_mb_map_plane oc_mb_map[3];
|
||||
/*A motion vector.*/
|
||||
typedef ogg_int16_t oc_mv;
|
||||
|
||||
typedef struct oc_sb_flags oc_sb_flags;
|
||||
typedef struct oc_border_info oc_border_info;
|
||||
typedef struct oc_fragment oc_fragment;
|
||||
typedef struct oc_fragment_plane oc_fragment_plane;
|
||||
typedef struct oc_base_opt_vtable oc_base_opt_vtable;
|
||||
typedef struct oc_base_opt_data oc_base_opt_data;
|
||||
typedef struct oc_state_dispatch_vtable oc_state_dispatch_vtable;
|
||||
typedef struct oc_theora_state oc_theora_state;
|
||||
|
||||
|
||||
|
||||
/*Shared accelerated functions.*/
|
||||
# if defined(OC_X86_ASM)
|
||||
# if defined(_MSC_VER)
|
||||
# include "x86_vc/x86int.h"
|
||||
# else
|
||||
# include "x86/x86int.h"
|
||||
# endif
|
||||
# endif
|
||||
# if defined(OC_ARM_ASM)
|
||||
# include "arm/armint.h"
|
||||
# endif
|
||||
# if defined(OC_C64X_ASM)
|
||||
# include "c64x/c64xint.h"
|
||||
# endif
|
||||
|
||||
# if !defined(oc_state_accel_init)
|
||||
# define oc_state_accel_init oc_state_accel_init_c
|
||||
# endif
|
||||
# if defined(OC_STATE_USE_VTABLE)
|
||||
# if !defined(oc_frag_copy)
|
||||
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||
((*(_state)->opt_vtable.frag_copy)(_dst,_src,_ystride))
|
||||
# endif
|
||||
# if !defined(oc_frag_copy_list)
|
||||
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs) \
|
||||
((*(_state)->opt_vtable.frag_copy_list)(_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs))
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_intra)
|
||||
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
|
||||
((*(_state)->opt_vtable.frag_recon_intra)(_dst,_dst_ystride,_residue))
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter)
|
||||
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||
((*(_state)->opt_vtable.frag_recon_inter)(_dst,_src,_ystride,_residue))
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter2)
|
||||
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||
((*(_state)->opt_vtable.frag_recon_inter2)(_dst, \
|
||||
_src1,_src2,_ystride,_residue))
|
||||
# endif
|
||||
# if !defined(oc_idct8x8)
|
||||
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
|
||||
((*(_state)->opt_vtable.idct8x8)(_y,_x,_last_zzi))
|
||||
# endif
|
||||
# if !defined(oc_state_frag_recon)
|
||||
# define oc_state_frag_recon(_state,_fragi, \
|
||||
_pli,_dct_coeffs,_last_zzi,_dc_quant) \
|
||||
((*(_state)->opt_vtable.state_frag_recon)(_state,_fragi, \
|
||||
_pli,_dct_coeffs,_last_zzi,_dc_quant))
|
||||
# endif
|
||||
# if !defined(oc_loop_filter_init)
|
||||
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||
((*(_state)->opt_vtable.loop_filter_init)(_bv,_flimit))
|
||||
# endif
|
||||
# if !defined(oc_state_loop_filter_frag_rows)
|
||||
# define oc_state_loop_filter_frag_rows(_state, \
|
||||
_bv,_refi,_pli,_fragy0,_fragy_end) \
|
||||
((*(_state)->opt_vtable.state_loop_filter_frag_rows)(_state, \
|
||||
_bv,_refi,_pli,_fragy0,_fragy_end))
|
||||
# endif
|
||||
# if !defined(oc_restore_fpu)
|
||||
# define oc_restore_fpu(_state) \
|
||||
((*(_state)->opt_vtable.restore_fpu)())
|
||||
# endif
|
||||
# else
|
||||
# if !defined(oc_frag_copy)
|
||||
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||
oc_frag_copy_c(_dst,_src,_ystride)
|
||||
# endif
|
||||
# if !defined(oc_frag_copy_list)
|
||||
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs) \
|
||||
oc_frag_copy_list_c(_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs)
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_intra)
|
||||
# define oc_frag_recon_intra(_state,_dst,_dst_ystride,_residue) \
|
||||
oc_frag_recon_intra_c(_dst,_dst_ystride,_residue)
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter)
|
||||
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||
oc_frag_recon_inter_c(_dst,_src,_ystride,_residue)
|
||||
# endif
|
||||
# if !defined(oc_frag_recon_inter2)
|
||||
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||
oc_frag_recon_inter2_c(_dst,_src1,_src2,_ystride,_residue)
|
||||
# endif
|
||||
# if !defined(oc_idct8x8)
|
||||
# define oc_idct8x8(_state,_y,_x,_last_zzi) oc_idct8x8_c(_y,_x,_last_zzi)
|
||||
# endif
|
||||
# if !defined(oc_state_frag_recon)
|
||||
# define oc_state_frag_recon oc_state_frag_recon_c
|
||||
# endif
|
||||
# if !defined(oc_loop_filter_init)
|
||||
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||
oc_loop_filter_init_c(_bv,_flimit)
|
||||
# endif
|
||||
# if !defined(oc_state_loop_filter_frag_rows)
|
||||
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_c
|
||||
# endif
|
||||
# if !defined(oc_restore_fpu)
|
||||
# define oc_restore_fpu(_state) do{}while(0)
|
||||
# endif
|
||||
# endif
|
||||
|
||||
|
||||
|
||||
/*A keyframe.*/
|
||||
# define OC_INTRA_FRAME (0)
|
||||
/*A predicted frame.*/
|
||||
# define OC_INTER_FRAME (1)
|
||||
/*A frame of unknown type (frame type decision has not yet been made).*/
|
||||
# define OC_UNKWN_FRAME (-1)
|
||||
|
||||
/*The amount of padding to add to the reconstructed frame buffers on all
|
||||
sides.
|
||||
This is used to allow unrestricted motion vectors without special casing.
|
||||
This must be a multiple of 2.*/
|
||||
# define OC_UMV_PADDING (16)
|
||||
|
||||
/*Frame classification indices.*/
|
||||
/*The previous golden frame.*/
|
||||
# define OC_FRAME_GOLD (0)
|
||||
/*The previous frame.*/
|
||||
# define OC_FRAME_PREV (1)
|
||||
/*The current frame.*/
|
||||
# define OC_FRAME_SELF (2)
|
||||
/*Used to mark uncoded fragments (for DC prediction).*/
|
||||
# define OC_FRAME_NONE (3)
|
||||
|
||||
/*The input or output buffer.*/
|
||||
# define OC_FRAME_IO (3)
|
||||
/*Uncompressed prev golden frame.*/
|
||||
# define OC_FRAME_GOLD_ORIG (4)
|
||||
/*Uncompressed previous frame. */
|
||||
# define OC_FRAME_PREV_ORIG (5)
|
||||
|
||||
/*Macroblock modes.*/
|
||||
/*Macro block is invalid: It is never coded.*/
|
||||
# define OC_MODE_INVALID (-1)
|
||||
/*Encoded difference from the same macro block in the previous frame.*/
|
||||
# define OC_MODE_INTER_NOMV (0)
|
||||
/*Encoded with no motion compensated prediction.*/
|
||||
# define OC_MODE_INTRA (1)
|
||||
/*Encoded difference from the previous frame offset by the given motion
|
||||
vector.*/
|
||||
# define OC_MODE_INTER_MV (2)
|
||||
/*Encoded difference from the previous frame offset by the last coded motion
|
||||
vector.*/
|
||||
# define OC_MODE_INTER_MV_LAST (3)
|
||||
/*Encoded difference from the previous frame offset by the second to last
|
||||
coded motion vector.*/
|
||||
# define OC_MODE_INTER_MV_LAST2 (4)
|
||||
/*Encoded difference from the same macro block in the previous golden
|
||||
frame.*/
|
||||
# define OC_MODE_GOLDEN_NOMV (5)
|
||||
/*Encoded difference from the previous golden frame offset by the given motion
|
||||
vector.*/
|
||||
# define OC_MODE_GOLDEN_MV (6)
|
||||
/*Encoded difference from the previous frame offset by the individual motion
|
||||
vectors given for each block.*/
|
||||
# define OC_MODE_INTER_MV_FOUR (7)
|
||||
/*The number of (coded) modes.*/
|
||||
# define OC_NMODES (8)
|
||||
|
||||
/*Determines the reference frame used for a given MB mode.*/
|
||||
# define OC_FRAME_FOR_MODE(_x) \
|
||||
OC_UNIBBLE_TABLE32(OC_FRAME_PREV,OC_FRAME_SELF,OC_FRAME_PREV,OC_FRAME_PREV, \
|
||||
OC_FRAME_PREV,OC_FRAME_GOLD,OC_FRAME_GOLD,OC_FRAME_PREV,(_x))
|
||||
|
||||
/*Constants for the packet state machine common between encoder and decoder.*/
|
||||
|
||||
/*Next packet to emit/read: Codec info header.*/
|
||||
# define OC_PACKET_INFO_HDR (-3)
|
||||
/*Next packet to emit/read: Comment header.*/
|
||||
# define OC_PACKET_COMMENT_HDR (-2)
|
||||
/*Next packet to emit/read: Codec setup header.*/
|
||||
# define OC_PACKET_SETUP_HDR (-1)
|
||||
/*No more packets to emit/read.*/
|
||||
# define OC_PACKET_DONE (INT_MAX)
|
||||
|
||||
|
||||
|
||||
/*Packs a motion vector into a single oc_mv: the X component occupies the low
   8 bits and the Y component the bits above them.
  _y is scaled by 256 instead of being left-shifted by 8, because left-shifting
   a negative value is undefined behavior in C.*/
#define OC_MV(_x,_y) ((oc_mv)(((_x)&0xFF)|((_y)*256)))
|
||||
/*Extracts the X component of a packed motion vector by converting it to a
   signed char.*/
#define OC_MV_X(_mv) ((signed char)(_mv))
|
||||
/*Extracts the Y component of a packed motion vector by shifting out the low 8
   bits (which hold the X component).*/
#define OC_MV_Y(_mv) ((_mv)>>8)
|
||||
#define OC_MV_ADD(_mv1,_mv2) \
|
||||
OC_MV(OC_MV_X(_mv1)+OC_MV_X(_mv2), \
|
||||
OC_MV_Y(_mv1)+OC_MV_Y(_mv2))
|
||||
#define OC_MV_SUB(_mv1,_mv2) \
|
||||
OC_MV(OC_MV_X(_mv1)-OC_MV_X(_mv2), \
|
||||
OC_MV_Y(_mv1)-OC_MV_Y(_mv2))
|
||||
|
||||
|
||||
|
||||
/*Super blocks are 32x32 segments of pixels in a single color plane indexed
|
||||
in image order.
|
||||
Internally, super blocks are broken up into four quadrants, each of which
|
||||
contains a 2x2 pattern of blocks, each of which is an 8x8 block of pixels.
|
||||
Quadrants, and the blocks within them, are indexed in a special order called
|
||||
a "Hilbert curve" within the super block.
|
||||
|
||||
In order to differentiate between the Hilbert-curve indexing strategy and
|
||||
the regular image order indexing strategy, blocks indexed in image order
|
||||
are called "fragments".
|
||||
Fragments are indexed in image order, left to right, then bottom to top,
|
||||
from Y' plane to Cb plane to Cr plane.
|
||||
|
||||
The co-located fragments in all image planes corresponding to the location
|
||||
of a single quadrant of a luma plane super block form a macro block.
|
||||
Thus there is only a single set of macro blocks for all planes, each of which
|
||||
contains between 6 and 12 fragments, depending on the pixel format.
|
||||
Therefore macro block information is kept in a separate set of arrays from
|
||||
super blocks to avoid unused space in the other planes.
|
||||
The lists are indexed in super block order.
|
||||
That is, the macro block corresponding to the macro block mbi in (luma plane)
|
||||
super block sbi is at index (sbi<<2|mbi).
|
||||
Thus the number of macro blocks in each dimension is always twice the number
|
||||
of super blocks, even when only an odd number fall inside the coded frame.
|
||||
These "extra" macro blocks are just an artifact of our internal data layout,
|
||||
and not part of the coded stream; they are flagged with a negative MB mode.*/
|
||||
|
||||
|
||||
|
||||
/*Super block information.*/
|
||||
struct oc_sb_flags{
|
||||
unsigned char coded_fully:1;
|
||||
unsigned char coded_partially:1;
|
||||
unsigned char quad_valid:4;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Information about a fragment which intersects the border of the displayable
|
||||
region.
|
||||
This marks which pixels belong to the displayable region.*/
|
||||
struct oc_border_info{
|
||||
/*A bit mask marking which pixels are in the displayable region.
|
||||
Pixel (x,y) corresponds to bit (y<<3|x).*/
|
||||
ogg_int64_t mask;
|
||||
/*The number of pixels in the displayable region.
|
||||
This is always positive, and always less than 64.*/
|
||||
int npixels;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*Fragment information.*/
|
||||
struct oc_fragment{
|
||||
/*A flag indicating whether or not this fragment is coded.*/
|
||||
unsigned coded:1;
|
||||
/*A flag indicating that this entire fragment lies outside the displayable
|
||||
region of the frame.
|
||||
Note the contrast with an invalid macro block, which is outside the coded
|
||||
frame, not just the displayable one.
|
||||
There are no fragments outside the coded frame by construction.*/
|
||||
unsigned invalid:1;
|
||||
/*The index of the quality index used for this fragment's AC coefficients.*/
|
||||
unsigned qii:4;
|
||||
/*The index of the reference frame this fragment is predicted from.*/
|
||||
unsigned refi:2;
|
||||
/*The mode of the macroblock this fragment belongs to.*/
|
||||
unsigned mb_mode:3;
|
||||
/*The index of the associated border information for fragments which lie
|
||||
partially outside the displayable region.
|
||||
For fragments completely inside or outside this region, this is -1.
|
||||
Note that the C standard requires an explicit signed keyword for bitfield
|
||||
types, since some compilers may treat them as unsigned without it.*/
|
||||
signed int borderi:5;
|
||||
/*The prediction-corrected DC component.
|
||||
Note that the C standard requires an explicit signed keyword for bitfield
|
||||
types, since some compilers may treat them as unsigned without it.*/
|
||||
signed int dc:16;
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*A description of each fragment plane.*/
|
||||
struct oc_fragment_plane{
|
||||
/*The number of fragments in the horizontal direction.*/
|
||||
int nhfrags;
|
||||
/*The number of fragments in the vertical direction.*/
|
||||
int nvfrags;
|
||||
/*The offset of the first fragment in the plane.*/
|
||||
ptrdiff_t froffset;
|
||||
/*The total number of fragments in the plane.*/
|
||||
ptrdiff_t nfrags;
|
||||
/*The number of super blocks in the horizontal direction.*/
|
||||
unsigned nhsbs;
|
||||
/*The number of super blocks in the vertical direction.*/
|
||||
unsigned nvsbs;
|
||||
/*The offset of the first super block in the plane.*/
|
||||
unsigned sboffset;
|
||||
/*The total number of super blocks in the plane.*/
|
||||
unsigned nsbs;
|
||||
};
|
||||
|
||||
|
||||
typedef void (*oc_state_loop_filter_frag_rows_func)(
|
||||
const oc_theora_state *_state,signed char _bv[256],int _refi,int _pli,
|
||||
int _fragy0,int _fragy_end);
|
||||
|
||||
/*The shared (encoder and decoder) functions that have accelerated variants.*/
|
||||
struct oc_base_opt_vtable{
|
||||
void (*frag_copy)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void (*frag_copy_list)(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void (*frag_recon_intra)(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter)(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*frag_recon_inter2)(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void (*idct8x8)(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void (*state_frag_recon)(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void (*loop_filter_init)(signed char _bv[256],int _flimit);
|
||||
oc_state_loop_filter_frag_rows_func state_loop_filter_frag_rows;
|
||||
void (*restore_fpu)(void);
|
||||
};
|
||||
|
||||
/*The shared (encoder and decoder) tables that vary according to which variants
|
||||
of the above functions are used.*/
|
||||
struct oc_base_opt_data{
|
||||
const unsigned char *dct_fzig_zag;
|
||||
};
|
||||
|
||||
|
||||
/*State information common to both the encoder and decoder.*/
|
||||
struct oc_theora_state{
|
||||
/*The stream information.*/
|
||||
th_info info;
|
||||
# if defined(OC_STATE_USE_VTABLE)
|
||||
/*Table for shared accelerated functions.*/
|
||||
oc_base_opt_vtable opt_vtable;
|
||||
# endif
|
||||
/*Table for shared data used by accelerated functions.*/
|
||||
oc_base_opt_data opt_data;
|
||||
/*CPU flags to detect the presence of extended instruction sets.*/
|
||||
ogg_uint32_t cpu_flags;
|
||||
/*The fragment plane descriptions.*/
|
||||
oc_fragment_plane fplanes[3];
|
||||
/*The list of fragments, indexed in image order.*/
|
||||
oc_fragment *frags;
|
||||
/*The offset into the reference frame buffer to the upper-left pixel of
|
||||
each fragment.*/
|
||||
ptrdiff_t *frag_buf_offs;
|
||||
/*The motion vector for each fragment.*/
|
||||
oc_mv *frag_mvs;
|
||||
/*The total number of fragments in a single frame.*/
|
||||
ptrdiff_t nfrags;
|
||||
/*The list of super block maps, indexed in image order.*/
|
||||
oc_sb_map *sb_maps;
|
||||
/*The list of super block flags, indexed in image order.*/
|
||||
oc_sb_flags *sb_flags;
|
||||
/*The total number of super blocks in a single frame.*/
|
||||
unsigned nsbs;
|
||||
/*The fragments from each color plane that belong to each macro block.
|
||||
Fragments are stored in image order (left to right then top to bottom).
|
||||
When chroma components are decimated, the extra fragments have an index of
|
||||
-1.*/
|
||||
oc_mb_map *mb_maps;
|
||||
/*The list of macro block modes.
|
||||
A negative number indicates the macro block lies entirely outside the
|
||||
coded frame.*/
|
||||
signed char *mb_modes;
|
||||
/*The number of macro blocks in the X direction.*/
|
||||
unsigned nhmbs;
|
||||
/*The number of macro blocks in the Y direction.*/
|
||||
unsigned nvmbs;
|
||||
/*The total number of macro blocks.*/
|
||||
size_t nmbs;
|
||||
/*The list of coded fragments, in coded order.
|
||||
Uncoded fragments are stored in reverse order from the end of the list.*/
|
||||
ptrdiff_t *coded_fragis;
|
||||
/*The number of coded fragments in each plane.*/
|
||||
ptrdiff_t ncoded_fragis[3];
|
||||
/*The total number of coded fragments.*/
|
||||
ptrdiff_t ntotal_coded_fragis;
|
||||
/*The actual buffers used for the reference frames.*/
|
||||
th_ycbcr_buffer ref_frame_bufs[6];
|
||||
/*The index of the buffers being used for each OC_FRAME_* reference frame.*/
|
||||
int ref_frame_idx[6];
|
||||
/*The storage for the reference frame buffers.
|
||||
This is just ref_frame_bufs[ref_frame_idx[i]][0].data, but is cached here
|
||||
for faster look-up.*/
|
||||
unsigned char *ref_frame_data[6];
|
||||
/*The handle used to allocate the reference frame buffers.*/
|
||||
unsigned char *ref_frame_handle;
|
||||
/*The strides for each plane in the reference frames.*/
|
||||
int ref_ystride[3];
|
||||
/*The number of unique border patterns.*/
|
||||
int nborders;
|
||||
/*The unique border patterns for all border fragments.
|
||||
The borderi field of fragments which straddle the border indexes this
|
||||
list.*/
|
||||
oc_border_info borders[16];
|
||||
/*The frame number of the last keyframe.*/
|
||||
ogg_int64_t keyframe_num;
|
||||
/*The frame number of the current frame.*/
|
||||
ogg_int64_t curframe_num;
|
||||
/*The granpos of the current frame.*/
|
||||
ogg_int64_t granpos;
|
||||
/*The type of the current frame.*/
|
||||
signed char frame_type;
|
||||
/*The bias to add to the frame count when computing granule positions.*/
|
||||
unsigned char granpos_bias;
|
||||
/*The number of quality indices used in the current frame.*/
|
||||
unsigned char nqis;
|
||||
/*The quality indices of the current frame.*/
|
||||
unsigned char qis[3];
|
||||
/*The dequantization tables, stored in zig-zag order, and indexed by
|
||||
qi, pli, qti, and zzi.*/
|
||||
ogg_uint16_t *dequant_tables[64][3][2];
|
||||
OC_ALIGN16(oc_quant_table dequant_table_data[64][3][2]);
|
||||
/*Loop filter strength parameters.*/
|
||||
unsigned char loop_filter_limits[64];
|
||||
};
|
||||
|
||||
|
||||
|
||||
/*The function type used to fill in the chroma plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.
|
||||
_cbmvs: The chroma block-level motion vectors to fill in.
|
||||
_lmbmv: The luma macro-block level motion vector to fill in for use in
|
||||
prediction.
|
||||
_lbmvs: The luma block-level motion vectors.*/
|
||||
typedef void (*oc_set_chroma_mvs_func)(oc_mv _cbmvs[4],const oc_mv _lbmvs[4]);
|
||||
|
||||
|
||||
|
||||
/*A table of functions used to fill in the Cb,Cr plane motion vectors for a
|
||||
macro block when 4 different motion vectors are specified in the luma
|
||||
plane.*/
|
||||
extern const oc_set_chroma_mvs_func OC_SET_CHROMA_MVS_TABLE[TH_PF_NFORMATS];
|
||||
|
||||
|
||||
|
||||
int oc_state_init(oc_theora_state *_state,const th_info *_info,int _nrefs);
|
||||
void oc_state_clear(oc_theora_state *_state);
|
||||
void oc_state_accel_init_c(oc_theora_state *_state);
|
||||
void oc_state_borders_fill_rows(oc_theora_state *_state,int _refi,int _pli,
|
||||
int _y0,int _yend);
|
||||
void oc_state_borders_fill_caps(oc_theora_state *_state,int _refi,int _pli);
|
||||
void oc_state_borders_fill(oc_theora_state *_state,int _refi);
|
||||
void oc_state_fill_buffer_ptrs(oc_theora_state *_state,int _buf_idx,
|
||||
th_ycbcr_buffer _img);
|
||||
int oc_state_mbi_for_pos(oc_theora_state *_state,int _mbx,int _mby);
|
||||
int oc_state_get_mv_offsets(const oc_theora_state *_state,int _offsets[2],
|
||||
int _pli,oc_mv _mv);
|
||||
|
||||
void oc_loop_filter_init_c(signed char _bv[256],int _flimit);
|
||||
void oc_state_loop_filter(oc_theora_state *_state,int _frame);
|
||||
# if defined(OC_DUMP_IMAGES)
|
||||
int oc_state_dump_frame(const oc_theora_state *_state,int _frame,
|
||||
const char *_suf);
|
||||
# endif
|
||||
|
||||
/*Default pure-C implementations of shared accelerated functions.*/
|
||||
void oc_frag_copy_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _src_ystride);
|
||||
void oc_frag_copy_list_c(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_c(unsigned char *_dst,int _dst_ystride,
|
||||
const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter_c(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_frag_recon_inter2_c(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t _residue[64]);
|
||||
void oc_idct8x8_c(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_c(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_state_loop_filter_frag_rows_c(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_c(void);
|
||||
|
||||
/*We need a way to call a few encoder functions without introducing a link-time
|
||||
dependency into the decoder, while still allowing the old alpha API which
|
||||
does not distinguish between encoder and decoder objects to be used.
|
||||
We do this by placing a function table at the start of the encoder object
|
||||
which can dispatch into the encoder library.
|
||||
We do a similar thing for the decoder in case we ever decide to split off a
|
||||
common base library.*/
|
||||
typedef void (*oc_state_clear_func)(theora_state *_th);
|
||||
typedef int (*oc_state_control_func)(theora_state *th,int _req,
|
||||
void *_buf,size_t _buf_sz);
|
||||
typedef ogg_int64_t (*oc_state_granule_frame_func)(theora_state *_th,
|
||||
ogg_int64_t _granulepos);
|
||||
typedef double (*oc_state_granule_time_func)(theora_state *_th,
|
||||
ogg_int64_t _granulepos);
|
||||
|
||||
|
||||
struct oc_state_dispatch_vtable{
|
||||
oc_state_clear_func clear;
|
||||
oc_state_control_func control;
|
||||
oc_state_granule_frame_func granule_frame;
|
||||
oc_state_granule_time_func granule_time;
|
||||
};
|
||||
|
||||
#endif
|
||||
606
engine/thirdparty/libtheora/theora/codec.h
vendored
Normal file
606
engine/thirdparty/libtheora/theora/codec.h
vendored
Normal file
|
|
@ -0,0 +1,606 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/**\mainpage
|
||||
*
|
||||
* \section intro Introduction
|
||||
*
|
||||
* This is the documentation for the <tt>libtheora</tt> C API.
|
||||
*
|
||||
* The \c libtheora package is the current reference
|
||||
* implementation for <a href="http://www.theora.org/">Theora</a>, a free,
|
||||
* patent-unencumbered video codec.
|
||||
* Theora is derived from On2's VP3 codec with additional features and
|
||||
* integration with Ogg multimedia formats by
|
||||
* <a href="http://www.xiph.org/">the Xiph.Org Foundation</a>.
|
||||
* Complete documentation of the format itself is available in
|
||||
* <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a>.
|
||||
*
|
||||
* \section Organization
|
||||
*
|
||||
* The functions documented here are divided between two
|
||||
* separate libraries:
|
||||
* - \c libtheoraenc contains the encoder interface,
|
||||
* described in \ref encfuncs.
|
||||
* - \c libtheoradec contains the decoder interface,
|
||||
* described in \ref decfuncs, \n
|
||||
* and additional \ref basefuncs.
|
||||
*
|
||||
* New code should link to \c libtheoradec. If using encoder
|
||||
* features, it must also link to \c libtheoraenc.
|
||||
*
|
||||
* During initial development, prior to the 1.0 release,
|
||||
* \c libtheora exported a different \ref oldfuncs which
|
||||
* combined both encode and decode functions.
|
||||
* In general, legacy API symbols can be identified
|
||||
* by their \c theora_ or \c OC_ namespace prefixes.
|
||||
* The current API uses \c th_ or \c TH_ instead.
|
||||
*
|
||||
* While deprecated, \c libtheoraenc and \c libtheoradec
|
||||
* together export the legacy API as well as the one documented above.
|
||||
* Likewise, the legacy \c libtheora included with this package
|
||||
* exports the new 1.x API. Older code and build scripts can therefore
|
||||
* be updated independently to the current scheme.
|
||||
*/
|
||||
|
||||
/**\file
|
||||
* The shared <tt>libtheoradec</tt> and <tt>libtheoraenc</tt> C API.
|
||||
* You don't need to include this directly.*/
|
||||
|
||||
#if !defined(_O_THEORA_CODEC_H_)
|
||||
# define _O_THEORA_CODEC_H_ (1)
|
||||
# include <ogg/ogg.h>
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/**\name Return codes*/
|
||||
/*@{*/
|
||||
/**An invalid pointer was provided.*/
|
||||
#define TH_EFAULT (-1)
|
||||
/**An invalid argument was provided.*/
|
||||
#define TH_EINVAL (-10)
|
||||
/**The contents of the header were incomplete, invalid, or unexpected.*/
|
||||
#define TH_EBADHEADER (-20)
|
||||
/**The header does not belong to a Theora stream.*/
|
||||
#define TH_ENOTFORMAT (-21)
|
||||
/**The bitstream version is too high.*/
|
||||
#define TH_EVERSION (-22)
|
||||
/**The specified function is not implemented.*/
|
||||
#define TH_EIMPL (-23)
|
||||
/**There were errors in the video data packet.*/
|
||||
#define TH_EBADPACKET (-24)
|
||||
/**The decoded packet represented a dropped frame.
|
||||
The player can continue to display the current frame, as the contents of the
|
||||
decoded frame buffer have not changed.*/
|
||||
#define TH_DUPFRAME (1)
|
||||
/*@}*/
|
||||
|
||||
/**The currently defined color space tags.
|
||||
* See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a>, Chapter 4, for exact details on the meaning
|
||||
* of each of these color spaces.*/
|
||||
typedef enum{
|
||||
/**The color space was not specified at the encoder.
|
||||
It may be conveyed by an external means.*/
|
||||
TH_CS_UNSPECIFIED,
|
||||
/**A color space designed for NTSC content.*/
|
||||
TH_CS_ITU_REC_470M,
|
||||
/**A color space designed for PAL/SECAM content.*/
|
||||
TH_CS_ITU_REC_470BG,
|
||||
/**The total number of currently defined color spaces.*/
|
||||
TH_CS_NSPACES
|
||||
}th_colorspace;
|
||||
|
||||
/**The currently defined pixel format tags.
|
||||
* See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a>, Section 4.4, for details on the precise sample
|
||||
* locations.*/
|
||||
typedef enum{
|
||||
/**Chroma decimation by 2 in both the X and Y directions (4:2:0).
|
||||
The Cb and Cr chroma planes are half the width and half the
|
||||
height of the luma plane.*/
|
||||
TH_PF_420,
|
||||
/**Currently reserved.*/
|
||||
TH_PF_RSVD,
|
||||
/**Chroma decimation by 2 in the X direction (4:2:2).
|
||||
The Cb and Cr chroma planes are half the width of the luma plane, but full
|
||||
height.*/
|
||||
TH_PF_422,
|
||||
/**No chroma decimation (4:4:4).
|
||||
The Cb and Cr chroma planes are full width and full height.*/
|
||||
TH_PF_444,
|
||||
/**The total number of currently defined pixel formats.*/
|
||||
TH_PF_NFORMATS
|
||||
}th_pixel_fmt;
|
||||
|
||||
|
||||
|
||||
/**A buffer for a single color plane in an uncompressed image.
|
||||
* This contains the image data in a left-to-right, top-down format.
|
||||
* Each row of pixels is stored contiguously in memory, but successive
|
||||
* rows need not be.
|
||||
* Use \a stride to compute the offset of the next row.
|
||||
* The encoder accepts both positive \a stride values (top-down in memory)
|
||||
* and negative (bottom-up in memory).
|
||||
* The decoder currently always generates images with positive strides.*/
|
||||
typedef struct{
|
||||
/**The width of this plane.*/
|
||||
int width;
|
||||
/**The height of this plane.*/
|
||||
int height;
|
||||
/**The offset in bytes between successive rows.*/
|
||||
int stride;
|
||||
/**A pointer to the beginning of the first row.*/
|
||||
unsigned char *data;
|
||||
}th_img_plane;
|
||||
|
||||
/**A complete image buffer for an uncompressed frame.
|
||||
* The chroma planes may be decimated by a factor of two in either
|
||||
* direction, as indicated by th_info#pixel_fmt.
|
||||
* The width and height of the Y' plane must be multiples of 16.
|
||||
* They may need to be cropped for display, using the rectangle
|
||||
* specified by th_info#pic_x, th_info#pic_y, th_info#pic_width,
|
||||
* and th_info#pic_height.
|
||||
* All samples are 8 bits.
|
||||
* \note The term YUV often used to describe a colorspace is ambiguous.
|
||||
* The exact parameters of the RGB to YUV conversion process aside, in
|
||||
* many contexts the U and V channels actually have opposite meanings.
|
||||
* To avoid this confusion, we are explicit: the name of the color
|
||||
* channels are Y'CbCr, and they appear in that order, always.
|
||||
* The prime symbol denotes that the Y channel is non-linear.
|
||||
* Cb and Cr stand for "Chroma blue" and "Chroma red", respectively.*/
|
||||
typedef th_img_plane th_ycbcr_buffer[3];
|
||||
|
||||
/**Theora bitstream information.
|
||||
* This contains the basic playback parameters for a stream, and corresponds to
|
||||
* the initial 'info' header packet.
|
||||
* To initialize an encoder, the application fills in this structure and
|
||||
* passes it to th_encode_alloc().
|
||||
* A default encoding mode is chosen based on the values of the #quality and
|
||||
* #target_bitrate fields.
|
||||
* On decode, it is filled in by th_decode_headerin(), and then passed to
|
||||
* th_decode_alloc().
|
||||
*
|
||||
* Encoded Theora frames must be a multiple of 16 in size;
|
||||
* this is what the #frame_width and #frame_height members represent.
|
||||
* To handle arbitrary picture sizes, a crop rectangle is specified in the
|
||||
* #pic_x, #pic_y, #pic_width and #pic_height members.
|
||||
*
|
||||
* All frame buffers contain pointers to the full, padded frame.
|
||||
* However, the current encoder <em>will not</em> reference pixels outside of
|
||||
* the cropped picture region, and the application does not need to fill them
|
||||
* in.
|
||||
* The decoder <em>will</em> allocate storage for a full frame, but the
|
||||
* application <em>should not</em> rely on the padding containing sensible
|
||||
* data.
|
||||
*
|
||||
* It is also generally recommended that the offsets and sizes should still be
|
||||
* multiples of 2 to avoid chroma sampling shifts when chroma is sub-sampled.
|
||||
* See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a>, Section 4.4, for more details.
|
||||
*
|
||||
* Frame rate, in frames per second, is stored as a rational fraction, as is
|
||||
* the pixel aspect ratio.
|
||||
* Note that this refers to the aspect ratio of the individual pixels, not of
|
||||
* the overall frame itself.
|
||||
* The frame aspect ratio can be computed from pixel aspect ratio using the
|
||||
* image dimensions.*/
|
||||
typedef struct{
|
||||
/**\name Theora version
|
||||
* Bitstream version information.*/
|
||||
/*@{*/
|
||||
unsigned char version_major;
|
||||
unsigned char version_minor;
|
||||
unsigned char version_subminor;
|
||||
/*@}*/
|
||||
/**The encoded frame width.
|
||||
* This must be a multiple of 16, and less than 1048576.*/
|
||||
ogg_uint32_t frame_width;
|
||||
/**The encoded frame height.
|
||||
* This must be a multiple of 16, and less than 1048576.*/
|
||||
ogg_uint32_t frame_height;
|
||||
/**The displayed picture width.
|
||||
* This must be no larger than #frame_width.*/
|
||||
ogg_uint32_t pic_width;
|
||||
/**The displayed picture height.
|
||||
* This must be no larger than #frame_height.*/
|
||||
ogg_uint32_t pic_height;
|
||||
/**The X offset of the displayed picture.
|
||||
* This must be no larger than #frame_width-#pic_width or 255, whichever is
|
||||
* smaller.*/
|
||||
ogg_uint32_t pic_x;
|
||||
/**The Y offset of the displayed picture.
|
||||
* This must be no larger than #frame_height-#pic_height, and
|
||||
* #frame_height-#pic_height-#pic_y must be no larger than 255.
|
||||
* This slightly funny restriction is due to the fact that the offset is
|
||||
* specified from the top of the image for consistency with the standard
|
||||
* graphics left-handed coordinate system used throughout this API, while
|
||||
* it is stored in the encoded stream as an offset from the bottom.*/
|
||||
ogg_uint32_t pic_y;
|
||||
/**\name Frame rate
|
||||
* The frame rate, as a fraction.
|
||||
* If either is 0, the frame rate is undefined.*/
|
||||
/*@{*/
|
||||
ogg_uint32_t fps_numerator;
|
||||
ogg_uint32_t fps_denominator;
|
||||
/*@}*/
|
||||
/**\name Aspect ratio
|
||||
* The aspect ratio of the pixels.
|
||||
* If either value is zero, the aspect ratio is undefined.
|
||||
* If not specified by any external means, 1:1 should be assumed.
|
||||
* The aspect ratio of the full picture can be computed as
|
||||
* \code
|
||||
* aspect_numerator*pic_width/(aspect_denominator*pic_height).
|
||||
* \endcode */
|
||||
/*@{*/
|
||||
ogg_uint32_t aspect_numerator;
|
||||
ogg_uint32_t aspect_denominator;
|
||||
/*@}*/
|
||||
/**The color space.*/
|
||||
th_colorspace colorspace;
|
||||
/**The pixel format.*/
|
||||
th_pixel_fmt pixel_fmt;
|
||||
/**The target bit-rate in bits per second.
|
||||
If initializing an encoder with this struct, set this field to a non-zero
|
||||
value to activate CBR encoding by default.*/
|
||||
int target_bitrate;
|
||||
/**The target quality level.
|
||||
Valid values range from 0 to 63, inclusive, with higher values giving
|
||||
higher quality.
|
||||
If initializing an encoder with this struct, and #target_bitrate is set
|
||||
to zero, VBR encoding at this quality will be activated by default.*/
|
||||
/*Currently this is set so that a qi of 0 corresponds to distortions of 24
|
||||
times the JND, and each increase by 16 halves that value.
|
||||
This gives us fine discrimination at low qualities, yet effective rate
|
||||
control at high qualities.
|
||||
The qi value 63 is special, however.
|
||||
For this, the highest quality, we use one half of a JND for our threshold.
|
||||
Due to the lower bounds placed on allowable quantizers in Theora, we will
|
||||
not actually be able to achieve quality this good, but this should
|
||||
provide as close to visually lossless quality as Theora is capable of.
|
||||
We could lift the quantizer restrictions without breaking VP3.1
|
||||
compatibility, but this would result in quantized coefficients that are
|
||||
too large for the current bitstream to be able to store.
|
||||
We'd have to redesign the token syntax to store these large coefficients,
|
||||
which would make transcoding complex.*/
|
||||
int quality;
|
||||
/**The amount to shift to extract the last keyframe number from the granule
|
||||
* position.
|
||||
* This can be at most 31.
|
||||
* th_info_init() will set this to a default value (currently <tt>6</tt>,
|
||||
* which is good for streaming applications), but you can set it to 0 to
|
||||
* make every frame a keyframe.
|
||||
* The maximum distance between key frames is
|
||||
* <tt>1<<#keyframe_granule_shift</tt>.
|
||||
* The keyframe frequency can be more finely controlled with
|
||||
* #TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE, which can also be adjusted
|
||||
* during encoding (for example, to force the next frame to be a keyframe),
|
||||
* but it cannot be set larger than the amount permitted by this field after
|
||||
* the headers have been output.*/
|
||||
int keyframe_granule_shift;
|
||||
}th_info;
|
||||
|
||||
/**The comment information.
|
||||
*
|
||||
* This structure holds the in-stream metadata corresponding to
|
||||
* the 'comment' header packet.
|
||||
* The comment header is meant to be used much like someone jotting a quick
|
||||
* note on the label of a video.
|
||||
* It should be a short, to the point text note that can be more than a couple
|
||||
* words, but not more than a short paragraph.
|
||||
*
|
||||
* The metadata is stored as a series of (tag, value) pairs, in
|
||||
* length-encoded string vectors.
|
||||
* The first occurrence of the '=' character delimits the tag and value.
|
||||
* A particular tag may occur more than once, and order is significant.
|
||||
* The character set encoding for the strings is always UTF-8, but the tag
|
||||
* names are limited to ASCII, and treated as case-insensitive.
|
||||
* See <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a>, Section 6.3.3 for details.
|
||||
*
|
||||
* In filling in this structure, th_decode_headerin() will null-terminate
|
||||
* the user_comment strings for safety.
|
||||
* However, the bitstream format itself treats them as 8-bit clean vectors,
|
||||
* possibly containing null characters, so the length array should be
|
||||
* treated as their authoritative length.
|
||||
*/
|
||||
typedef struct th_comment{
|
||||
/**The array of comment string vectors.*/
|
||||
char **user_comments;
|
||||
/**An array of the corresponding length of each vector, in bytes.*/
|
||||
int *comment_lengths;
|
||||
/**The total number of comment strings.*/
|
||||
int comments;
|
||||
/**The null-terminated vendor string.
|
||||
This identifies the software used to encode the stream.*/
|
||||
char *vendor;
|
||||
}th_comment;
|
||||
|
||||
|
||||
|
||||
/**A single base matrix.*/
|
||||
typedef unsigned char th_quant_base[64];
|
||||
|
||||
/**A set of \a qi ranges.*/
|
||||
typedef struct{
|
||||
/**The number of ranges in the set.*/
|
||||
int nranges;
|
||||
/**The size of each of the #nranges ranges.
|
||||
These must sum to 63.*/
|
||||
const int *sizes;
|
||||
/**#nranges <tt>+1</tt> base matrices.
|
||||
Matrices \a i and <tt>i+1</tt> form the endpoints of range \a i.*/
|
||||
const th_quant_base *base_matrices;
|
||||
}th_quant_ranges;
|
||||
|
||||
/**A complete set of quantization parameters.
|
||||
The quantizer for each coefficient is calculated as:
|
||||
\code
|
||||
Q=MIN(MAX(qmin[qti][ci!=0],scale[ci!=0][qi]*base[qti][pli][qi][ci]/100),
|
||||
1024).
|
||||
\endcode
|
||||
|
||||
\a qti is the quantization type index: 0 for intra, 1 for inter.
|
||||
<tt>ci!=0</tt> is 0 for the DC coefficient and 1 for AC coefficients.
|
||||
\a qi is the quality index, ranging between 0 (low quality) and 63 (high
|
||||
quality).
|
||||
\a pli is the color plane index: 0 for Y', 1 for Cb, 2 for Cr.
|
||||
\a ci is the DCT coefficient index.
|
||||
Coefficient indices correspond to the normal 2D DCT block
|
||||
ordering--row-major with low frequencies first--\em not zig-zag order.
|
||||
|
||||
Minimum quantizers are constant, and are given by:
|
||||
\code
|
||||
qmin[2][2]={{4,2},{8,4}}.
|
||||
\endcode
|
||||
|
||||
Parameters that can be stored in the bitstream are as follows:
|
||||
- The two scale matrices ac_scale and dc_scale.
|
||||
\code
|
||||
scale[2][64]={dc_scale,ac_scale}.
|
||||
\endcode
|
||||
- The base matrices for each \a qi, \a qti and \a pli (up to 384 in all).
|
||||
In order to avoid storing a full 384 base matrices, only a sparse set of
|
||||
matrices are stored, and the rest are linearly interpolated.
|
||||
This is done as follows.
|
||||
For each \a qti and \a pli, a series of \a n \a qi ranges is defined.
|
||||
The size of each \a qi range can vary arbitrarily, but they must sum to
|
||||
63.
|
||||
Then, <tt>n+1</tt> matrices are specified, one for each endpoint of the
|
||||
ranges.
|
||||
For interpolation purposes, each range's endpoints are the first \a qi
|
||||
value it contains and one past the last \a qi value it contains.
|
||||
Fractional values are rounded to the nearest integer, with ties rounded
|
||||
away from zero.
|
||||
|
||||
Base matrices are stored by reference, so if the same matrices are used
|
||||
multiple times, they will only appear once in the bitstream.
|
||||
The bitstream is also capable of omitting an entire set of ranges and
|
||||
its associated matrices if they are the same as either the previous
|
||||
set (indexed in row-major order) or if the inter set is the same as the
|
||||
intra set.
|
||||
|
||||
- Loop filter limit values.
|
||||
The same limits are used for the loop filter in all color planes, despite
|
||||
potentially differing levels of quantization in each.
|
||||
|
||||
For the current encoder, <tt>scale[ci!=0][qi]</tt> must be no greater
|
||||
than <tt>scale[ci!=0][qi-1]</tt> and <tt>base[qti][pli][qi][ci]</tt> must
|
||||
be no greater than <tt>base[qti][pli][qi-1][ci]</tt>.
|
||||
These two conditions ensure that the actual quantizer for a given \a qti,
|
||||
\a pli, and \a ci does not increase as \a qi increases.
|
||||
This is not required by the decoder.*/
|
||||
typedef struct{
|
||||
/**The DC scaling factors.*/
|
||||
ogg_uint16_t dc_scale[64];
|
||||
/**The AC scaling factors.*/
|
||||
ogg_uint16_t ac_scale[64];
|
||||
/**The loop filter limit values.*/
|
||||
unsigned char loop_filter_limits[64];
|
||||
/**The \a qi ranges for each \a qti and \a pli.*/
|
||||
th_quant_ranges qi_ranges[2][3];
|
||||
}th_quant_info;
|
||||
|
||||
|
||||
|
||||
/**The number of Huffman tables used by Theora.*/
|
||||
#define TH_NHUFFMAN_TABLES (80)
|
||||
/**The number of DCT token values in each table.*/
|
||||
#define TH_NDCT_TOKENS (32)
|
||||
|
||||
/**A Huffman code for a Theora DCT token.
|
||||
* Each set of Huffman codes in a given table must form a complete, prefix-free
|
||||
* code.
|
||||
* There is no requirement that all the tokens in a table have a valid code,
|
||||
* but the current encoder is not optimized to take advantage of this.
|
||||
* If each of the five groups of 16 tables does not contain at least one table
|
||||
* with a code for every token, then the encoder may fail to encode certain
|
||||
* frames.
|
||||
* The complete table in the first group of 16 does not have to be in the same
|
||||
* place as the complete table in the other groups, but the complete tables in
|
||||
* the remaining four groups must all be in the same place.*/
|
||||
typedef struct{
|
||||
/**The bit pattern for the code, with the LSbit of the pattern aligned in
|
||||
* the LSbit of the word.*/
|
||||
ogg_uint32_t pattern;
|
||||
/**The number of bits in the code.
|
||||
* This must be between 0 and 32, inclusive.*/
|
||||
int nbits;
|
||||
}th_huff_code;
|
||||
|
||||
|
||||
|
||||
/**\defgroup basefuncs Functions Shared by Encode and Decode*/
|
||||
/*@{*/
|
||||
/**\name Basic shared functions
|
||||
* These functions return information about the library itself,
|
||||
* or provide high-level information about codec state
|
||||
* and packet type.
|
||||
*
|
||||
* You must link to \c libtheoradec if you use any of the
|
||||
* functions in this section.*/
|
||||
/*@{*/
|
||||
/**Retrieves a human-readable string to identify the library vendor and
|
||||
* version.
|
||||
* \return the version string.*/
|
||||
extern const char *th_version_string(void);
|
||||
/**Retrieves the library version number.
|
||||
* This is the highest bitstream version that the encoder library will produce,
|
||||
* or that the decoder library can decode.
|
||||
* This number is composed of a 16-bit major version, 8-bit minor version
|
||||
* and 8-bit sub-version, composed as follows:
|
||||
* \code
|
||||
* (VERSION_MAJOR<<16)+(VERSION_MINOR<<8)+(VERSION_SUBMINOR)
|
||||
* \endcode
|
||||
* \return the version number.*/
|
||||
extern ogg_uint32_t th_version_number(void);
|
||||
/**Converts a granule position to an absolute frame index, starting at
|
||||
* <tt>0</tt>.
|
||||
* The granule position is interpreted in the context of a given
|
||||
* #th_enc_ctx or #th_dec_ctx handle (either will suffice).
|
||||
* \param _encdec A previously allocated #th_enc_ctx or #th_dec_ctx
|
||||
* handle.
|
||||
* \param _granpos The granule position to convert.
|
||||
* \returns The absolute frame index corresponding to \a _granpos.
|
||||
* \retval -1 The given granule position was invalid (i.e. negative).*/
|
||||
extern ogg_int64_t th_granule_frame(void *_encdec,ogg_int64_t _granpos);
|
||||
/**Converts a granule position to an absolute time in seconds.
|
||||
* The granule position is interpreted in the context of a given
|
||||
* #th_enc_ctx or #th_dec_ctx handle (either will suffice).
|
||||
* \param _encdec A previously allocated #th_enc_ctx or #th_dec_ctx
|
||||
* handle.
|
||||
* \param _granpos The granule position to convert.
|
||||
* \return The absolute time in seconds corresponding to \a _granpos.
|
||||
* This is the "end time" for the frame, or the latest time it should
|
||||
* be displayed.
|
||||
* It is not the presentation time.
|
||||
* \retval -1 The given granule position was invalid (i.e. negative).*/
|
||||
extern double th_granule_time(void *_encdec,ogg_int64_t _granpos);
|
||||
/**Determines whether a Theora packet is a header or not.
|
||||
* This function does no verification beyond checking the packet type bit, so
|
||||
* it should not be used for bitstream identification; use
|
||||
* th_decode_headerin() for that.
|
||||
* As per the Theora specification, an empty (0-byte) packet is treated as a
|
||||
* data packet (a delta frame with no coded blocks).
|
||||
* \param _op An <tt>ogg_packet</tt> containing encoded Theora data.
|
||||
* \retval 1 The packet is a header packet
|
||||
* \retval 0 The packet is a video data packet.*/
|
||||
extern int th_packet_isheader(ogg_packet *_op);
|
||||
/**Determines whether a Theora packet is a key frame or not.
|
||||
* This function does no verification beyond checking the packet type and
|
||||
* key frame bits, so it should not be used for bitstream identification; use
|
||||
* th_decode_headerin() for that.
|
||||
* As per the Theora specification, an empty (0-byte) packet is treated as a
|
||||
* delta frame (with no coded blocks).
|
||||
* \param _op An <tt>ogg_packet</tt> containing encoded Theora data.
|
||||
* \retval 1 The packet contains a key frame.
|
||||
* \retval 0 The packet contains a delta frame.
|
||||
* \retval -1 The packet is not a video data packet.*/
|
||||
extern int th_packet_iskeyframe(ogg_packet *_op);
|
||||
/*@}*/
|
||||
|
||||
|
||||
/**\name Functions for manipulating header data
|
||||
* These functions manipulate the #th_info and #th_comment structures
|
||||
* which describe video parameters and key-value metadata, respectively.
|
||||
*
|
||||
* You must link to \c libtheoradec if you use any of the
|
||||
* functions in this section.*/
|
||||
/*@{*/
|
||||
/**Initializes a th_info structure.
|
||||
* This should be called on a freshly allocated #th_info structure before
|
||||
* attempting to use it.
|
||||
* \param _info The #th_info struct to initialize.*/
|
||||
extern void th_info_init(th_info *_info);
|
||||
/**Clears a #th_info structure.
|
||||
* This should be called on a #th_info structure after it is no longer
|
||||
* needed.
|
||||
* \param _info The #th_info struct to clear.*/
|
||||
extern void th_info_clear(th_info *_info);
|
||||
|
||||
/**Initialize a #th_comment structure.
|
||||
* This should be called on a freshly allocated #th_comment structure
|
||||
* before attempting to use it.
|
||||
* \param _tc The #th_comment struct to initialize.*/
|
||||
extern void th_comment_init(th_comment *_tc);
|
||||
/**Add a comment to an initialized #th_comment structure.
|
||||
* \note Neither th_comment_add() nor th_comment_add_tag() support
|
||||
* comments containing null values, although the bitstream format does
|
||||
* support them.
|
||||
* To add such comments you will need to manipulate the #th_comment
|
||||
* structure directly.
|
||||
* \param _tc The #th_comment struct to add the comment to.
|
||||
* \param _comment Must be a null-terminated UTF-8 string containing the
|
||||
* comment in "TAG=the value" form.*/
|
||||
extern void th_comment_add(th_comment *_tc,const char *_comment);
|
||||
/**Add a comment to an initialized #th_comment structure.
|
||||
* \note Neither th_comment_add() nor th_comment_add_tag() support
|
||||
* comments containing null values, although the bitstream format does
|
||||
* support them.
|
||||
* To add such comments you will need to manipulate the #th_comment
|
||||
* structure directly.
|
||||
* \param _tc The #th_comment struct to add the comment to.
|
||||
* \param _tag A null-terminated string containing the tag associated with
|
||||
* the comment.
|
||||
* \param _val The corresponding value as a null-terminated string.*/
|
||||
extern void th_comment_add_tag(th_comment *_tc,const char *_tag,
|
||||
const char *_val);
|
||||
/**Look up a comment value by its tag.
|
||||
* \param _tc An initialized #th_comment structure.
|
||||
* \param _tag The tag to look up.
|
||||
* \param _count The instance of the tag.
|
||||
* The same tag can appear multiple times, each with a distinct
|
||||
* value, so an index is required to retrieve them all.
|
||||
* The order in which these values appear is significant and
|
||||
* should be preserved.
|
||||
* Use th_comment_query_count() to get the legal range for
|
||||
* the \a _count parameter.
|
||||
* \return A pointer to the queried tag's value.
|
||||
* This points directly to data in the #th_comment structure.
|
||||
* It should not be modified or freed by the application, and
|
||||
* modifications to the structure may invalidate the pointer.
|
||||
* \retval NULL If no matching tag is found.*/
|
||||
extern char *th_comment_query(th_comment *_tc,const char *_tag,int _count);
|
||||
/**Look up the number of instances of a tag.
|
||||
* Call this first when querying for a specific tag and then iterate over the
|
||||
* number of instances with separate calls to th_comment_query() to
|
||||
* retrieve all the values for that tag in order.
|
||||
* \param _tc An initialized #th_comment structure.
|
||||
* \param _tag The tag to look up.
|
||||
* \return The number of instances of this particular tag.*/
|
||||
extern int th_comment_query_count(th_comment *_tc,const char *_tag);
|
||||
/**Clears a #th_comment structure.
|
||||
* This should be called on a #th_comment structure after it is no longer
|
||||
* needed.
|
||||
* It will free all memory used by the structure members.
|
||||
* \param _tc The #th_comment struct to clear.*/
|
||||
extern void th_comment_clear(th_comment *_tc);
|
||||
/*@}*/
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
786
engine/thirdparty/libtheora/theora/theora.h
vendored
Normal file
786
engine/thirdparty/libtheora/theora/theora.h
vendored
Normal file
|
|
@ -0,0 +1,786 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: theora.h,v 1.17 2003/12/06 18:06:19 arc Exp $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#ifndef _O_THEORA_H_
|
||||
#define _O_THEORA_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C"
|
||||
{
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#include <stddef.h> /* for size_t */
|
||||
|
||||
#include <ogg/ogg.h>
|
||||
|
||||
/** \file
|
||||
* The libtheora pre-1.0 legacy C API.
|
||||
*
|
||||
* \ingroup oldfuncs
|
||||
*
|
||||
* \section intro Introduction
|
||||
*
|
||||
* This is the documentation for the libtheora legacy C API, declared in
|
||||
* the theora.h header, which describes the old interface used before
|
||||
* the 1.0 release. This API was widely deployed for several years and
|
||||
* remains supported, but for new code we recommend the cleaner API
|
||||
* declared in theoradec.h and theoraenc.h.
|
||||
*
|
||||
* libtheora is the reference implementation for
|
||||
* <a href="http://www.theora.org/">Theora</a>, a free video codec.
|
||||
* Theora is derived from On2's VP3 codec with improved integration with
|
||||
* Ogg multimedia formats by <a href="http://www.xiph.org/">Xiph.Org</a>.
|
||||
*
|
||||
* \section overview Overview
|
||||
*
|
||||
* This library will both decode and encode theora packets to/from raw YUV
|
||||
* frames. In either case, the packets will most likely either come from or
|
||||
* need to be embedded in an Ogg stream. Use
|
||||
* <a href="http://xiph.org/ogg/">libogg</a> or
|
||||
* <a href="http://www.annodex.net/software/liboggz/index.html">liboggz</a>
|
||||
* to extract/package these packets.
|
||||
*
|
||||
* \section decoding Decoding Process
|
||||
*
|
||||
* Decoding can be separated into the following steps:
|
||||
* -# initialise theora_info and theora_comment structures using
|
||||
* theora_info_init() and theora_comment_init():
|
||||
\verbatim
|
||||
theora_info info;
|
||||
theora_comment comment;
|
||||
|
||||
theora_info_init(&info);
|
||||
theora_comment_init(&comment);
|
||||
\endverbatim
|
||||
* -# retrieve header packets from Ogg stream (there should be 3) and decode
|
||||
* into theora_info and theora_comment structures using
|
||||
* theora_decode_header(). See \ref identification for more information on
|
||||
* identifying which packets are theora packets.
|
||||
\verbatim
|
||||
int i;
|
||||
for (i = 0; i < 3; i++)
|
||||
{
|
||||
(get a theora packet "op" from the Ogg stream)
|
||||
theora_decode_header(&info, &comment, op);
|
||||
}
|
||||
\endverbatim
|
||||
* -# initialise the decoder based on the information retrieved into the
|
||||
* theora_info struct by theora_decode_header(). You will need a
|
||||
* theora_state struct.
|
||||
\verbatim
|
||||
theora_state state;
|
||||
|
||||
theora_decode_init(&state, &info);
|
||||
\endverbatim
|
||||
* -# pass in packets and retrieve decoded frames! See the yuv_buffer
|
||||
* documentation for information on how to retrieve raw YUV data.
|
||||
\verbatim
|
||||
yuv_buffer buffer;
|
||||
while (last packet was not e_o_s) {
|
||||
(get a theora packet "op" from the Ogg stream)
|
||||
theora_decode_packetin(&state, op);
|
||||
theora_decode_YUVout(&state, &buffer);
|
||||
}
|
||||
\endverbatim
|
||||
*
|
||||
*
|
||||
* \subsection identification Identifying Theora Packets
|
||||
*
|
||||
* All streams inside an Ogg file have a unique serial_no attached to the
|
||||
* stream. Typically, you will want to
|
||||
* - retrieve the serial_no for each b_o_s (beginning of stream) page
|
||||
* encountered within the Ogg file;
|
||||
* - test the first (only) packet on that page to determine if it is a theora
|
||||
* packet;
|
||||
* - once you have found a theora b_o_s page then use the retrieved serial_no
|
||||
* to identify future packets belonging to the same theora stream.
|
||||
*
|
||||
* Note that you \e cannot use theora_packet_isheader() to determine if a
|
||||
* packet is a theora packet or not, as this function does not perform any
|
||||
* checking beyond whether a header bit is present. Instead, use the
|
||||
* theora_decode_header() function and check the return value; or examine the
|
||||
* header bytes at the beginning of the Ogg page.
|
||||
*/
|
||||
|
||||
|
||||
/** \defgroup oldfuncs Legacy pre-1.0 C API */
|
||||
/* @{ */
|
||||
|
||||
/**
|
||||
* A YUV buffer for passing uncompressed frames to and from the codec.
|
||||
* This holds a Y'CbCr frame in planar format. The CbCr planes can be
|
||||
* subsampled and have their own separate dimensions and row stride
|
||||
* offsets. Note that the strides may be negative in some
|
||||
* configurations. For theora the width and height of the largest plane
|
||||
* must be a multiple of 16. The actual meaningful picture size and
|
||||
* offset are stored in the theora_info structure; frames returned by
|
||||
* the decoder may need to be cropped for display.
|
||||
*
|
||||
* All samples are 8 bits. Within each plane samples are ordered by
|
||||
* row from the top of the frame to the bottom. Within each row samples
|
||||
* are ordered from left to right.
|
||||
*
|
||||
* During decode, the yuv_buffer struct is allocated by the user, but all
|
||||
* fields (including luma and chroma pointers) are filled by the library.
|
||||
* These pointers address library-internal memory and their contents should
|
||||
* not be modified.
|
||||
*
|
||||
* Conversely, during encode the user allocates the struct and fills out all
|
||||
* fields. The user also manages the data addressed by the luma and chroma
|
||||
* pointers. See the encoder_example.c and dump_video.c example files in
|
||||
* theora/examples/ for more information.
|
||||
*/
|
||||
typedef struct {
|
||||
int y_width; /**< Width of the Y' luminance plane */
|
||||
int y_height; /**< Height of the luminance plane */
|
||||
int y_stride; /**< Offset in bytes between successive rows */
|
||||
|
||||
int uv_width; /**< Width of the Cb and Cr chroma planes */
|
||||
int uv_height; /**< Height of the chroma planes */
|
||||
int uv_stride; /**< Offset between successive chroma rows */
|
||||
unsigned char *y; /**< Pointer to start of luminance data */
|
||||
unsigned char *u; /**< Pointer to start of Cb data */
|
||||
unsigned char *v; /**< Pointer to start of Cr data */
|
||||
|
||||
} yuv_buffer;
|
||||
|
||||
/**
|
||||
* A Colorspace.
|
||||
*/
|
||||
typedef enum {
|
||||
OC_CS_UNSPECIFIED, /**< The colorspace is unknown or unspecified */
|
||||
OC_CS_ITU_REC_470M, /**< This is the best option for 'NTSC' content */
|
||||
OC_CS_ITU_REC_470BG, /**< This is the best option for 'PAL' content */
|
||||
OC_CS_NSPACES /**< This marks the end of the defined colorspaces */
|
||||
} theora_colorspace;
|
||||
|
||||
/**
|
||||
* A Chroma subsampling
|
||||
*
|
||||
* These enumerate the available chroma subsampling options supported
|
||||
* by the theora format. See Section 4.4 of the specification for
|
||||
* exact definitions.
|
||||
*/
|
||||
typedef enum {
|
||||
OC_PF_420, /**< Chroma subsampling by 2 in each direction (4:2:0) */
|
||||
OC_PF_RSVD, /**< Reserved value */
|
||||
OC_PF_422, /**< Horizontal chroma subsampling by 2 (4:2:2) */
|
||||
OC_PF_444 /**< No chroma subsampling at all (4:4:4) */
|
||||
} theora_pixelformat;
|
||||
|
||||
/**
|
||||
* Theora bitstream info.
|
||||
* Contains the basic playback parameters for a stream,
|
||||
* corresponding to the initial 'info' header packet.
|
||||
*
|
||||
* Encoded theora frames must be a multiple of 16 in width and height.
|
||||
* To handle other frame sizes, a crop rectangle is specified in
|
||||
* frame_height and frame_width, offset_x and offset_y. The offset
|
||||
* and size should still be a multiple of 2 to avoid chroma sampling
|
||||
* shifts. Offset values in this structure are measured from the
|
||||
* upper left of the image.
|
||||
*
|
||||
* Frame rate, in frames per second, is stored as a rational
|
||||
* fraction. Aspect ratio is also stored as a rational fraction, and
|
||||
* refers to the aspect ratio of the frame pixels, not of the
|
||||
* overall frame itself.
|
||||
*
|
||||
* See <a href="http://svn.xiph.org/trunk/theora/examples/encoder_example.c">
|
||||
* examples/encoder_example.c</a> for usage examples of the
|
||||
* other parameters and good default settings for the encoder parameters.
|
||||
*/
|
||||
typedef struct {
|
||||
ogg_uint32_t width; /**< encoded frame width */
|
||||
ogg_uint32_t height; /**< encoded frame height */
|
||||
ogg_uint32_t frame_width; /**< display frame width */
|
||||
ogg_uint32_t frame_height; /**< display frame height */
|
||||
ogg_uint32_t offset_x; /**< horizontal offset of the displayed frame */
|
||||
ogg_uint32_t offset_y; /**< vertical offset of the displayed frame */
|
||||
ogg_uint32_t fps_numerator; /**< frame rate numerator **/
|
||||
ogg_uint32_t fps_denominator; /**< frame rate denominator **/
|
||||
ogg_uint32_t aspect_numerator; /**< pixel aspect ratio numerator */
|
||||
ogg_uint32_t aspect_denominator; /**< pixel aspect ratio denominator */
|
||||
theora_colorspace colorspace; /**< colorspace */
|
||||
int target_bitrate; /**< nominal bitrate in bits per second */
|
||||
int quality; /**< Nominal quality setting, 0-63 */
|
||||
int quick_p; /**< Quick encode/decode */
|
||||
|
||||
/* decode only */
|
||||
unsigned char version_major;
|
||||
unsigned char version_minor;
|
||||
unsigned char version_subminor;
|
||||
|
||||
void *codec_setup;
|
||||
|
||||
/* encode only */
|
||||
int dropframes_p;
|
||||
int keyframe_auto_p;
|
||||
ogg_uint32_t keyframe_frequency;
|
||||
ogg_uint32_t keyframe_frequency_force; /* also used for decode init to
|
||||
get granpos shift correct */
|
||||
ogg_uint32_t keyframe_data_target_bitrate;
|
||||
ogg_int32_t keyframe_auto_threshold;
|
||||
ogg_uint32_t keyframe_mindistance;
|
||||
ogg_int32_t noise_sensitivity;
|
||||
ogg_int32_t sharpness;
|
||||
|
||||
theora_pixelformat pixelformat; /**< chroma subsampling mode to expect */
|
||||
|
||||
} theora_info;
|
||||
|
||||
/** Codec internal state and context.
|
||||
*/
|
||||
typedef struct{
|
||||
theora_info *i;
|
||||
ogg_int64_t granulepos;
|
||||
|
||||
void *internal_encode;
|
||||
void *internal_decode;
|
||||
|
||||
} theora_state;
|
||||
|
||||
/**
|
||||
* Comment header metadata.
|
||||
*
|
||||
* This structure holds the in-stream metadata corresponding to
|
||||
* the 'comment' header packet.
|
||||
*
|
||||
* Meta data is stored as a series of (tag, value) pairs, in
|
||||
* length-encoded string vectors. The first occurrence of the
|
||||
* '=' character delimits the tag and value. A particular tag
|
||||
* may occur more than once. The character set encoding for
|
||||
* the strings is always UTF-8, but the tag names are limited
|
||||
* to case-insensitive ASCII. See the spec for details.
|
||||
*
|
||||
* In filling in this structure, theora_decode_header() will
|
||||
* null-terminate the user_comment strings for safety. However,
|
||||
* the bitstream format itself treats them as 8-bit clean,
|
||||
* and so the length array should be treated as authoritative
|
||||
* for their length.
|
||||
*/
|
||||
typedef struct theora_comment{
|
||||
char **user_comments; /**< An array of comment string vectors, each a "TAG=value" pair */
|
||||
int *comment_lengths; /**< An array of corresponding string vector lengths in bytes; authoritative, as the strings are 8-bit clean */
|
||||
int comments; /**< The total number of comment string vectors */
|
||||
char *vendor; /**< The vendor string identifying the encoder, null terminated */
|
||||
|
||||
} theora_comment;
|
||||
|
||||
|
||||
/**\name theora_control() codes */
|
||||
/* \anchor decctlcodes_old
|
||||
* These are the available request codes for theora_control()
|
||||
* when called with a decoder instance.
|
||||
* By convention decoder control codes are odd, to distinguish
|
||||
* them from \ref encctlcodes_old "encoder control codes" which
|
||||
* are even.
|
||||
*
|
||||
* Note that since the 1.0 release, both the legacy and the final
|
||||
* implementation accept all the same control codes, but only the
|
||||
* final API declares the newer codes.
|
||||
*
|
||||
* Keep any experimental or vendor-specific values above \c 0x8000.*/
|
||||
|
||||
/*@{*/
|
||||
|
||||
/**Get the maximum post-processing level.
|
||||
* The decoder supports a post-processing filter that can improve
|
||||
* the appearance of the decoded images. This returns the highest
|
||||
* level setting for this post-processor, corresponding to maximum
|
||||
* improvement and computational expense.
|
||||
*/
|
||||
#define TH_DECCTL_GET_PPLEVEL_MAX (1)
|
||||
|
||||
/**Set the post-processing level.
|
||||
* Sets the level of post-processing to use when decoding the
|
||||
* compressed stream. This must be a value between zero (off)
|
||||
* and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
|
||||
*/
|
||||
#define TH_DECCTL_SET_PPLEVEL (3)
|
||||
|
||||
/**Sets the maximum distance between key frames.
|
||||
* This can be changed during an encode, but will be bounded by
|
||||
* <tt>1<<th_info#keyframe_granule_shift</tt>.
|
||||
* If it is set before encoding begins, th_info#keyframe_granule_shift will
|
||||
* be enlarged appropriately.
|
||||
*
|
||||
* \param[in] buf <tt>ogg_uint32_t</tt>: The maximum distance between key
|
||||
* frames.
|
||||
* \param[out] buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
|
||||
* \retval OC_FAULT \a theora_state or \a buf is <tt>NULL</tt>.
|
||||
* \retval OC_EINVAL \a buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
|
||||
* \retval OC_IMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
|
||||
|
||||
/**Set the granule position.
|
||||
* Call this after a seek, to update the internal granulepos
|
||||
* in the decoder, to ensure that subsequent frames are marked
|
||||
* properly. If you track timestamps yourself and do not use
|
||||
* the granule position returned by the decoder, then you do
|
||||
* not need to use this control.
|
||||
*/
|
||||
#define TH_DECCTL_SET_GRANPOS (5)
|
||||
|
||||
/**\anchor encctlcodes_old */
|
||||
|
||||
/**Sets the quantization parameters to use.
|
||||
* The parameters are copied, not stored by reference, so they can be freed
|
||||
* after this call.
|
||||
* <tt>NULL</tt> may be specified to revert to the default parameters.
|
||||
*
|
||||
* \param[in] buf #th_quant_info
|
||||
* \retval OC_FAULT \a theora_state is <tt>NULL</tt>.
|
||||
* \retval OC_EINVAL Encoding has already begun, the quantization parameters
|
||||
* are not acceptable to this version of the encoder,
|
||||
* \a buf is <tt>NULL</tt> and \a buf_sz is not zero,
|
||||
* or \a buf is non-<tt>NULL</tt> and \a buf_sz is
|
||||
* not <tt>sizeof(#th_quant_info)</tt>.
|
||||
* \retval OC_IMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_QUANT_PARAMS (2)
|
||||
|
||||
/**Disables any encoder features that would prevent lossless transcoding back
|
||||
* to VP3.
|
||||
* This primarily means disabling block-level QI values and not using 4MV mode
|
||||
* when any of the luma blocks in a macro block are not coded.
|
||||
* It also includes using the VP3 quantization tables and Huffman codes; if you
|
||||
* set them explicitly after calling this function, the resulting stream will
|
||||
* not be VP3-compatible.
|
||||
* If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
|
||||
* material, or when using a picture region smaller than the full frame (e.g.
|
||||
* a non-multiple-of-16 width or height), then non-VP3 bitstream features will
|
||||
* still be disabled, but the stream will still not be VP3-compatible, as VP3
|
||||
* was not capable of encoding such formats.
|
||||
* If you call this after encoding has already begun, then the quantization
|
||||
* tables and codebooks cannot be changed, but the frame-level features will
|
||||
* be enabled or disabled as requested.
|
||||
*
|
||||
* \param[in] buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
|
||||
* or 0 to disable it (the default).
|
||||
* \param[out] buf <tt>int</tt>: 1 if all bitstream features required for
|
||||
* VP3-compatibility could be set, and 0 otherwise.
|
||||
* The latter will be returned if the pixel format is not
|
||||
* 4:2:0, the picture region is smaller than the full frame,
|
||||
* or if encoding has begun, preventing the quantization
|
||||
* tables and codebooks from being set.
|
||||
* \retval OC_FAULT \a theora_state or \a buf is <tt>NULL</tt>.
|
||||
* \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval OC_IMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
|
||||
|
||||
/**Gets the maximum speed level.
|
||||
* Higher speed levels favor quicker encoding over better quality per bit.
|
||||
* Depending on the encoding mode, and the internal algorithms used, quality
|
||||
* may actually improve, but in this case bitrate will also likely increase.
|
||||
* In any case, overall rate/distortion performance will probably decrease.
|
||||
* The maximum value, and the meaning of each value, may change depending on
|
||||
* the current encoding mode (VBR vs. CQI, etc.).
|
||||
*
|
||||
* \param[out] buf int: The maximum encoding speed level.
|
||||
* \retval OC_FAULT \a theora_state or \a buf is <tt>NULL</tt>.
|
||||
* \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval OC_IMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
|
||||
|
||||
/**Sets the speed level.
|
||||
* By default a speed value of 1 is used.
|
||||
*
|
||||
* \param[in] buf int: The new encoding speed level.
|
||||
* 0 is slowest, larger values use less CPU.
|
||||
* \retval OC_FAULT \a theora_state or \a buf is <tt>NULL</tt>.
|
||||
* \retval OC_EINVAL \a buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||
* encoding speed level is out of bounds.
|
||||
* The maximum encoding speed level may be
|
||||
* implementation- and encoding mode-specific, and can be
|
||||
* obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
|
||||
* \retval OC_IMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_SET_SPLEVEL (14)
|
||||
|
||||
/*@}*/
|
||||
|
||||
#define OC_FAULT -1 /**< General failure */
|
||||
#define OC_EINVAL -10 /**< Library encountered invalid internal data */
|
||||
#define OC_DISABLED -11 /**< Requested action is disabled */
|
||||
#define OC_BADHEADER -20 /**< Header packet was corrupt/invalid */
|
||||
#define OC_NOTFORMAT -21 /**< Packet is not a theora packet */
|
||||
#define OC_VERSION -22 /**< Bitstream version is not handled */
|
||||
#define OC_IMPL -23 /**< Feature or action not implemented */
|
||||
#define OC_BADPACKET -24 /**< Packet is corrupt */
|
||||
#define OC_NEWPACKET -25 /**< Packet is an (ignorable) unhandled extension */
|
||||
#define OC_DUPFRAME 1 /**< Packet is a dropped frame */
|
||||
|
||||
/**
|
||||
* Retrieve a human-readable string to identify the encoder vendor and version.
|
||||
* \returns A version string.
|
||||
*/
|
||||
extern const char *theora_version_string(void);
|
||||
|
||||
/**
|
||||
* Retrieve a 32-bit version number.
|
||||
* This number is composed of a 16-bit major version, 8-bit minor version
|
||||
* and 8 bit sub-version, composed as follows:
|
||||
<pre>
|
||||
(VERSION_MAJOR<<16) + (VERSION_MINOR<<8) + (VERSION_SUB)
|
||||
</pre>
|
||||
* \returns The version number.
|
||||
*/
|
||||
extern ogg_uint32_t theora_version_number(void);
|
||||
|
||||
/**
|
||||
* Initialize the theora encoder.
|
||||
* \param th The theora_state handle to initialize for encoding.
|
||||
* \param ti A theora_info struct filled with the desired encoding parameters.
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_encode_init(theora_state *th, theora_info *ti);
|
||||
|
||||
/**
|
||||
* Submit a YUV buffer to the theora encoder.
|
||||
* \param t A theora_state handle previously initialized for encoding.
|
||||
* \param yuv A buffer of YUV data to encode. Note that both the yuv_buffer
|
||||
* struct and the luma/chroma buffers within should be allocated by
|
||||
* the user.
|
||||
* \retval OC_EINVAL Encoder is not ready, or is finished.
|
||||
* \retval -1 The size of the given frame differs from those previously input
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_encode_YUVin(theora_state *t, yuv_buffer *yuv);
|
||||
|
||||
/**
|
||||
* Request the next packet of encoded video.
|
||||
* The encoded data is placed in a user-provided ogg_packet structure.
|
||||
* \param t A theora_state handle previously initialized for encoding.
|
||||
* \param last_p whether this is the last packet the encoder should produce.
|
||||
* \param op An ogg_packet structure to fill. libtheora will set all
|
||||
* elements of this structure, including a pointer to encoded
|
||||
* data. The memory for the encoded data is owned by libtheora.
|
||||
* \retval 0 No internal storage exists OR no packet is ready
|
||||
* \retval -1 The encoding process has completed
|
||||
* \retval 1 Success
|
||||
*/
|
||||
extern int theora_encode_packetout( theora_state *t, int last_p,
|
||||
ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Request a packet containing the initial header.
|
||||
* A pointer to the header data is placed in a user-provided ogg_packet
|
||||
* structure.
|
||||
* \param t A theora_state handle previously initialized for encoding.
|
||||
* \param op An ogg_packet structure to fill. libtheora will set all
|
||||
* elements of this structure, including a pointer to the header
|
||||
* data. The memory for the header data is owned by libtheora.
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_encode_header(theora_state *t, ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Request a comment header packet from provided metadata.
|
||||
* A pointer to the comment data is placed in a user-provided ogg_packet
|
||||
* structure.
|
||||
* \param tc A theora_comment structure filled with the desired metadata
|
||||
* \param op An ogg_packet structure to fill. libtheora will set all
|
||||
* elements of this structure, including a pointer to the encoded
|
||||
* comment data. The memory for the comment data is owned by
|
||||
* the application, and must be freed by it using _ogg_free().
|
||||
* On some systems (such as Windows when using dynamic linking), this
|
||||
* may mean the free is executed in a different module from the
|
||||
* malloc, which will crash; there is no way to free this memory on
|
||||
* such systems.
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_encode_comment(theora_comment *tc, ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Request a packet containing the codebook tables for the stream.
|
||||
* A pointer to the codebook data is placed in a user-provided ogg_packet
|
||||
* structure.
|
||||
* \param t A theora_state handle previously initialized for encoding.
|
||||
* \param op An ogg_packet structure to fill. libtheora will set all
|
||||
* elements of this structure, including a pointer to the codebook
|
||||
* data. The memory for the header data is owned by libtheora.
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_encode_tables(theora_state *t, ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Decode an Ogg packet, with the expectation that the packet contains
|
||||
* an initial header, comment data or codebook tables.
|
||||
*
|
||||
* \param ci A theora_info structure to fill. This must have been previously
|
||||
* initialized with theora_info_init(). If \a op contains an initial
|
||||
* header, theora_decode_header() will fill \a ci with the
|
||||
* parsed header values. If \a op contains codebook tables,
|
||||
* theora_decode_header() will parse these and attach an internal
|
||||
* representation to \a ci->codec_setup.
|
||||
* \param cc A theora_comment structure to fill. If \a op contains comment
|
||||
* data, theora_decode_header() will fill \a cc with the parsed
|
||||
* comments.
|
||||
* \param op An ogg_packet structure which you expect contains an initial
|
||||
* header, comment data or codebook tables.
|
||||
*
|
||||
* \retval OC_BADHEADER \a op is NULL; OR the first byte of \a op->packet
|
||||
* has the signature of an initial packet, but op is
|
||||
* not a b_o_s packet; OR this packet has the signature
|
||||
* of an initial header packet, but an initial header
|
||||
* packet has already been seen; OR this packet has the
|
||||
* signature of a comment packet, but the initial header
|
||||
* has not yet been seen; OR this packet has the signature
|
||||
* of a comment packet, but contains invalid data; OR
|
||||
* this packet has the signature of codebook tables,
|
||||
* but the initial header or comments have not yet
|
||||
* been seen; OR this packet has the signature of codebook
|
||||
* tables, but contains invalid data;
|
||||
* OR the stream being decoded has a compatible version
|
||||
* but this packet does not have the signature of a
|
||||
* theora initial header, comments, or codebook packet
|
||||
* \retval OC_VERSION The packet data of \a op is an initial header with
|
||||
* a version which is incompatible with this version of
|
||||
* libtheora.
|
||||
* \retval OC_NEWPACKET the stream being decoded has an incompatible (future)
|
||||
* version and contains an unknown signature.
|
||||
* \retval 0 Success
|
||||
*
|
||||
* \note The normal usage is that theora_decode_header() be called on the
|
||||
* first three packets of a theora logical bitstream in succession.
|
||||
*/
|
||||
extern int theora_decode_header(theora_info *ci, theora_comment *cc,
|
||||
ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Initialize a theora_state handle for decoding.
|
||||
* \param th The theora_state handle to initialize.
|
||||
* \param c A theora_info struct filled with the desired decoding parameters.
|
||||
* This is of course usually obtained from a previous call to
|
||||
* theora_decode_header().
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_decode_init(theora_state *th, theora_info *c);
|
||||
|
||||
/**
|
||||
* Input a packet containing encoded data into the theora decoder.
|
||||
* \param th A theora_state handle previously initialized for decoding.
|
||||
* \param op An ogg_packet containing encoded theora data.
|
||||
* \retval 0 Success
|
||||
* \retval OC_BADPACKET \a op does not contain encoded video data
|
||||
*/
|
||||
extern int theora_decode_packetin(theora_state *th,ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Output the next available frame of decoded YUV data.
|
||||
* \param th A theora_state handle previously initialized for decoding.
|
||||
* \param yuv A yuv_buffer in which libtheora should place the decoded data.
|
||||
* Note that the buffer struct itself is allocated by the user, but
|
||||
* that the luma and chroma pointers will be filled in by the
|
||||
* library. Also note that these luma and chroma regions should be
|
||||
* considered read-only by the user.
|
||||
* \retval 0 Success
|
||||
*/
|
||||
extern int theora_decode_YUVout(theora_state *th,yuv_buffer *yuv);
|
||||
|
||||
/**
|
||||
* Report whether a theora packet is a header or not
|
||||
* This function does no verification beyond checking the header
|
||||
* flag bit so it should not be used for bitstream identification;
|
||||
* use theora_decode_header() for that.
|
||||
*
|
||||
* \param op An ogg_packet containing encoded theora data.
|
||||
* \retval 1 The packet is a header packet
|
||||
* \retval 0 The packet is not a header packet (and so contains frame data)
|
||||
*
|
||||
* This function was added in the 1.0alpha4 release.
|
||||
*/
|
||||
extern int theora_packet_isheader(ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Report whether a theora packet is a keyframe or not
|
||||
*
|
||||
* \param op An ogg_packet containing encoded theora data.
|
||||
* \retval 1 The packet contains a keyframe image
|
||||
* \retval 0 The packet contains an interframe delta
|
||||
* \retval -1 The packet is not an image data packet at all
|
||||
*
|
||||
* This function was added in the 1.0alpha4 release.
|
||||
*/
|
||||
extern int theora_packet_iskeyframe(ogg_packet *op);
|
||||
|
||||
/**
|
||||
* Report the granulepos shift radix
|
||||
*
|
||||
* When embedded in Ogg, Theora uses a two-part granulepos,
|
||||
* splitting the 64-bit field into two pieces. The more-significant
|
||||
* section represents the frame count at the last keyframe,
|
||||
* and the less-significant section represents the count of
|
||||
* frames since the last keyframe. In this way the overall
|
||||
* field is still non-decreasing with time, but usefully encodes
|
||||
* a pointer to the last keyframe, which is necessary for
|
||||
* correctly restarting decode after a seek.
|
||||
*
|
||||
* This function reports the number of bits used to represent
|
||||
* the distance to the last keyframe, and thus how the granulepos
|
||||
* field must be shifted or masked to obtain the two parts.
|
||||
*
|
||||
* Since libtheora returns compressed data in an ogg_packet
|
||||
* structure, this may be generally useful even if the Theora
|
||||
* packets are not being used in an Ogg container.
|
||||
*
|
||||
* \param ti A previously initialized theora_info struct
|
||||
* \returns The bit shift dividing the two granulepos fields
|
||||
*
|
||||
* This function was added in the 1.0alpha5 release.
|
||||
*/
|
||||
int theora_granule_shift(theora_info *ti);
|
||||
|
||||
/**
|
||||
* Convert a granulepos to an absolute frame index, starting at 0.
|
||||
* The granulepos is interpreted in the context of a given theora_state handle.
|
||||
*
|
||||
* Note that while the granulepos encodes the frame count (i.e. starting
|
||||
* from 1) this call returns the frame index, starting from zero. Thus
|
||||
* One can calculate the presentation time by multiplying the index by
|
||||
* the rate.
|
||||
*
|
||||
* \param th A previously initialized theora_state handle (encode or decode)
|
||||
* \param granulepos The granulepos to convert.
|
||||
* \returns The frame index corresponding to \a granulepos.
|
||||
* \retval -1 The given granulepos is undefined (i.e. negative)
|
||||
*
|
||||
* This function was added in the 1.0alpha4 release.
|
||||
*/
|
||||
extern ogg_int64_t theora_granule_frame(theora_state *th,ogg_int64_t granulepos);
|
||||
|
||||
/**
|
||||
* Convert a granulepos to absolute time in seconds. The granulepos is
|
||||
* interpreted in the context of a given theora_state handle, and gives
|
||||
* the end time of a frame's presentation as used in Ogg mux ordering.
|
||||
*
|
||||
* \param th A previously initialized theora_state handle (encode or decode)
|
||||
* \param granulepos The granulepos to convert.
|
||||
* \returns The absolute time in seconds corresponding to \a granulepos.
|
||||
* This is the "end time" for the frame, or the latest time it should
|
||||
* be displayed.
|
||||
* It is not the presentation time.
|
||||
* \retval -1. The given granulepos is undefined (i.e. negative).
|
||||
*/
|
||||
extern double theora_granule_time(theora_state *th,ogg_int64_t granulepos);
|
||||
|
||||
/**
|
||||
* Initialize a theora_info structure. All values within the given theora_info
|
||||
* structure are initialized, and space is allocated within libtheora for
|
||||
* internal codec setup data.
|
||||
* \param c A theora_info struct to initialize.
|
||||
*/
|
||||
extern void theora_info_init(theora_info *c);
|
||||
|
||||
/**
|
||||
* Clear a theora_info structure. All values within the given theora_info
|
||||
* structure are cleared, and associated internal codec setup data is freed.
|
||||
* \param c A theora_info struct to clear.
|
||||
*/
|
||||
extern void theora_info_clear(theora_info *c);
|
||||
|
||||
/**
|
||||
* Free all internal data associated with a theora_state handle.
|
||||
* \param t A theora_state handle.
|
||||
*/
|
||||
extern void theora_clear(theora_state *t);
|
||||
|
||||
/**
|
||||
* Initialize an allocated theora_comment structure
|
||||
* \param tc An allocated theora_comment structure
|
||||
**/
|
||||
extern void theora_comment_init(theora_comment *tc);
|
||||
|
||||
/**
|
||||
* Add a comment to an initialized theora_comment structure
|
||||
* \param tc A previously initialized theora comment structure
|
||||
* \param comment A null-terminated string encoding the comment in the form
|
||||
* "TAG=the value"
|
||||
*
|
||||
* Neither theora_comment_add() nor theora_comment_add_tag() support
|
||||
* comments containing null values, although the bitstream format
|
||||
* supports this. To add such comments you will need to manipulate
|
||||
* the theora_comment structure directly.
|
||||
**/
|
||||
|
||||
extern void theora_comment_add(theora_comment *tc, char *comment);
|
||||
|
||||
/**
|
||||
* Add a comment to an initialized theora_comment structure.
|
||||
* \param tc A previously initialized theora comment structure
|
||||
* \param tag A null-terminated string containing the tag
|
||||
* associated with the comment.
|
||||
* \param value The corresponding value as a null-terminated string
|
||||
*
|
||||
* Neither theora_comment_add() nor theora_comment_add_tag() support
|
||||
* comments containing null values, although the bitstream format
|
||||
* supports this. To add such comments you will need to manipulate
|
||||
* the theora_comment structure directly.
|
||||
**/
|
||||
extern void theora_comment_add_tag(theora_comment *tc,
|
||||
char *tag, char *value);
|
||||
|
||||
/**
|
||||
* Look up a comment value by tag.
|
||||
* \param tc An initialized theora_comment structure
|
||||
* \param tag The tag to look up
|
||||
* \param count The instance of the tag. The same tag can appear multiple
|
||||
* times, each with a distinct and ordered value, so an index
|
||||
* is required to retrieve them all.
|
||||
* \returns A pointer to the queried tag's value
|
||||
* \retval NULL No matching tag is found
|
||||
*
|
||||
* \note Use theora_comment_query_count() to get the legal range for the
|
||||
* count parameter.
|
||||
**/
|
||||
|
||||
extern char *theora_comment_query(theora_comment *tc, char *tag, int count);
|
||||
|
||||
/** Look up the number of instances of a tag.
|
||||
* \param tc An initialized theora_comment structure
|
||||
* \param tag The tag to look up
|
||||
* \returns The number of instances of a particular tag.
|
||||
*
|
||||
* Call this first when querying for a specific tag and then iterate
|
||||
* over the number of instances with separate calls to
|
||||
* theora_comment_query() to retrieve all instances in order.
|
||||
**/
|
||||
extern int theora_comment_query_count(theora_comment *tc, char *tag);
|
||||
|
||||
/**
|
||||
* Clear an allocated theora_comment struct so that it can be freed.
|
||||
* \param tc An allocated theora_comment structure.
|
||||
**/
|
||||
extern void theora_comment_clear(theora_comment *tc);
|
||||
|
||||
/**Encoder control function.
|
||||
* This is used to provide advanced control of the encoding process.
|
||||
* \param th A #theora_state handle.
|
||||
* \param req The control code to process.
|
||||
* See \ref encctlcodes_old "the list of available
|
||||
* control codes" for details.
|
||||
* \param buf The parameters for this control code.
|
||||
* \param buf_sz The size of the parameter buffer.*/
|
||||
extern int theora_control(theora_state *th,int req,void *buf,size_t buf_sz);
|
||||
|
||||
/* @} */ /* end oldfuncs doxygen group */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* _O_THEORA_H_ */
|
||||
333
engine/thirdparty/libtheora/theora/theoradec.h
vendored
Normal file
333
engine/thirdparty/libtheora/theora/theoradec.h
vendored
Normal file
|
|
@ -0,0 +1,333 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/**\file
|
||||
* The <tt>libtheoradec</tt> C decoding API.*/
|
||||
|
||||
#if !defined(_O_THEORA_THEORADEC_H_)
|
||||
# define _O_THEORA_THEORADEC_H_ (1)
|
||||
# include <stddef.h>
|
||||
# include <ogg/ogg.h>
|
||||
# include "codec.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/**\name th_decode_ctl() codes
|
||||
* \anchor decctlcodes
|
||||
* These are the available request codes for th_decode_ctl().
|
||||
* By convention, these are odd, to distinguish them from the
|
||||
* \ref encctlcodes "encoder control codes".
|
||||
* Keep any experimental or vendor-specific values above \c 0x8000.*/
|
||||
/*@{*/
|
||||
/**Gets the maximum post-processing level.
|
||||
* The decoder supports a post-processing filter that can improve
|
||||
* the appearance of the decoded images. This returns the highest
|
||||
* level setting for this post-processor, corresponding to maximum
|
||||
* improvement and computational expense.
|
||||
*
|
||||
* \param[out] _buf int: The maximum post-processing level.
|
||||
* \retval TH_EFAULT \a _dec_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_DECCTL_GET_PPLEVEL_MAX (1)
|
||||
/**Sets the post-processing level.
|
||||
* By default, post-processing is disabled.
|
||||
*
|
||||
* Sets the level of post-processing to use when decoding the
|
||||
* compressed stream. This must be a value between zero (off)
|
||||
* and the maximum returned by TH_DECCTL_GET_PPLEVEL_MAX.
|
||||
*
|
||||
* \param[in] _buf int: The new post-processing level.
|
||||
* 0 to disable; larger values use more CPU.
|
||||
* \retval TH_EFAULT \a _dec_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||
* post-processing level is out of bounds.
|
||||
* The maximum post-processing level may be
|
||||
* implementation-specific, and can be obtained via
|
||||
* #TH_DECCTL_GET_PPLEVEL_MAX.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_DECCTL_SET_PPLEVEL (3)
|
||||
/**Sets the granule position.
|
||||
* Call this after a seek, before decoding the first frame, to ensure that the
|
||||
* proper granule position is returned for all subsequent frames.
|
||||
* If you track timestamps yourself and do not use the granule position
|
||||
* returned by the decoder, then you need not call this function.
|
||||
*
|
||||
* \param[in] _buf <tt>ogg_int64_t</tt>: The granule position of the next
|
||||
* frame.
|
||||
* \retval TH_EFAULT \a _dec_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_int64_t)</tt>, or the
|
||||
* granule position is negative.*/
|
||||
#define TH_DECCTL_SET_GRANPOS (5)
|
||||
/**Sets the striped decode callback function.
|
||||
* If set, this function will be called as each piece of a frame is fully
|
||||
* decoded in th_decode_packetin().
|
||||
* You can pass in a #th_stripe_callback with
|
||||
* th_stripe_callback#stripe_decoded set to <tt>NULL</tt> to disable the
|
||||
* callbacks at any point.
|
||||
* Enabling striped decode does not prevent you from calling
|
||||
* th_decode_ycbcr_out() after the frame is fully decoded.
|
||||
*
|
||||
* \param[in] _buf #th_stripe_callback: The callback parameters.
|
||||
* \retval TH_EFAULT \a _dec_ctx or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not
|
||||
* <tt>sizeof(th_stripe_callback)</tt>.*/
|
||||
#define TH_DECCTL_SET_STRIPE_CB (7)
|
||||
|
||||
/**Sets the macroblock display mode. Set to 0 to disable displaying
|
||||
* macroblocks.*/
|
||||
#define TH_DECCTL_SET_TELEMETRY_MBMODE (9)
|
||||
/**Sets the motion vector display mode. Set to 0 to disable displaying motion
|
||||
* vectors.*/
|
||||
#define TH_DECCTL_SET_TELEMETRY_MV (11)
|
||||
/**Sets the adaptive quantization display mode. Set to 0 to disable displaying
|
||||
* adaptive quantization. */
|
||||
#define TH_DECCTL_SET_TELEMETRY_QI (13)
|
||||
/**Sets the bitstream breakdown visualization mode. Set to 0 to disable
|
||||
* displaying bitstream breakdown.*/
|
||||
#define TH_DECCTL_SET_TELEMETRY_BITS (15)
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
/**A callback function for striped decode.
|
||||
* This is a function pointer to an application-provided function that will be
|
||||
* called each time a section of the image is fully decoded in
|
||||
* th_decode_packetin().
|
||||
* This allows the application to process the section immediately, while it is
|
||||
* still in cache.
|
||||
* Note that the frame is decoded bottom to top, so \a _yfrag0 will steadily
|
||||
* decrease with each call until it reaches 0, at which point the full frame
|
||||
* is decoded.
|
||||
* The number of fragment rows made available in each call depends on the pixel
|
||||
* format and the number of post-processing filters enabled, and may not even
|
||||
* be constant for the entire frame.
|
||||
* If a non-<tt>NULL</tt> \a _granpos pointer is passed to
|
||||
* th_decode_packetin(), the granule position for the frame will be stored
|
||||
* in it before the first callback is made.
|
||||
* If an entire frame is dropped (a 0-byte packet), then no callbacks will be
|
||||
* made at all for that frame.
|
||||
* \param _ctx An application-provided context pointer.
|
||||
* \param _buf The image buffer for the decoded frame.
|
||||
* \param _yfrag0 The Y coordinate of the first row of 8x8 fragments
|
||||
* decoded.
|
||||
* Multiply this by 8 to obtain the pixel row number in the
|
||||
* luma plane.
|
||||
* If the chroma planes are subsampled in the Y direction,
|
||||
* this will always be divisible by two.
|
||||
* \param _yfrag_end The Y coordinate of the first row of 8x8 fragments past
|
||||
* the newly decoded section.
|
||||
* If the chroma planes are subsampled in the Y direction,
|
||||
* this will always be divisible by two.
|
||||
* I.e., this section contains fragment rows
|
||||
* <tt>\a _yfrag0 ...\a _yfrag_end -1</tt>.*/
|
||||
typedef void (*th_stripe_decoded_func)(void *_ctx,th_ycbcr_buffer _buf,
|
||||
int _yfrag0,int _yfrag_end);
|
||||
|
||||
/**The striped decode callback data to pass to #TH_DECCTL_SET_STRIPE_CB.*/
|
||||
typedef struct{
|
||||
/**An application-provided context pointer.
|
||||
* This will be passed back verbatim to the application.*/
|
||||
void *ctx;
|
||||
/**The callback function pointer; set to <tt>NULL</tt> to disable the callbacks.*/
|
||||
th_stripe_decoded_func stripe_decoded;
|
||||
}th_stripe_callback;
|
||||
|
||||
|
||||
|
||||
/**\name Decoder state
|
||||
The following data structures are opaque, and their contents are not
|
||||
publicly defined by this API.
|
||||
Referring to their internals directly is unsupported, and may break without
|
||||
warning.*/
|
||||
/*@{*/
|
||||
/**The decoder context.*/
|
||||
typedef struct th_dec_ctx th_dec_ctx;
|
||||
/**Setup information.
|
||||
This contains auxiliary information (Huffman tables and quantization
|
||||
parameters) decoded from the setup header by th_decode_headerin() to be
|
||||
passed to th_decode_alloc().
|
||||
It can be re-used to initialize any number of decoders, and can be freed
|
||||
via th_setup_free() at any time.*/
|
||||
typedef struct th_setup_info th_setup_info;
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
/**\defgroup decfuncs Functions for Decoding*/
|
||||
/*@{*/
|
||||
/**\name Functions for decoding
|
||||
* You must link to <tt>libtheoradec</tt> if you use any of the
|
||||
* functions in this section.
|
||||
*
|
||||
* The functions are listed in the order they are used in a typical decode.
|
||||
* The basic steps are:
|
||||
* - Parse the header packets by repeatedly calling th_decode_headerin().
|
||||
* - Allocate a #th_dec_ctx handle with th_decode_alloc().
|
||||
* - Call th_setup_free() to free any memory used for codec setup
|
||||
* information.
|
||||
* - Perform any additional decoder configuration with th_decode_ctl().
|
||||
* - For each video data packet:
|
||||
* - Submit the packet to the decoder via th_decode_packetin().
|
||||
* - Retrieve the uncompressed video data via th_decode_ycbcr_out().
|
||||
* - Call th_decode_free() to release all decoder memory.*/
|
||||
/*@{*/
|
||||
/**Decodes the header packets of a Theora stream.
|
||||
* This should be called on the initial packets of the stream, in succession,
|
||||
* until it returns <tt>0</tt>, indicating that all headers have been
|
||||
* processed, or an error is encountered.
|
||||
* At least three header packets are required, and additional optional header
|
||||
* packets may follow.
|
||||
* This can be used on the first packet of any logical stream to determine if
|
||||
* that stream is a Theora stream.
|
||||
* \param _info A #th_info structure to fill in.
|
||||
* This must have been previously initialized with
|
||||
* th_info_init().
|
||||
* The application may immediately begin using the contents of
|
||||
* this structure after the first header is decoded, though it
|
||||
* must continue to be passed in on all subsequent calls.
|
||||
* \param _tc A #th_comment structure to fill in.
|
||||
* The application may immediately begin using the contents of
|
||||
* this structure after the second header is decoded, though it
|
||||
* must continue to be passed in on all subsequent calls.
|
||||
* \param _setup Returns a pointer to additional, private setup information
|
||||
* needed by the decoder.
|
||||
* The contents of this pointer must be initialized to
|
||||
* <tt>NULL</tt> on the first call, and the returned value must
|
||||
* continue to be passed in on all subsequent calls.
|
||||
* \param _op An <tt>ogg_packet</tt> structure which contains one of the
|
||||
* initial packets of an Ogg logical stream.
|
||||
* \return A positive value indicates that a Theora header was successfully
|
||||
* processed.
|
||||
* \retval 0 The first video data packet was encountered after all
|
||||
* required header packets were parsed.
|
||||
* The packet just passed in on this call should be saved
|
||||
* and fed to th_decode_packetin() to begin decoding
|
||||
* video data.
|
||||
* \retval TH_EFAULT One of \a _info, \a _tc, or \a _setup was
|
||||
* <tt>NULL</tt>.
|
||||
* \retval TH_EBADHEADER \a _op was <tt>NULL</tt>, the packet was not the next
|
||||
* header packet in the expected sequence, or the format
|
||||
* of the header data was invalid.
|
||||
* \retval TH_EVERSION The packet data was a Theora info header, but for a
|
||||
* bitstream version not decodable with this version of
|
||||
* <tt>libtheoradec</tt>.
|
||||
* \retval TH_ENOTFORMAT The packet was not a Theora header.
|
||||
*/
|
||||
extern int th_decode_headerin(th_info *_info,th_comment *_tc,
|
||||
th_setup_info **_setup,ogg_packet *_op);
|
||||
/**Allocates a decoder instance.
|
||||
*
|
||||
* <b>Security Warning:</b> The Theora format supports very large frame sizes,
|
||||
* potentially even larger than the address space of a 32-bit machine, and
|
||||
* creating a decoder context allocates the space for several frames of data.
|
||||
* If the allocation fails here, your program will crash, possibly at some
|
||||
* future point because the OS kernel returned a valid memory range and will
|
||||
* only fail when it tries to map the pages in it the first time they are
|
||||
* used.
|
||||
* Even if it succeeds, you may experience a denial of service if the frame
|
||||
* size is large enough to cause excessive paging.
|
||||
* If you are integrating libtheora in a larger application where such things
|
||||
* are undesirable, it is highly recommended that you check the frame size in
|
||||
* \a _info before calling this function and refuse to decode streams where it
|
||||
* is larger than some reasonable maximum.
|
||||
* libtheora will not check this for you, because there may be machines that
|
||||
* can handle such streams and applications that wish to.
|
||||
* \param _info A #th_info struct filled via th_decode_headerin().
|
||||
* \param _setup A #th_setup_info handle returned via
|
||||
* th_decode_headerin().
|
||||
* \return The initialized #th_dec_ctx handle.
|
||||
* \retval NULL If the decoding parameters were invalid.*/
|
||||
extern th_dec_ctx *th_decode_alloc(const th_info *_info,
|
||||
const th_setup_info *_setup);
|
||||
/**Releases all storage used for the decoder setup information.
|
||||
* This should be called after you no longer want to create any decoders for
|
||||
* a stream whose headers you have parsed with th_decode_headerin().
|
||||
* \param _setup The setup information to free.
|
||||
* This can safely be <tt>NULL</tt>.*/
|
||||
extern void th_setup_free(th_setup_info *_setup);
|
||||
/**Decoder control function.
|
||||
* This is used to provide advanced control of the decoding process.
|
||||
* \param _dec A #th_dec_ctx handle.
|
||||
* \param _req The control code to process.
|
||||
* See \ref decctlcodes "the list of available control codes"
|
||||
* for details.
|
||||
* \param _buf The parameters for this control code.
|
||||
* \param _buf_sz The size of the parameter buffer.
|
||||
* \return Possible return values depend on the control code used.
|
||||
* See \ref decctlcodes "the list of control codes" for
|
||||
* specific values. Generally 0 indicates success.*/
|
||||
extern int th_decode_ctl(th_dec_ctx *_dec,int _req,void *_buf,
|
||||
size_t _buf_sz);
|
||||
/**Submits a packet containing encoded video data to the decoder.
|
||||
* \param _dec A #th_dec_ctx handle.
|
||||
* \param _op An <tt>ogg_packet</tt> containing encoded video data.
|
||||
* \param _granpos Returns the granule position of the decoded packet.
|
||||
* If non-<tt>NULL</tt>, the granule position for this specific
|
||||
* packet is stored in this location.
|
||||
* This is computed incrementally from previously decoded
|
||||
* packets.
|
||||
* After a seek, the correct granule position must be set via
|
||||
* #TH_DECCTL_SET_GRANPOS for this to work properly.
|
||||
* \retval 0 Success.
|
||||
* A new decoded frame can be retrieved by calling
|
||||
* th_decode_ycbcr_out().
|
||||
* \retval TH_DUPFRAME The packet represented a dropped frame (either a
|
||||
* 0-byte frame or an INTER frame with no coded blocks).
|
||||
* The player can skip the call to th_decode_ycbcr_out(),
|
||||
* as the contents of the decoded frame buffer have not
|
||||
* changed.
|
||||
* \retval TH_EFAULT \a _dec or \a _op was <tt>NULL</tt>.
|
||||
* \retval TH_EBADPACKET \a _op does not contain encoded video data.
|
||||
* \retval TH_EIMPL The video data uses bitstream features which this
|
||||
* library does not support.*/
|
||||
extern int th_decode_packetin(th_dec_ctx *_dec,const ogg_packet *_op,
|
||||
ogg_int64_t *_granpos);
|
||||
/**Outputs the next available frame of decoded Y'CbCr data.
|
||||
* If a striped decode callback has been set with #TH_DECCTL_SET_STRIPE_CB,
|
||||
* then the application does not need to call this function.
|
||||
* \param _dec A #th_dec_ctx handle.
|
||||
* \param _ycbcr A video buffer structure to fill in.
|
||||
* <tt>libtheoradec</tt> will fill in all the members of this
|
||||
* structure, including the pointers to the uncompressed video
|
||||
* data.
|
||||
* The memory for this video data is owned by
|
||||
* <tt>libtheoradec</tt>.
|
||||
* It may be freed or overwritten without notification when
|
||||
* subsequent frames are decoded.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _dec or \a _ycbcr was <tt>NULL</tt>.
|
||||
*/
|
||||
extern int th_decode_ycbcr_out(th_dec_ctx *_dec,
|
||||
th_ycbcr_buffer _ycbcr);
|
||||
/**Frees an allocated decoder instance.
|
||||
* \param _dec A #th_dec_ctx handle.*/
|
||||
extern void th_decode_free(th_dec_ctx *_dec);
|
||||
/*@}*/
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
548
engine/thirdparty/libtheora/theora/theoraenc.h
vendored
Normal file
548
engine/thirdparty/libtheora/theora/theoraenc.h
vendored
Normal file
|
|
@ -0,0 +1,548 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: theora.h,v 1.8 2004/03/15 22:17:32 derf Exp $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/**\file
|
||||
* The <tt>libtheoraenc</tt> C encoding API.*/
|
||||
|
||||
#if !defined(_O_THEORA_THEORAENC_H_)
|
||||
# define _O_THEORA_THEORAENC_H_ (1)
|
||||
# include <stddef.h>
|
||||
# include <ogg/ogg.h>
|
||||
# include "codec.h"
|
||||
|
||||
#if defined(__cplusplus)
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/**\name th_encode_ctl() codes
|
||||
* \anchor encctlcodes
|
||||
* These are the available request codes for th_encode_ctl().
|
||||
* By convention, these are even, to distinguish them from the
|
||||
* \ref decctlcodes "decoder control codes".
|
||||
* Keep any experimental or vendor-specific values above \c 0x8000.*/
|
||||
/*@{*/
|
||||
/**Sets the Huffman tables to use.
|
||||
* The tables are copied, not stored by reference, so they can be freed after
|
||||
* this call.
|
||||
* <tt>NULL</tt> may be specified to revert to the default tables.
|
||||
*
|
||||
* \param[in] _buf <tt>#th_huff_code[#TH_NHUFFMAN_TABLES][#TH_NDCT_TOKENS]</tt>
|
||||
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL Encoding has already begun or one or more of the given
|
||||
* tables is not full or prefix-free, \a _buf is
|
||||
* <tt>NULL</tt> and \a _buf_sz is not zero, or \a _buf is
|
||||
* non-<tt>NULL</tt> and \a _buf_sz is not
|
||||
* <tt>sizeof(#th_huff_code)*#TH_NHUFFMAN_TABLES*#TH_NDCT_TOKENS</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_HUFFMAN_CODES (0)
|
||||
/**Sets the quantization parameters to use.
|
||||
* The parameters are copied, not stored by reference, so they can be freed
|
||||
* after this call.
|
||||
* <tt>NULL</tt> may be specified to revert to the default parameters.
|
||||
*
|
||||
* \param[in] _buf #th_quant_info
|
||||
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL Encoding has already begun, \a _buf is
|
||||
* <tt>NULL</tt> and \a _buf_sz is not zero,
|
||||
* or \a _buf is non-<tt>NULL</tt> and
|
||||
* \a _buf_sz is not <tt>sizeof(#th_quant_info)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_QUANT_PARAMS (2)
|
||||
/**Sets the maximum distance between key frames.
|
||||
* This can be changed during an encode, but will be bounded by
|
||||
* <tt>1<<th_info#keyframe_granule_shift</tt>.
|
||||
* If it is set before encoding begins, th_info#keyframe_granule_shift will
|
||||
* be enlarged appropriately.
|
||||
*
|
||||
* \param[in] _buf <tt>ogg_uint32_t</tt>: The maximum distance between key
|
||||
* frames.
|
||||
* \param[out] _buf <tt>ogg_uint32_t</tt>: The actual maximum distance set.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(ogg_uint32_t)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE (4)
|
||||
/**Disables any encoder features that would prevent lossless transcoding back
|
||||
* to VP3.
|
||||
* This primarily means disabling block-adaptive quantization and always coding
|
||||
* all four luma blocks in a macro block when 4MV is used.
|
||||
* It also includes using the VP3 quantization tables and Huffman codes; if you
|
||||
* set them explicitly after calling this function, the resulting stream will
|
||||
* not be VP3-compatible.
|
||||
* If you enable VP3-compatibility when encoding 4:2:2 or 4:4:4 source
|
||||
* material, or when using a picture region smaller than the full frame (e.g.
|
||||
* a non-multiple-of-16 width or height), then non-VP3 bitstream features will
|
||||
* still be disabled, but the stream will still not be VP3-compatible, as VP3
|
||||
* was not capable of encoding such formats.
|
||||
* If you call this after encoding has already begun, then the quantization
|
||||
* tables and codebooks cannot be changed, but the frame-level features will
|
||||
* be enabled or disabled as requested.
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: a non-zero value to enable VP3 compatibility,
|
||||
* or 0 to disable it (the default).
|
||||
* \param[out] _buf <tt>int</tt>: 1 if all bitstream features required for
|
||||
* VP3-compatibility could be set, and 0 otherwise.
|
||||
* The latter will be returned if the pixel format is not
|
||||
* 4:2:0, the picture region is smaller than the full frame,
|
||||
* or if encoding has begun, preventing the quantization
|
||||
* tables and codebooks from being set.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_VP3_COMPATIBLE (10)
|
||||
/**Gets the maximum speed level.
|
||||
* Higher speed levels favor quicker encoding over better quality per bit.
|
||||
* Depending on the encoding mode, and the internal algorithms used, quality
|
||||
* may actually improve, but in this case bitrate will also likely increase.
|
||||
* In any case, overall rate/distortion performance will probably decrease.
|
||||
* The maximum value, and the meaning of each value, may change depending on
|
||||
* the current encoding mode (VBR vs. constant quality, etc.).
|
||||
*
|
||||
* \param[out] _buf <tt>int</tt>: The maximum encoding speed level.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_GET_SPLEVEL_MAX (12)
|
||||
/**Sets the speed level.
|
||||
* The current speed level may be retrieved using #TH_ENCCTL_GET_SPLEVEL.
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: The new encoding speed level.
|
||||
* 0 is slowest, larger values use less CPU.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||
* encoding speed level is out of bounds.
|
||||
* The maximum encoding speed level may be
|
||||
* implementation- and encoding mode-specific, and can be
|
||||
* obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_SET_SPLEVEL (14)
|
||||
/**Gets the current speed level.
|
||||
* The default speed level may vary according to encoder implementation, but if
|
||||
* this control code is not supported (it returns #TH_EIMPL), the default may
|
||||
* be assumed to be the slowest available speed (0).
|
||||
* The maximum encoding speed level may be implementation- and encoding
|
||||
* mode-specific, and can be obtained via #TH_ENCCTL_GET_SPLEVEL_MAX.
|
||||
*
|
||||
* \param[out] _buf <tt>int</tt>: The current encoding speed level.
|
||||
* 0 is slowest, larger values use less CPU.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_GET_SPLEVEL (16)
|
||||
/**Sets the number of duplicates of the next frame to produce.
|
||||
* Although libtheora can encode duplicate frames very cheaply, it costs some
|
||||
* amount of CPU to detect them, and a run of duplicates cannot span a
|
||||
* keyframe boundary.
|
||||
* This control code tells the encoder to produce the specified number of extra
|
||||
* duplicates of the next frame.
|
||||
* This allows the encoder to make smarter keyframe placement decisions and
|
||||
* rate control decisions, and reduces CPU usage as well, when compared to
|
||||
* just submitting the same frame for encoding multiple times.
|
||||
* This setting only applies to the next frame submitted for encoding.
|
||||
* You MUST call th_encode_packetout() repeatedly until it returns 0, or the
|
||||
* extra duplicate frames will be lost.
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: The number of duplicates to produce.
|
||||
* If this is negative or zero, no duplicates will be produced.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or the
|
||||
* number of duplicates is greater than or equal to the
|
||||
* maximum keyframe interval.
|
||||
* In the latter case, NO duplicate frames will be produced.
|
||||
* You must ensure that the maximum keyframe interval is set
|
||||
* larger than the maximum number of duplicates you will
|
||||
* ever wish to insert prior to encoding.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_SET_DUP_COUNT (18)
|
||||
/**Modifies the default bitrate management behavior.
|
||||
* Use to allow or disallow frame dropping, and to enable or disable capping
|
||||
* bit reservoir overflows and underflows.
|
||||
* See \ref ratectlflags "the list of available flags".
|
||||
* The flags are set by default to
|
||||
* <tt>#TH_RATECTL_DROP_FRAMES|#TH_RATECTL_CAP_OVERFLOW</tt>.
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: Any combination of
|
||||
* \ref ratectlflags "the available flags":
|
||||
* - #TH_RATECTL_DROP_FRAMES: Enable frame dropping.
|
||||
* - #TH_RATECTL_CAP_OVERFLOW: Don't bank excess bits for later
|
||||
* use.
|
||||
* - #TH_RATECTL_CAP_UNDERFLOW: Don't try to make up shortfalls
|
||||
* later.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt> or rate control
|
||||
* is not enabled.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_SET_RATE_FLAGS (20)
|
||||
/**Sets the size of the bitrate management bit reservoir as a function
|
||||
* of number of frames.
|
||||
* The reservoir size affects how quickly bitrate management reacts to
|
||||
* instantaneous changes in the video complexity.
|
||||
* Larger reservoirs react more slowly, and provide better overall quality, but
|
||||
* require more buffering by a client, adding more latency to live streams.
|
||||
* By default, libtheora sets the reservoir to the maximum distance between
|
||||
* keyframes, subject to a minimum and maximum limit.
|
||||
* This call may be used to increase or decrease the reservoir, increasing or
|
||||
* decreasing the allowed temporary variance in bitrate.
|
||||
* An implementation may impose some limits on the size of a reservoir it can
|
||||
* handle, in which case the actual reservoir size may not be exactly what was
|
||||
* requested.
|
||||
* The actual value set will be returned.
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: Requested size of the reservoir measured in
|
||||
* frames.
|
||||
* \param[out] _buf <tt>int</tt>: The actual size of the reservoir set.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(int)</tt>, or rate control
|
||||
* is not enabled. The buffer has an implementation
|
||||
* defined minimum and maximum size and the value in _buf
|
||||
* will be adjusted to match the actual value set.
|
||||
* \retval TH_EIMPL Not supported by this implementation in the current
|
||||
* encoding mode.*/
|
||||
#define TH_ENCCTL_SET_RATE_BUFFER (22)
|
||||
/**Enable pass 1 of two-pass encoding mode and retrieve the first pass metrics.
|
||||
* Pass 1 mode must be enabled before the first frame is encoded, and a target
|
||||
* bitrate must have already been specified to the encoder.
|
||||
* Although this does not have to be the exact rate that will be used in the
|
||||
* second pass, closer values may produce better results.
|
||||
* The first call returns the size of the two-pass header data, along with some
|
||||
* placeholder content.
|
||||
* This call sets the encoder to pass 1 mode implicitly.
|
||||
* Then, a subsequent call must be made after each call to
|
||||
* th_encode_ycbcr_in() to retrieve the metrics for that frame.
|
||||
* An additional, final call must be made to retrieve the summary data,
|
||||
* containing such information as the total number of frames, etc.
|
||||
* This must be stored in place of the placeholder data that was returned
|
||||
* in the first call, before the frame metrics data.
|
||||
* All of this data must be presented back to the encoder during pass 2 using
|
||||
* #TH_ENCCTL_2PASS_IN.
|
||||
*
|
||||
* \param[out] _buf <tt>char *</tt>: Returns a pointer to internal storage
|
||||
* containing the two pass metrics data.
|
||||
* This storage is only valid until the next call, or until the
|
||||
* encoder context is freed, and must be copied by the
|
||||
* application.
|
||||
* \retval >=0 The number of bytes of metric data available in the
|
||||
* returned buffer.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL \a _buf_sz is not <tt>sizeof(char *)</tt>, no target
|
||||
* bitrate has been set, or the first call was made after
|
||||
* the first frame was submitted for encoding.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_2PASS_OUT (24)
|
||||
/**Submits two-pass encoding metric data collected in the first encoding pass to
|
||||
* the second pass.
|
||||
* The first call must be made before the first frame is encoded, and a target
|
||||
* bitrate must have already been specified to the encoder.
|
||||
* It sets the encoder to pass 2 mode implicitly; this cannot be disabled.
|
||||
* The encoder may require reading data from some or all of the frames in
|
||||
* advance, depending on, e.g., the reservoir size used in the second pass.
|
||||
* You must call this function repeatedly before each frame to provide data
|
||||
* until either a) it fails to consume all of the data presented or b) all of
|
||||
* the pass 1 data has been consumed.
|
||||
* In the first case, you must save the remaining data to be presented after
|
||||
* the next frame.
|
||||
* You can call this function with a NULL argument to get an upper bound on
|
||||
* the number of bytes that will be required before the next frame.
|
||||
*
|
||||
* When pass 2 is first enabled, the default bit reservoir is set to the entire
|
||||
* file; this gives maximum flexibility but can lead to very high peak rates.
|
||||
* You can subsequently set it to another value with #TH_ENCCTL_SET_RATE_BUFFER
|
||||
* (e.g., to set it to the keyframe interval for non-live streaming), however,
|
||||
* you may then need to provide more data before the next frame.
|
||||
*
|
||||
* \param[in] _buf <tt>char[]</tt>: A buffer containing the data returned by
|
||||
* #TH_ENCCTL_2PASS_OUT in pass 1.
|
||||
* You may pass <tt>NULL</tt> for \a _buf to return an upper
|
||||
* bound on the number of additional bytes needed before the
|
||||
* next frame.
|
||||
* The summary data returned at the end of pass 1 must be at
|
||||
* the head of the buffer on the first call with a
|
||||
* non-<tt>NULL</tt> \a _buf, and the placeholder data
|
||||
* returned at the start of pass 1 should be omitted.
|
||||
* After each call you should advance this buffer by the number
|
||||
* of bytes consumed.
|
||||
* \retval >0 The number of bytes of metric data required/consumed.
|
||||
* \retval 0 No more data is required before the next frame.
|
||||
* \retval TH_EFAULT \a _enc is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL No target bitrate has been set, or the first call was
|
||||
* made after the first frame was submitted for
|
||||
* encoding.
|
||||
* \retval TH_ENOTFORMAT The data did not appear to be pass 1 from a compatible
|
||||
* implementation of this library.
|
||||
* \retval TH_EBADHEADER The data was invalid; this may be returned when
|
||||
* attempting to read an aborted pass 1 file that still
|
||||
* has the placeholder data in place of the summary
|
||||
* data.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_2PASS_IN (26)
|
||||
/**Sets the current encoding quality.
|
||||
* This is only valid so long as no bitrate has been specified, either through
|
||||
* the #th_info struct used to initialize the encoder or through
|
||||
* #TH_ENCCTL_SET_BITRATE (this restriction may be relaxed in a future
|
||||
* version).
|
||||
* If it is set before the headers are emitted, the target quality encoded in
|
||||
* them will be updated.
|
||||
*
|
||||
* \param[in] _buf <tt>int</tt>: The new target quality, in the range 0...63,
|
||||
* inclusive.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL A target bitrate has already been specified, or the
|
||||
* quality index was not in the range 0...63.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_QUALITY (28)
|
||||
/**Sets the current encoding bitrate.
|
||||
* Once a bitrate is set, the encoder must use a rate-controlled mode for all
|
||||
* future frames (this restriction may be relaxed in a future version).
|
||||
* If it is set before the headers are emitted, the target bitrate encoded in
|
||||
* them will be updated.
|
||||
* Due to the buffer delay, the exact bitrate of each section of the encode is
|
||||
* not guaranteed.
|
||||
* The encoder may have already used more bits than allowed for the frames it
|
||||
* has encoded, expecting to make them up in future frames, or it may have
|
||||
* used fewer, holding the excess in reserve.
|
||||
* The exact transition between the two bitrates is not well-defined by this
|
||||
* API, but may be affected by flags set with #TH_ENCCTL_SET_RATE_FLAGS.
|
||||
* After a number of frames equal to the buffer delay, one may expect further
|
||||
* output to average at the target bitrate.
|
||||
*
|
||||
* \param[in] _buf <tt>long</tt>: The new target bitrate, in bits per second.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL The target bitrate was not positive.
|
||||
* A future version of this library may allow passing 0
|
||||
* to disable rate-controlled mode and return to a
|
||||
* quality-based mode, in which case this function will
|
||||
* not return an error for that value.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_BITRATE (30)
|
||||
/**Sets the configuration to be compatible with that from the given setup
|
||||
* header.
|
||||
* This sets the Huffman codebooks and quantization parameters to match those
|
||||
* found in the given setup header.
|
||||
* This guarantees that packets encoded by this encoder will be decodable using
|
||||
* a decoder configured with the passed-in setup header.
|
||||
* It does <em>not</em> guarantee that th_encode_flushheader() will produce a
|
||||
* bit-identical setup header, only that they will be compatible.
|
||||
* If you need a bit-identical setup header, then use the one you passed into
|
||||
* this command, and not the one returned by th_encode_flushheader().
|
||||
*
|
||||
* This also does <em>not</em> enable or disable VP3 compatibility; that is not
|
||||
* signaled in the setup header (or anywhere else in the encoded stream), and
|
||||
* is controlled independently by the #TH_ENCCTL_SET_VP3_COMPATIBLE function.
|
||||
* If you wish to enable VP3 compatibility mode <em>and</em> want the codebooks
|
||||
* and quantization parameters to match the given setup header, you should
|
||||
* enable VP3 compatibility before invoking this command, otherwise the
|
||||
* codebooks and quantization parameters will be reset to the VP3 defaults.
|
||||
*
|
||||
* The current encoder does not support Huffman codebooks which do not contain
|
||||
* codewords for all 32 tokens.
|
||||
* Such codebooks are legal, according to the specification, but cannot be
|
||||
* configured with this function.
|
||||
*
|
||||
* \param[in] _buf <tt>unsigned char[]</tt>: The encoded setup header to copy
|
||||
* the configuration from.
|
||||
* This should be the original,
|
||||
* undecoded setup header packet,
|
||||
* and <em>not</em> a #th_setup_info
|
||||
* structure filled in by
|
||||
* th_decode_headerin().
|
||||
* \retval TH_EFAULT \a _enc or \a _buf is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL Encoding has already begun, so the codebooks and
|
||||
* quantization parameters cannot be changed, or the
|
||||
* data in the setup header was not supported by this
|
||||
* encoder.
|
||||
* \retval TH_EBADHEADER \a _buf did not contain a valid setup header packet.
|
||||
* \retval TH_ENOTFORMAT \a _buf did not contain a Theora header at all.
|
||||
* \retval TH_EIMPL Not supported by this implementation.*/
|
||||
#define TH_ENCCTL_SET_COMPAT_CONFIG (32)
|
||||
|
||||
/*@}*/
|
||||
|
||||
|
||||
/**\name TH_ENCCTL_SET_RATE_FLAGS flags
|
||||
* \anchor ratectlflags
|
||||
* These are the flags available for use with #TH_ENCCTL_SET_RATE_FLAGS.*/
|
||||
/*@{*/
|
||||
/**Drop frames to keep within bitrate buffer constraints.
|
||||
* This can have a severe impact on quality, but is the only way to ensure that
|
||||
* bitrate targets are met at low rates during sudden bursts of activity.
|
||||
* It is enabled by default.*/
|
||||
#define TH_RATECTL_DROP_FRAMES (0x1)
|
||||
/**Ignore bitrate buffer overflows.
|
||||
* If the encoder uses so few bits that the reservoir of available bits
|
||||
* overflows, ignore the excess.
|
||||
* The encoder will not try to use these extra bits in future frames.
|
||||
* At high rates this may cause the result to be undersized, but allows a
|
||||
* client to play the stream using a finite buffer; it should normally be
|
||||
* enabled, which is the default.*/
|
||||
#define TH_RATECTL_CAP_OVERFLOW (0x2)
|
||||
/**Ignore bitrate buffer underflows.
|
||||
* If the encoder uses so many bits that the reservoir of available bits
|
||||
* underflows, ignore the deficit.
|
||||
* The encoder will not try to make up these extra bits in future frames.
|
||||
* At low rates this may cause the result to be oversized; it should normally
|
||||
* be disabled, which is the default.*/
|
||||
#define TH_RATECTL_CAP_UNDERFLOW (0x4)
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
/**The quantization parameters used by VP3.*/
|
||||
extern const th_quant_info TH_VP31_QUANT_INFO;
|
||||
|
||||
/**The Huffman tables used by VP3.*/
|
||||
extern const th_huff_code
|
||||
TH_VP31_HUFF_CODES[TH_NHUFFMAN_TABLES][TH_NDCT_TOKENS];
|
||||
|
||||
|
||||
|
||||
/**\name Encoder state
|
||||
The following data structure is opaque, and its contents are not publicly
|
||||
defined by this API.
|
||||
Referring to its internals directly is unsupported, and may break without
|
||||
warning.*/
|
||||
/*@{*/
|
||||
/**The encoder context.*/
|
||||
typedef struct th_enc_ctx th_enc_ctx;
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
/**\defgroup encfuncs Functions for Encoding*/
|
||||
/*@{*/
|
||||
/**\name Functions for encoding
|
||||
* You must link to <tt>libtheoraenc</tt> and <tt>libtheoradec</tt>
|
||||
* if you use any of the functions in this section.
|
||||
*
|
||||
* The functions are listed in the order they are used in a typical encode.
|
||||
* The basic steps are:
|
||||
* - Fill in a #th_info structure with details on the format of the video you
|
||||
* wish to encode.
|
||||
* - Allocate a #th_enc_ctx handle with th_encode_alloc().
|
||||
* - Perform any additional encoder configuration required with
|
||||
* th_encode_ctl().
|
||||
* - Repeatedly call th_encode_flushheader() to retrieve all the header
|
||||
* packets.
|
||||
* - For each uncompressed frame:
|
||||
* - Submit the uncompressed frame via th_encode_ycbcr_in()
|
||||
* - Repeatedly call th_encode_packetout() to retrieve any video
|
||||
* data packets that are ready.
|
||||
* - Call th_encode_free() to release all encoder memory.*/
|
||||
/*@{*/
|
||||
/**Allocates an encoder instance.
|
||||
* \param _info A #th_info struct filled with the desired encoding parameters.
|
||||
* \return The initialized #th_enc_ctx handle.
|
||||
* \retval NULL If the encoding parameters were invalid.*/
|
||||
extern th_enc_ctx *th_encode_alloc(const th_info *_info);
|
||||
/**Encoder control function.
|
||||
* This is used to provide advanced control of the encoding process.
|
||||
* \param _enc A #th_enc_ctx handle.
|
||||
* \param _req The control code to process.
|
||||
* See \ref encctlcodes "the list of available control codes"
|
||||
* for details.
|
||||
* \param _buf The parameters for this control code.
|
||||
* \param _buf_sz The size of the parameter buffer.
|
||||
* \return Possible return values depend on the control code used.
|
||||
* See \ref encctlcodes "the list of control codes" for
|
||||
* specific values. Generally 0 indicates success.*/
|
||||
extern int th_encode_ctl(th_enc_ctx *_enc,int _req,void *_buf,size_t _buf_sz);
|
||||
/**Outputs the next header packet.
|
||||
* This should be called repeatedly after encoder initialization until it
|
||||
* returns 0 in order to get all of the header packets, in order, before
|
||||
* encoding actual video data.
|
||||
* \param _enc A #th_enc_ctx handle.
|
||||
* \param _comments The metadata to place in the comment header, when it is
|
||||
* encoded.
|
||||
* \param _op An <tt>ogg_packet</tt> structure to fill.
|
||||
* All of the elements of this structure will be set,
|
||||
* including a pointer to the header data.
|
||||
* The memory for the header data is owned by
|
||||
* <tt>libtheoraenc</tt>, and may be invalidated when the
|
||||
* next encoder function is called.
|
||||
* \return A positive value indicates that a header packet was successfully
|
||||
* produced.
|
||||
* \retval 0 No packet was produced, and no more header packets remain.
|
||||
* \retval TH_EFAULT \a _enc, \a _comments, or \a _op was <tt>NULL</tt>.*/
|
||||
extern int th_encode_flushheader(th_enc_ctx *_enc,
|
||||
th_comment *_comments,ogg_packet *_op);
|
||||
/**Submits an uncompressed frame to the encoder.
|
||||
* \param _enc A #th_enc_ctx handle.
|
||||
* \param _ycbcr A buffer of Y'CbCr data to encode.
|
||||
* If the width and height of the buffer match the frame size
|
||||
* the encoder was initialized with, the encoder will only
|
||||
* reference the portion inside the picture region.
|
||||
* Any data outside this region will be ignored, and need not map
|
||||
* to a valid address.
|
||||
* Alternatively, you can pass a buffer equal to the size of the
|
||||
* picture region, if this is less than the full frame size.
|
||||
* When using subsampled chroma planes, odd picture sizes or odd
|
||||
* picture offsets may require an unexpected chroma plane size,
|
||||
* and their use is generally discouraged, as they will not be
|
||||
* well-supported by players and other media frameworks.
|
||||
* See Section 4.4 of
|
||||
* <a href="http://www.theora.org/doc/Theora.pdf">the Theora
|
||||
* specification</a> for details if you wish to use them anyway.
|
||||
* \retval 0 Success.
|
||||
* \retval TH_EFAULT \a _enc or \a _ycbcr is <tt>NULL</tt>.
|
||||
* \retval TH_EINVAL The buffer size matches neither the frame size nor the
|
||||
* picture size the encoder was initialized with, or
|
||||
* encoding has already completed.*/
|
||||
extern int th_encode_ycbcr_in(th_enc_ctx *_enc,th_ycbcr_buffer _ycbcr);
|
||||
/**Retrieves encoded video data packets.
|
||||
* This should be called repeatedly after each frame is submitted to flush any
|
||||
* encoded packets, until it returns 0.
|
||||
* The encoder will not buffer these packets as subsequent frames are
|
||||
* compressed, so a failure to do so will result in lost video data.
|
||||
* \note Currently the encoder operates in a one-frame-in, one-packet-out
|
||||
* manner.
|
||||
* However, this may be changed in the future.
|
||||
* \param _enc A #th_enc_ctx handle.
|
||||
* \param _last Set this flag to a non-zero value if no more uncompressed
|
||||
* frames will be submitted.
|
||||
* This ensures that a proper EOS flag is set on the last packet.
|
||||
* \param _op An <tt>ogg_packet</tt> structure to fill.
|
||||
* All of the elements of this structure will be set, including a
|
||||
* pointer to the video data.
|
||||
* The memory for the video data is owned by
|
||||
* <tt>libtheoraenc</tt>, and may be invalidated when the next
|
||||
* encoder function is called.
|
||||
* \return A positive value indicates that a video data packet was successfully
|
||||
* produced.
|
||||
* \retval 0 No packet was produced, and no more encoded video data
|
||||
* remains.
|
||||
* \retval TH_EFAULT \a _enc or \a _op was <tt>NULL</tt>.*/
|
||||
extern int th_encode_packetout(th_enc_ctx *_enc,int _last,ogg_packet *_op);
|
||||
/**Frees an allocated encoder instance.
|
||||
* \param _enc A #th_enc_ctx handle.*/
|
||||
extern void th_encode_free(th_enc_ctx *_enc);
|
||||
/*@}*/
|
||||
/*@}*/
|
||||
|
||||
|
||||
|
||||
#if defined(__cplusplus)
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
1368
engine/thirdparty/libtheora/tokenize.c
vendored
Normal file
1368
engine/thirdparty/libtheora/tokenize.c
vendored
Normal file
File diff suppressed because it is too large
Load diff
904
engine/thirdparty/libtheora/x86/mmxencfrag.c
vendored
Normal file
904
engine/thirdparty/libtheora/x86/mmxencfrag.c
vendored
Normal file
|
|
@ -0,0 +1,904 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
|
||||
|
||||
********************************************************************/
|
||||
#include <stddef.h>
|
||||
#include "x86enc.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Computes the sum of absolute differences (SAD) between two 8x8 blocks of
   bytes.
  Eight rows of eight bytes are read from each of _src and _ref (one movq per
   row), each pair of rows is reduced with psadbw, and the eight partial sums
   are added together in %%mm0.
  _src:     The first block.
  _ref:     The second block.
  _ystride: The offset in bytes between consecutive rows; the same stride is
             applied to both _src and _ref.
  Return: The low 32 bits of the accumulated sum in %%mm0.
  The asm advances its copies of _src and _ref by four rows (they are "+r"
   operands); this only affects the local parameter copies.
  No emms is executed here and the MMX registers are not listed as clobbers.*/
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride){
|
||||
ptrdiff_t ystride3;
|
||||
ptrdiff_t ret;
|
||||
__asm__ __volatile__(
|
||||
/*Load the first 4 rows of each block.*/
|
||||
"movq (%[src]),%%mm0\n\t"
|
||||
"movq (%[ref]),%%mm1\n\t"
|
||||
"movq (%[src],%[ystride]),%%mm2\n\t"
|
||||
"movq (%[ref],%[ystride]),%%mm3\n\t"
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
|
||||
"movq (%[src],%[ystride],2),%%mm4\n\t"
|
||||
"movq (%[ref],%[ystride],2),%%mm5\n\t"
|
||||
"movq (%[src],%[ystride3]),%%mm6\n\t"
|
||||
"movq (%[ref],%[ystride3]),%%mm7\n\t"
|
||||
/*Compute their SADs and add them in %%mm0*/
|
||||
"psadbw %%mm1,%%mm0\n\t"
|
||||
"psadbw %%mm3,%%mm2\n\t"
|
||||
"lea (%[src],%[ystride],4),%[src]\n\t"
|
||||
"paddw %%mm2,%%mm0\n\t"
|
||||
"lea (%[ref],%[ystride],4),%[ref]\n\t"
|
||||
/*Load the next 3 rows as registers become available.*/
|
||||
"movq (%[src]),%%mm2\n\t"
|
||||
"movq (%[ref]),%%mm3\n\t"
|
||||
"psadbw %%mm5,%%mm4\n\t"
|
||||
"psadbw %%mm7,%%mm6\n\t"
|
||||
"paddw %%mm4,%%mm0\n\t"
|
||||
"movq (%[ref],%[ystride]),%%mm5\n\t"
|
||||
"movq (%[src],%[ystride]),%%mm4\n\t"
|
||||
"paddw %%mm6,%%mm0\n\t"
|
||||
"movq (%[ref],%[ystride],2),%%mm7\n\t"
|
||||
"movq (%[src],%[ystride],2),%%mm6\n\t"
|
||||
/*Start adding their SADs to %%mm0*/
|
||||
"psadbw %%mm3,%%mm2\n\t"
|
||||
"psadbw %%mm5,%%mm4\n\t"
|
||||
"paddw %%mm2,%%mm0\n\t"
|
||||
"psadbw %%mm7,%%mm6\n\t"
|
||||
/*Load last row as registers become available.*/
|
||||
"movq (%[src],%[ystride3]),%%mm2\n\t"
|
||||
"movq (%[ref],%[ystride3]),%%mm3\n\t"
|
||||
/*And finish adding up their SADs.*/
|
||||
"paddw %%mm4,%%mm0\n\t"
|
||||
"psadbw %%mm3,%%mm2\n\t"
|
||||
"paddw %%mm6,%%mm0\n\t"
|
||||
"paddw %%mm2,%%mm0\n\t"
|
||||
"movd %%mm0,%[ret]\n\t"
|
||||
:[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref),[ystride3]"=&r"(ystride3)
|
||||
:[ystride]"r"((ptrdiff_t)_ystride)
|
||||
);
|
||||
return (unsigned)ret;
|
||||
}
|
||||
|
||||
/*Thresholded SAD entry point.
  This version never terminates early: _thresh is accepted so the signature
   matches the other thresholded SAD routines, but it is ignored and the full
   SAD computed by oc_enc_frag_sad_mmxext() is always returned.
  _src:     The first block.
  _ref:     The second block.
  _ystride: The offset in bytes between consecutive rows of both blocks.
  _thresh:  Unused.
  Return: The complete SAD of the two blocks.*/
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  unsigned full_sad;
  full_sad=oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
  return full_sad;
}
|
||||
|
||||
/*OC_SAD2_LOOP processes two rows per expansion.
  On exit, %[ref1], %[ref2], and %[src] have each been advanced by two rows
   (lea with 2*%[ystride]), the following two rows have been loaded into the
   same registers the current ones arrived in, and the SAD of the two rows just
   processed has been added to %[ret] (the running total is moved into %%mm6
   with movd, updated, and moved back).*/
/*Assumes the first two rows of %[ref1] and %[ref2] are in %%mm0...%%mm3, the
|
||||
first two rows of %[src] are in %%mm4,%%mm5, and {1}x8 is in %%mm7.
|
||||
We pre-load the next two rows of data as registers become available.*/
|
||||
#define OC_SAD2_LOOP \
|
||||
"#OC_SAD2_LOOP\n\t" \
|
||||
/*We want to compute (%%mm0+%%mm1>>1) on unsigned bytes without overflow, but \
|
||||
pavgb computes (%%mm0+%%mm1+1>>1). \
|
||||
The latter is exactly 1 too large when the low bit of two corresponding \
|
||||
bytes is only set in one of them. \
|
||||
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
|
||||
correct the output of pavgb. \
|
||||
TODO: This should be rewritten to compute ~pavgb(~a,~b) instead, which \
|
||||
schedules better; currently, however, this function is unused.*/ \
|
||||
"movq %%mm0,%%mm6\n\t" \
|
||||
"lea (%[ref1],%[ystride],2),%[ref1]\n\t" \
|
||||
"pxor %%mm1,%%mm0\n\t" \
|
||||
"pavgb %%mm1,%%mm6\n\t" \
|
||||
"lea (%[ref2],%[ystride],2),%[ref2]\n\t" \
|
||||
"movq %%mm2,%%mm1\n\t" \
|
||||
"pand %%mm7,%%mm0\n\t" \
|
||||
"pavgb %%mm3,%%mm2\n\t" \
|
||||
"pxor %%mm3,%%mm1\n\t" \
|
||||
"movq (%[ref2],%[ystride]),%%mm3\n\t" \
|
||||
"psubb %%mm0,%%mm6\n\t" \
|
||||
"movq (%[ref1]),%%mm0\n\t" \
|
||||
"pand %%mm7,%%mm1\n\t" \
|
||||
"psadbw %%mm6,%%mm4\n\t" \
|
||||
"movd %[ret],%%mm6\n\t" \
|
||||
"psubb %%mm1,%%mm2\n\t" \
|
||||
"movq (%[ref2]),%%mm1\n\t" \
|
||||
"lea (%[src],%[ystride],2),%[src]\n\t" \
|
||||
"psadbw %%mm2,%%mm5\n\t" \
|
||||
"movq (%[ref1],%[ystride]),%%mm2\n\t" \
|
||||
"paddw %%mm4,%%mm5\n\t" \
|
||||
"movq (%[src]),%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"movq (%[src],%[ystride]),%%mm5\n\t" \
|
||||
"movd %%mm6,%[ret]\n\t" \
|
||||
|
||||
/*OC_SAD2_TAIL handles the last two rows of a two-reference SAD.
  It expects the same register contents as OC_SAD2_LOOP (two rows of each
   reference in %%mm0...%%mm3, two rows of the source in %%mm4,%%mm5, and the
   low-bit mask in %%mm7), applies the same pavgb/pxor/pand/psubb correction,
   and adds the resulting SAD to %[ret].
  It does not modify %[src], %[ref1], or %[ref2] and performs no loads.*/
/*Same as above, but does not pre-load the next two rows.*/
|
||||
#define OC_SAD2_TAIL \
|
||||
"#OC_SAD2_TAIL\n\t" \
|
||||
"movq %%mm0,%%mm6\n\t" \
|
||||
"pavgb %%mm1,%%mm0\n\t" \
|
||||
"pxor %%mm1,%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm1\n\t" \
|
||||
"pand %%mm7,%%mm6\n\t" \
|
||||
"pavgb %%mm3,%%mm2\n\t" \
|
||||
"pxor %%mm3,%%mm1\n\t" \
|
||||
"psubb %%mm6,%%mm0\n\t" \
|
||||
"pand %%mm7,%%mm1\n\t" \
|
||||
"psadbw %%mm0,%%mm4\n\t" \
|
||||
"psubb %%mm1,%%mm2\n\t" \
|
||||
"movd %[ret],%%mm6\n\t" \
|
||||
"psadbw %%mm2,%%mm5\n\t" \
|
||||
"paddw %%mm4,%%mm5\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"movd %%mm6,%[ret]\n\t" \
|
||||
|
||||
/*Computes the SAD between an 8x8 block at _src and the byte-wise average of
   two reference blocks.
  The average is the truncating one, (a+b)>>1, obtained by correcting pavgb as
   described in OC_SAD2_LOOP.
  The setup loads the first two rows of every block, zeroes %[ret], and builds
   {1}x8 in %%mm7 (0 minus an all-ones register, byte-wise); three
   OC_SAD2_LOOP expansions plus one OC_SAD2_TAIL then cover eight rows, two at
   a time.
  _src:     The source block.
  _ref1:    The first reference block.
  _ref2:    The second reference block.
  _ystride: The offset in bytes between consecutive rows; shared by all three
             blocks.
  _thresh:  Not referenced by this implementation.
  Return: The accumulated SAD.*/
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh){
|
||||
ptrdiff_t ret;
|
||||
__asm__ __volatile__(
|
||||
"movq (%[ref1]),%%mm0\n\t"
|
||||
"movq (%[ref2]),%%mm1\n\t"
|
||||
"movq (%[ref1],%[ystride]),%%mm2\n\t"
|
||||
"movq (%[ref2],%[ystride]),%%mm3\n\t"
|
||||
"xor %[ret],%[ret]\n\t"
|
||||
"movq (%[src]),%%mm4\n\t"
|
||||
"pxor %%mm7,%%mm7\n\t"
|
||||
"pcmpeqb %%mm6,%%mm6\n\t"
|
||||
"movq (%[src],%[ystride]),%%mm5\n\t"
|
||||
"psubb %%mm6,%%mm7\n\t"
|
||||
OC_SAD2_LOOP
|
||||
OC_SAD2_LOOP
|
||||
OC_SAD2_LOOP
|
||||
OC_SAD2_TAIL
|
||||
:[ret]"=&a"(ret),[src]"+r"(_src),[ref1]"+r"(_ref1),[ref2]"+r"(_ref2)
|
||||
:[ystride]"r"((ptrdiff_t)_ystride)
|
||||
);
|
||||
return (unsigned)ret;
|
||||
}
|
||||
|
||||
/*Implementation notes for OC_LOAD_SUB_8x4:
  Each movd fetches 4 pixels starting at byte offset _off within a row, and 8
   rows are read from each of %[src] and %[ref].
  The difference is formed without a zero register: punpcklbw ref,src yields
   the words src+(ref<<8), punpcklbw ref,ref yields ref+(ref<<8), and the psubw
   of the two leaves src-ref.
  %%mm0 is spilled to OC_MEM_OFFS(_off*2,buf) part-way through, so that the
   register can be used as a temporary, and is reloaded by the last
   instruction.
  %[src] and %[ref] are advanced by a total of 8 rows, row 7 is then read with
   the stride negated, and a final lea with the negated stride scaled by 8
   steps each pointer back to its entry value; both strides are negated a
   second time, so they also end up unchanged.
  The neg instructions modify the condition codes.*/
/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
|
||||
16-bit difference in %%mm0...%%mm7.*/
|
||||
#define OC_LOAD_SUB_8x4(_off) \
|
||||
"#OC_LOAD_SUB_8x4\n\t" \
|
||||
"movd "#_off"(%[src]),%%mm0\n\t" \
|
||||
"movd "#_off"(%[ref]),%%mm4\n\t" \
|
||||
"movd "#_off"(%[src],%[src_ystride]),%%mm1\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"movd "#_off"(%[ref],%[ref_ystride]),%%mm5\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"movd "#_off"(%[src]),%%mm2\n\t" \
|
||||
"movd "#_off"(%[ref]),%%mm7\n\t" \
|
||||
"movd "#_off"(%[src],%[src_ystride]),%%mm3\n\t" \
|
||||
"movd "#_off"(%[ref],%[ref_ystride]),%%mm6\n\t" \
|
||||
"punpcklbw %%mm4,%%mm0\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"punpcklbw %%mm4,%%mm4\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"psubw %%mm4,%%mm0\n\t" \
|
||||
"movd "#_off"(%[src]),%%mm4\n\t" \
|
||||
"movq %%mm0,"OC_MEM_OFFS(_off*2,buf)"\n\t" \
|
||||
"movd "#_off"(%[ref]),%%mm0\n\t" \
|
||||
"punpcklbw %%mm5,%%mm1\n\t" \
|
||||
"punpcklbw %%mm5,%%mm5\n\t" \
|
||||
"psubw %%mm5,%%mm1\n\t" \
|
||||
"movd "#_off"(%[src],%[src_ystride]),%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm2\n\t" \
|
||||
"punpcklbw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm7,%%mm2\n\t" \
|
||||
"movd "#_off"(%[ref],%[ref_ystride]),%%mm7\n\t" \
|
||||
"punpcklbw %%mm6,%%mm3\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"punpcklbw %%mm6,%%mm6\n\t" \
|
||||
"psubw %%mm6,%%mm3\n\t" \
|
||||
"movd "#_off"(%[src]),%%mm6\n\t" \
|
||||
"punpcklbw %%mm0,%%mm4\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"punpcklbw %%mm0,%%mm0\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"psubw %%mm0,%%mm4\n\t" \
|
||||
"movd "#_off"(%[ref]),%%mm0\n\t" \
|
||||
"punpcklbw %%mm7,%%mm5\n\t" \
|
||||
"neg %[src_ystride]\n\t" \
|
||||
"punpcklbw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm7,%%mm5\n\t" \
|
||||
"movd "#_off"(%[src],%[src_ystride]),%%mm7\n\t" \
|
||||
"punpcklbw %%mm0,%%mm6\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"punpcklbw %%mm0,%%mm0\n\t" \
|
||||
"neg %[ref_ystride]\n\t" \
|
||||
"psubw %%mm0,%%mm6\n\t" \
|
||||
"movd "#_off"(%[ref],%[ref_ystride]),%%mm0\n\t" \
|
||||
"lea (%[src],%[src_ystride],8),%[src]\n\t" \
|
||||
"punpcklbw %%mm0,%%mm7\n\t" \
|
||||
"neg %[src_ystride]\n\t" \
|
||||
"punpcklbw %%mm0,%%mm0\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],8),%[ref]\n\t" \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"neg %[ref_ystride]\n\t" \
|
||||
"movq "OC_MEM_OFFS(_off*2,buf)",%%mm0\n\t" \
|
||||
|
||||
/*Implementation notes for OC_LOAD_8x4:
  The enclosing asm statement must provide the operands %[src], %[src4],
   %[ystride], and %[ystride3]; rows 0-3 are addressed relative to %[src] and
   rows 4-7 relative to %[src4].
  Each movd fetches 4 pixels starting at byte offset _off within a row.
  Rows 0-3 are widened to 16 bits by unpacking against a zeroed %%mm7.
  Rows 4-7 are unpacked after %%mm7 has been reused to hold row 7, so each of
   those registers is instead unpacked with itself and shifted right by 8.*/
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.*/
|
||||
#define OC_LOAD_8x4(_off) \
|
||||
"#OC_LOAD_8x4\n\t" \
|
||||
"movd "#_off"(%[src]),%%mm0\n\t" \
|
||||
"movd "#_off"(%[src],%[ystride]),%%mm1\n\t" \
|
||||
"movd "#_off"(%[src],%[ystride],2),%%mm2\n\t" \
|
||||
"pxor %%mm7,%%mm7\n\t" \
|
||||
"movd "#_off"(%[src],%[ystride3]),%%mm3\n\t" \
|
||||
"punpcklbw %%mm7,%%mm0\n\t" \
|
||||
"movd "#_off"(%[src4]),%%mm4\n\t" \
|
||||
"punpcklbw %%mm7,%%mm1\n\t" \
|
||||
"movd "#_off"(%[src4],%[ystride]),%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm2\n\t" \
|
||||
"movd "#_off"(%[src4],%[ystride],2),%%mm6\n\t" \
|
||||
"punpcklbw %%mm7,%%mm3\n\t" \
|
||||
"movd "#_off"(%[src4],%[ystride3]),%%mm7\n\t" \
|
||||
"punpcklbw %%mm4,%%mm4\n\t" \
|
||||
"punpcklbw %%mm5,%%mm5\n\t" \
|
||||
"psrlw $8,%%mm4\n\t" \
|
||||
"psrlw $8,%%mm5\n\t" \
|
||||
"punpcklbw %%mm6,%%mm6\n\t" \
|
||||
"punpcklbw %%mm7,%%mm7\n\t" \
|
||||
"psrlw $8,%%mm6\n\t" \
|
||||
"psrlw $8,%%mm7\n\t" \
|
||||
|
||||
/*Implementation notes for OC_HADAMARD_AB_8x4:
  Every butterfly on a register pair (a,b) is the three-instruction sequence
   b+=a; a+=a; a-=b, which leaves the sum in b and the difference of the
   original values (a-b) in a without needing a scratch register.
  Stage A pairs register i with register i+4 (the sums land in %%mm4...%%mm7);
   stage B pairs register i with register i+2 within each half.*/
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
|
||||
The transform is performed in place, except that outputs 0-3 are swapped with
|
||||
outputs 4-7.
|
||||
Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
|
||||
perform this stage in place with no temporary registers).*/
|
||||
#define OC_HADAMARD_AB_8x4 \
|
||||
"#OC_HADAMARD_AB_8x4\n\t" \
|
||||
/*Stage A: \
|
||||
Outputs 0-3 are swapped with 4-7 here.*/ \
|
||||
"paddw %%mm1,%%mm5\n\t" \
|
||||
"paddw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"psubw %%mm5,%%mm1\n\t" \
|
||||
"psubw %%mm6,%%mm2\n\t" \
|
||||
"paddw %%mm3,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
"psubw %%mm7,%%mm3\n\t" \
|
||||
"psubw %%mm4,%%mm0\n\t" \
|
||||
/*Stage B:*/ \
|
||||
"paddw %%mm2,%%mm0\n\t" \
|
||||
"paddw %%mm3,%%mm1\n\t" \
|
||||
"paddw %%mm6,%%mm4\n\t" \
|
||||
"paddw %%mm7,%%mm5\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm0,%%mm2\n\t" \
|
||||
"psubw %%mm1,%%mm3\n\t" \
|
||||
"psubw %%mm4,%%mm6\n\t" \
|
||||
"psubw %%mm5,%%mm7\n\t" \
|
||||
|
||||
/*Implementation notes for OC_HADAMARD_C_8x4:
  The butterflies operate on the adjacent pairs (%%mm0,%%mm1), (%%mm2,%%mm3),
   (%%mm4,%%mm5), and (%%mm6,%%mm7).
  For each pair the even register receives the sum, and the odd register
   receives its own original value minus the even register's original value
   (computed as 2*odd-(even+odd)).*/
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
||||
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
||||
place with no temporary registers).*/
|
||||
#define OC_HADAMARD_C_8x4 \
|
||||
"#OC_HADAMARD_C_8x4\n\t" \
|
||||
/*Stage C:*/ \
|
||||
"paddw %%mm1,%%mm0\n\t" \
|
||||
"paddw %%mm3,%%mm2\n\t" \
|
||||
"paddw %%mm5,%%mm4\n\t" \
|
||||
"paddw %%mm7,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm0,%%mm1\n\t" \
|
||||
"psubw %%mm2,%%mm3\n\t" \
|
||||
"psubw %%mm4,%%mm5\n\t" \
|
||||
"psubw %%mm6,%%mm7\n\t" \
|
||||
|
||||
/*OC_HADAMARD_8x4 is simply OC_HADAMARD_AB_8x4 (stages A and B) followed by
   OC_HADAMARD_C_8x4 (stage C); it takes no arguments and uses no memory
   operands of its own.*/
/*Performs an 8-point 1-D Hadamard transform.
|
||||
The transform is performed in place, except that outputs 0-3 are swapped with
|
||||
outputs 4-7.
|
||||
Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
|
||||
in place with no temporary registers).*/
|
||||
#define OC_HADAMARD_8x4 \
|
||||
OC_HADAMARD_AB_8x4 \
|
||||
OC_HADAMARD_C_8x4 \
|
||||
|
||||
/*Implementation notes for OC_HADAMARD_C_ABS_ACCUM_A_8x4:
  _r6 and _r7 are passed to OC_MEM_OFFS() to select the locations in %[buf]
   where the incoming %%mm6 and %%mm7 are spilled.
  %%mm7 is then rebuilt as {0x7FFF}x4 (an all-ones register shifted right by
   one bit per word).
  The value spilled from %%mm7 is reloaded into %%mm3 by the last instruction;
   the value spilled from %%mm6 is not reloaded by this macro.
  On exit %%mm6 and %%mm1 hold the plain sums mm2+mm3 and mm4+mm5; the
   saturating add of %%mm7 has not been applied to them yet.*/
/*Performs the first part of the final stage of the Hadamard transform and
|
||||
summing of absolute values.
|
||||
At the end of this part, %%mm1 will contain the DC coefficient of the
|
||||
transform.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
|
||||
/*We use the fact that \
|
||||
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
|
||||
to merge the final butterfly with the abs and the first stage of \
|
||||
accumulation. \
|
||||
Thus we can avoid using pabsw, which is not available until SSSE3. \
|
||||
Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
|
||||
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
|
||||
registers). \
|
||||
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
|
||||
This implementation is only 26 (+4 for spilling registers).*/ \
|
||||
"#OC_HADAMARD_C_ABS_ACCUM_A_8x4\n\t" \
|
||||
"movq %%mm7,"OC_MEM_OFFS(_r7,buf)"\n\t" \
|
||||
"movq %%mm6,"OC_MEM_OFFS(_r6,buf)"\n\t" \
|
||||
/*mm7={0x7FFF}x4 \
|
||||
mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
|
||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||
"movq %%mm0,%%mm6\n\t" \
|
||||
"psrlw $1,%%mm7\n\t" \
|
||||
"paddw %%mm1,%%mm6\n\t" \
|
||||
"pmaxsw %%mm1,%%mm0\n\t" \
|
||||
"paddsw %%mm7,%%mm6\n\t" \
|
||||
"psubw %%mm6,%%mm0\n\t" \
|
||||
/*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
|
||||
mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
|
||||
"movq %%mm2,%%mm6\n\t" \
|
||||
"movq %%mm4,%%mm1\n\t" \
|
||||
"pmaxsw %%mm3,%%mm2\n\t" \
|
||||
"pmaxsw %%mm5,%%mm4\n\t" \
|
||||
"paddw %%mm3,%%mm6\n\t" \
|
||||
"paddw %%mm5,%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(_r7,buf)",%%mm3\n\t" \
|
||||
|
||||
/*Implementation notes for OC_HADAMARD_C_ABS_ACCUM_B_8x4:
  This continues from the register state left by
   OC_HADAMARD_C_ABS_ACCUM_A_8x4: it expects {0x7FFF}x4 in %%mm7 and the
   pending sums in %%mm6 and %%mm1.
  The value previously spilled at OC_MEM_OFFS(_r6,buf) is reloaded into %%mm5;
   _r7 is accepted but not referenced here.
  On exit the accumulated result is in %%mm0, and %%mm7 has been shifted right
   by 14 bits per word, turning {0x7FFF}x4 into {1}x4.*/
/*Performs the second part of the final stage of the Hadamard transform and
|
||||
summing of absolute values.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
|
||||
"#OC_HADAMARD_C_ABS_ACCUM_B_8x4\n\t" \
|
||||
"paddsw %%mm7,%%mm6\n\t" \
|
||||
"movq "OC_MEM_OFFS(_r6,buf)",%%mm5\n\t" \
|
||||
"paddsw %%mm7,%%mm1\n\t" \
|
||||
"psubw %%mm6,%%mm2\n\t" \
|
||||
"psubw %%mm1,%%mm4\n\t" \
|
||||
/*mm7={1}x4 (needed for the horizontal add that follows) \
|
||||
mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
|
||||
"movq %%mm3,%%mm6\n\t" \
|
||||
"pmaxsw %%mm5,%%mm3\n\t" \
|
||||
"paddw %%mm2,%%mm0\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"paddw %%mm4,%%mm0\n\t" \
|
||||
"paddsw %%mm7,%%mm6\n\t" \
|
||||
"paddw %%mm3,%%mm0\n\t" \
|
||||
"psrlw $14,%%mm7\n\t" \
|
||||
"psubw %%mm6,%%mm0\n\t" \
|
||||
|
||||
/*OC_HADAMARD_C_ABS_ACCUM_8x4 expands to OC_HADAMARD_C_ABS_ACCUM_A_8x4
   immediately followed by OC_HADAMARD_C_ABS_ACCUM_B_8x4, forwarding the same
   _r6 and _r7 arguments to both halves.*/
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
|
||||
absolute value of each component, and accumulates everything into mm0.
|
||||
This is the only portion of SATD which requires MMXEXT (we could use plain
|
||||
MMX, but it takes 4 instructions and an extra register to work around the
|
||||
lack of a pmaxsw, which is a pretty serious penalty).*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
|
||||
|
||||
/*OC_HADAMARD_ABS_ACCUM_8x4 expands to OC_HADAMARD_AB_8x4 followed by
   OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7); _r6 and _r7 are forwarded unchanged
   and select the spill locations in %[buf] used by the final stage.*/
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
|
||||
component, and accumulates everything into mm0.
|
||||
Note that mm0 will have an extra 4 added to each column, and that after
|
||||
removing this value, the remainder will be half the conventional value.*/
|
||||
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) \
|
||||
OC_HADAMARD_AB_8x4 \
|
||||
OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7)
|
||||
|
||||
/*Implementation notes for OC_TRANSPOSE_4x4x2:
  OC_MEM_OFFS(0x10+(_off),buf) is used as a temporary: %%mm5 is saved there
   before the first transpose (which needs it as scratch) and reloaded before
   the second.
  The four results of the first transpose are stored to
   OC_MEM_OFFS(0x40+(_off),buf) through OC_MEM_OFFS(0x70+(_off),buf); three of
   those stores are interleaved with the instructions of the second transpose.
  %%mm0 is used as scratch by the second transpose, so %%mm0...%%mm3 do not
   hold useful values on exit.*/
/*Performs two 4x4 transposes (mostly) in place.
|
||||
On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
|
||||
contains rows {a,b,c,d}.
|
||||
On output, {0x40,0x50,0x60,0x70}+_off(%[buf]) contains {e,f,g,h}^T, and
|
||||
{mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.*/
|
||||
#define OC_TRANSPOSE_4x4x2(_off) \
|
||||
"#OC_TRANSPOSE_4x4x2\n\t" \
|
||||
/*First 4x4 transpose:*/ \
|
||||
"movq %%mm5,"OC_MEM_OFFS(0x10+(_off),buf)"\n\t" \
|
||||
/*mm0 = e3 e2 e1 e0 \
|
||||
mm1 = f3 f2 f1 f0 \
|
||||
mm2 = g3 g2 g1 g0 \
|
||||
mm3 = h3 h2 h1 h0*/ \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklwd %%mm3,%%mm2\n\t" \
|
||||
"punpckhwd %%mm3,%%mm5\n\t" \
|
||||
"movq %%mm0,%%mm3\n\t" \
|
||||
"punpcklwd %%mm1,%%mm0\n\t" \
|
||||
"punpckhwd %%mm1,%%mm3\n\t" \
|
||||
/*mm0 = f1 e1 f0 e0 \
|
||||
mm3 = f3 e3 f2 e2 \
|
||||
mm2 = h1 g1 h0 g0 \
|
||||
mm5 = h3 g3 h2 g2*/ \
|
||||
"movq %%mm0,%%mm1\n\t" \
|
||||
"punpckldq %%mm2,%%mm0\n\t" \
|
||||
"punpckhdq %%mm2,%%mm1\n\t" \
|
||||
"movq %%mm3,%%mm2\n\t" \
|
||||
"punpckhdq %%mm5,%%mm3\n\t" \
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x40+(_off),buf)"\n\t" \
|
||||
"punpckldq %%mm5,%%mm2\n\t" \
|
||||
/*mm0 = h0 g0 f0 e0 \
|
||||
mm1 = h1 g1 f1 e1 \
|
||||
mm2 = h2 g2 f2 e2 \
|
||||
mm3 = h3 g3 f3 e3*/ \
|
||||
"movq "OC_MEM_OFFS(0x10+(_off),buf)",%%mm5\n\t" \
|
||||
/*Second 4x4 transpose:*/ \
|
||||
/*mm4 = a3 a2 a1 a0 \
|
||||
mm5 = b3 b2 b1 b0 \
|
||||
mm6 = c3 c2 c1 c0 \
|
||||
mm7 = d3 d2 d1 d0*/ \
|
||||
"movq %%mm6,%%mm0\n\t" \
|
||||
"punpcklwd %%mm7,%%mm6\n\t" \
|
||||
"movq %%mm1,"OC_MEM_OFFS(0x50+(_off),buf)"\n\t" \
|
||||
"punpckhwd %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm4,%%mm7\n\t" \
|
||||
"punpcklwd %%mm5,%%mm4\n\t" \
|
||||
"movq %%mm2,"OC_MEM_OFFS(0x60+(_off),buf)"\n\t" \
|
||||
"punpckhwd %%mm5,%%mm7\n\t" \
|
||||
/*mm4 = b1 a1 b0 a0 \
|
||||
mm7 = b3 a3 b2 a2 \
|
||||
mm6 = d1 c1 d0 c0 \
|
||||
mm0 = d3 c3 d2 c2*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"punpckldq %%mm6,%%mm4\n\t" \
|
||||
"movq %%mm3,"OC_MEM_OFFS(0x70+(_off),buf)"\n\t" \
|
||||
"punpckhdq %%mm6,%%mm5\n\t" \
|
||||
"movq %%mm7,%%mm6\n\t" \
|
||||
"punpckhdq %%mm0,%%mm7\n\t" \
|
||||
"punpckldq %%mm0,%%mm6\n\t" \
|
||||
/*mm4 = d0 c0 b0 a0 \
|
||||
mm5 = d1 c1 b1 a1 \
|
||||
mm6 = d2 c2 b2 a2 \
|
||||
mm7 = d3 c3 b3 a3*/ \
|
||||
|
||||
/*Computes a Hadamard-transform-based sum of absolute values (SATD) of the
   difference between two 8x8 blocks, with an independent row stride for each
   block.
  The block is processed as two 8x4 halves (byte offsets 0 and 4): each half
   is loaded and differenced, transformed with an 8-point Hadamard, and
   transposed through buf; the second pass of 8-point transforms is then
   merged with the absolute-value accumulation.
  _dc:          Receives the low 16 bits of %%mm1 captured in the middle of the
                 final transform stage, sign-extended to int (the DC
                 coefficient, according to the comments in the body).
  _src:         The source block.
  _src_ystride: The offset in bytes between rows of _src.
  _ref:         The reference block.
  _ref_ystride: The offset in bytes between rows of _ref.
  Return: The value assembled in %[ret]: the two accumulated partial sums with
           the scaling, bias, and abs(dc) corrections described in the comments
           in the body applied.*/
static unsigned oc_int_frag_satd_mmxext(int *_dc,
|
||||
const unsigned char *_src,int _src_ystride,
|
||||
const unsigned char *_ref,int _ref_ystride){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
unsigned ret;
|
||||
unsigned ret2;
|
||||
int dc;
|
||||
__asm__ __volatile__(
|
||||
OC_LOAD_SUB_8x4(0x00)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x00)
|
||||
/*Finish swapping out this 8x4 block to make room for the next one.
|
||||
mm0...mm3 have been swapped out already.*/
|
||||
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
|
||||
"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
|
||||
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
|
||||
"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
|
||||
OC_LOAD_SUB_8x4(0x04)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x08)
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place, so
|
||||
we only have to do half the loads.*/
|
||||
"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
|
||||
"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
|
||||
"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
|
||||
"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
|
||||
/*We split out the stages here so we can save the DC coefficient in the
|
||||
middle.*/
|
||||
OC_HADAMARD_AB_8x4
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||
"movd %%mm1,%[dc]\n\t"
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||
Now we finally have to promote things to dwords.
|
||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||
latency of pmaddwd by starting the next series of loads now.*/
|
||||
"pmaddwd %%mm7,%%mm0\n\t"
|
||||
"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
|
||||
"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
|
||||
"movq %%mm0,%%mm4\n\t"
|
||||
"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
|
||||
"punpckhdq %%mm0,%%mm0\n\t"
|
||||
"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
|
||||
"paddd %%mm0,%%mm4\n\t"
|
||||
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
|
||||
"movd %%mm4,%[ret2]\n\t"
|
||||
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
|
||||
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
|
||||
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
|
||||
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||
"pmaddwd %%mm7,%%mm0\n\t"
|
||||
/*Subtract abs(dc) from 2*ret2.*/
|
||||
"movsx %w[dc],%[dc]\n\t"
|
||||
"cdq\n\t"
|
||||
"lea (%[ret],%[ret2],2),%[ret2]\n\t"
|
||||
"movq %%mm0,%%mm4\n\t"
|
||||
"punpckhdq %%mm0,%%mm0\n\t"
|
||||
"xor %[dc],%[ret]\n\t"
|
||||
"paddd %%mm0,%%mm4\n\t"
|
||||
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
|
||||
added to them, a factor of two removed, and the DC value included;
|
||||
correct the final sum here.*/
|
||||
"sub %[ret],%[ret2]\n\t"
|
||||
"movd %%mm4,%[ret]\n\t"
|
||||
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
|
||||
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
|
||||
and %[ret2] with some of the inputs, since for once we don't write to
|
||||
them until after we're done using everything but %[buf].*/
|
||||
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
|
||||
constraints, otherwise if gcc can prove they're equal it will allocate
|
||||
them to the same register (which is bad); _src and _ref face a similar
|
||||
problem, though those are never actually the same.*/
|
||||
:[ret]"=d"(ret),[ret2]"=r"(ret2),[dc]"=a"(dc),
|
||||
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
|
||||
:[src]"r"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
|
||||
[ref]"r"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
|
||||
/*We have to use neg, so we actually clobber the condition codes for once
|
||||
(not to mention cmp, sub, and add).*/
|
||||
:"cc"
|
||||
);
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*SATD for the case where the source and reference blocks share one row
   stride.
  This forwards to oc_int_frag_satd_mmxext(), supplying _ystride as both the
   source stride and the reference stride.
  _dc:      Receives the DC value reported by oc_int_frag_satd_mmxext().
  _src:     The source block.
  _ref:     The reference block.
  _ystride: The offset in bytes between rows of both blocks.
  Return: The SATD reported by oc_int_frag_satd_mmxext().*/
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
|
||||
|
||||
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().
  Writes 8 rows of 8 bytes to _dst, each one the byte-wise average of the
   corresponding rows of _src1 and _src2.
  pavgb rounds its result up, so the low bit of (a^b) is subtracted from each
   average again, which makes the final result round down.
  _dst:         The destination block.
  _dst_ystride: The offset in bytes between two rows of _dst.
  _src1:        The first source block.
  _src2:        The second source block.
  _src_ystride: The offset in bytes between two rows of both _src1 and
                 _src2.*/
|
||||
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
|
||||
__asm__ __volatile__(
|
||||
/*Load the first 3 rows.*/
|
||||
"movq (%[src1]),%%mm0\n\t"
|
||||
"movq (%[src2]),%%mm1\n\t"
|
||||
"movq (%[src1],%[src_ystride]),%%mm2\n\t"
|
||||
"lea (%[src1],%[src_ystride],2),%[src1]\n\t"
|
||||
"movq (%[src2],%[src_ystride]),%%mm3\n\t"
|
||||
"lea (%[src2],%[src_ystride],2),%[src2]\n\t"
|
||||
"pxor %%mm7,%%mm7\n\t"
|
||||
"movq (%[src1]),%%mm4\n\t"
|
||||
"pcmpeqb %%mm6,%%mm6\n\t"
|
||||
"movq (%[src2]),%%mm5\n\t"
|
||||
/*mm7={1}x8.*/
|
||||
"psubb %%mm6,%%mm7\n\t"
|
||||
/*Start averaging %%mm0 and %%mm1 into %%mm6.*/
|
||||
"movq %%mm0,%%mm6\n\t"
|
||||
"pxor %%mm1,%%mm0\n\t"
|
||||
"pavgb %%mm1,%%mm6\n\t"
|
||||
/*%%mm1 is free, start averaging %%mm3 into %%mm2 using %%mm1.*/
|
||||
"movq %%mm2,%%mm1\n\t"
|
||||
/*%%mm0=(a^b)&1 for row 0: the amount by which pavgb rounded up.*/
"pand %%mm7,%%mm0\n\t"
|
||||
"pavgb %%mm3,%%mm2\n\t"
|
||||
"pxor %%mm3,%%mm1\n\t"
|
||||
/*%%mm3 is free.*/
/*Remove the round-up so that %%mm6 is the average rounded down.
  The same pxor/pand/psubb pattern is repeated for every row below.*/
|
||||
"psubb %%mm0,%%mm6\n\t"
|
||||
/*%%mm0 is free, start loading the next row.*/
|
||||
"movq (%[src1],%[src_ystride]),%%mm0\n\t"
|
||||
/*Start averaging %%mm5 and %%mm4 using %%mm3.*/
|
||||
"movq %%mm4,%%mm3\n\t"
|
||||
/*%%mm6 (row 0) is done; write it out.*/
|
||||
"movq %%mm6,(%[dst])\n\t"
|
||||
"pand %%mm7,%%mm1\n\t"
|
||||
"pavgb %%mm5,%%mm4\n\t"
|
||||
"psubb %%mm1,%%mm2\n\t"
|
||||
/*%%mm1 is free, continue loading the next row.*/
|
||||
"movq (%[src2],%[src_ystride]),%%mm1\n\t"
|
||||
"pxor %%mm5,%%mm3\n\t"
|
||||
"lea (%[src1],%[src_ystride],2),%[src1]\n\t"
|
||||
/*%%mm2 (row 1) is done; write it out.*/
|
||||
"movq %%mm2,(%[dst],%[dst_ystride])\n\t"
|
||||
"pand %%mm7,%%mm3\n\t"
|
||||
/*Start loading the next row.*/
|
||||
"movq (%[src1]),%%mm2\n\t"
|
||||
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
|
||||
"psubb %%mm3,%%mm4\n\t"
|
||||
"lea (%[src2],%[src_ystride],2),%[src2]\n\t"
|
||||
/*%%mm4 (row 2) is done; write it out.*/
|
||||
"movq %%mm4,(%[dst])\n\t"
|
||||
/*Continue loading the next row.*/
|
||||
"movq (%[src2]),%%mm3\n\t"
|
||||
/*Start averaging %%mm0 and %%mm1 into %%mm6.*/
|
||||
"movq %%mm0,%%mm6\n\t"
|
||||
"pxor %%mm1,%%mm0\n\t"
|
||||
/*Start loading the next row.*/
|
||||
"movq (%[src1],%[src_ystride]),%%mm4\n\t"
|
||||
"pavgb %%mm1,%%mm6\n\t"
|
||||
/*%%mm1 is free; start averaging %%mm3 into %%mm2 using %%mm1.*/
|
||||
"movq %%mm2,%%mm1\n\t"
|
||||
"pand %%mm7,%%mm0\n\t"
|
||||
/*Continue loading the next row.*/
|
||||
"movq (%[src2],%[src_ystride]),%%mm5\n\t"
|
||||
"pavgb %%mm3,%%mm2\n\t"
|
||||
"lea (%[src1],%[src_ystride],2),%[src1]\n\t"
|
||||
"pxor %%mm3,%%mm1\n\t"
|
||||
/*%%mm3 is free.*/
|
||||
"psubb %%mm0,%%mm6\n\t"
|
||||
/*%%mm0 is free, start loading the next row.*/
|
||||
"movq (%[src1]),%%mm0\n\t"
|
||||
/*Start averaging %%mm5 into %%mm4 using %%mm3.*/
|
||||
"movq %%mm4,%%mm3\n\t"
|
||||
/*%%mm6 (row 3) is done; write it out.*/
|
||||
"movq %%mm6,(%[dst],%[dst_ystride])\n\t"
|
||||
"pand %%mm7,%%mm1\n\t"
|
||||
"lea (%[src2],%[src_ystride],2),%[src2]\n\t"
|
||||
"pavgb %%mm5,%%mm4\n\t"
|
||||
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
|
||||
"psubb %%mm1,%%mm2\n\t"
|
||||
/*%%mm1 is free; continue loading the next row.*/
|
||||
"movq (%[src2]),%%mm1\n\t"
|
||||
"pxor %%mm5,%%mm3\n\t"
|
||||
/*%%mm2 (row 4) is done; write it out.*/
|
||||
"movq %%mm2,(%[dst])\n\t"
|
||||
"pand %%mm7,%%mm3\n\t"
|
||||
/*Start loading the next row.*/
|
||||
"movq (%[src1],%[src_ystride]),%%mm2\n\t"
|
||||
"psubb %%mm3,%%mm4\n\t"
|
||||
/*Start averaging %%mm0 and %%mm1 into %%mm6.*/
|
||||
"movq %%mm0,%%mm6\n\t"
|
||||
/*Continue loading the next row.*/
|
||||
"movq (%[src2],%[src_ystride]),%%mm3\n\t"
|
||||
/*%%mm4 (row 5) is done; write it out.*/
|
||||
"movq %%mm4,(%[dst],%[dst_ystride])\n\t"
|
||||
"pxor %%mm1,%%mm0\n\t"
|
||||
"pavgb %%mm1,%%mm6\n\t"
|
||||
/*%%mm4 is free; start averaging %%mm3 into %%mm2 using %%mm4.*/
|
||||
"movq %%mm2,%%mm4\n\t"
|
||||
"pand %%mm7,%%mm0\n\t"
|
||||
"pavgb %%mm3,%%mm2\n\t"
|
||||
"pxor %%mm3,%%mm4\n\t"
|
||||
"lea (%[dst],%[dst_ystride],2),%[dst]\n\t"
|
||||
"psubb %%mm0,%%mm6\n\t"
|
||||
"pand %%mm7,%%mm4\n\t"
|
||||
/*%%mm6 (row 6) is done, write it out.*/
|
||||
"movq %%mm6,(%[dst])\n\t"
|
||||
"psubb %%mm4,%%mm2\n\t"
|
||||
/*%%mm2 (row 7) is done, write it out.*/
|
||||
"movq %%mm2,(%[dst],%[dst_ystride])\n\t"
|
||||
/*All three pointers are advanced with lea inside the asm, so they are
   read-write operands.*/
:[dst]"+r"(_dst),[src1]"+r"(_src1),[src2]"+r"(_src2)
|
||||
:[dst_ystride]"r"((ptrdiff_t)_dst_ystride),
|
||||
[src_ystride]"r"((ptrdiff_t)_src_ystride)
|
||||
/*The loads and stores go through pointers that are not described by any
   memory operand, hence the blanket clobber.*/
:"memory"
|
||||
);
|
||||
}
|
||||
|
||||
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||
OC_ALIGN8(unsigned char ref[64]);
|
||||
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
|
||||
}
|
||||
|
||||
/*SATD of a fragment taken directly from the source pixels (no reference is
   subtracted), built from the OC_LOAD_8x4/OC_HADAMARD_* helper macros.
  _dc:      Receives the low 16 bits of the DC term that is captured in the
             middle of the last transform stage, zero-extended.
  _src:     The source pixels; rows 4...7 are addressed via _src+4*_ystride.
  _ystride: The offset in bytes between two source rows.
  Return: 2*(sum of the first accumulation)-dc+2*(sum of the second
           accumulation)-64, as assembled by the scalar instructions at the
           end of the asm block.*/
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
|
||||
const unsigned char *_src,int _ystride){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
unsigned ret;
|
||||
unsigned ret2;
|
||||
int dc;
|
||||
__asm__ __volatile__(
|
||||
OC_LOAD_8x4(0x00)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x00)
|
||||
/*Finish swapping out this 8x4 block to make room for the next one.
|
||||
mm0...mm3 have been swapped out already.*/
|
||||
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
|
||||
"movq %%mm5,"OC_MEM_OFFS(0x10,buf)"\n\t"
|
||||
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
|
||||
"movq %%mm7,"OC_MEM_OFFS(0x30,buf)"\n\t"
|
||||
OC_LOAD_8x4(0x04)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x08)
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place, so
|
||||
we only have to do half the loads.*/
|
||||
"movq "OC_MEM_OFFS(0x10,buf)",%%mm1\n\t"
|
||||
"movq "OC_MEM_OFFS(0x20,buf)",%%mm2\n\t"
|
||||
"movq "OC_MEM_OFFS(0x30,buf)",%%mm3\n\t"
|
||||
"movq "OC_MEM_OFFS(0x00,buf)",%%mm0\n\t"
|
||||
/*We split out the stages here so we can save the DC coefficient in the
|
||||
middle.*/
|
||||
OC_HADAMARD_AB_8x4
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||
"movd %%mm1,%[dc]\n\t"
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||
Now we finally have to promote things to dwords.
|
||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||
latency of pmaddwd by starting the next series of loads now.*/
|
||||
"pmaddwd %%mm7,%%mm0\n\t"
|
||||
"movq "OC_MEM_OFFS(0x50,buf)",%%mm1\n\t"
|
||||
"movq "OC_MEM_OFFS(0x58,buf)",%%mm5\n\t"
|
||||
"movq "OC_MEM_OFFS(0x60,buf)",%%mm2\n\t"
|
||||
"movq %%mm0,%%mm4\n\t"
|
||||
"movq "OC_MEM_OFFS(0x68,buf)",%%mm6\n\t"
|
||||
"punpckhdq %%mm0,%%mm0\n\t"
|
||||
"movq "OC_MEM_OFFS(0x70,buf)",%%mm3\n\t"
|
||||
"paddd %%mm0,%%mm4\n\t"
|
||||
"movq "OC_MEM_OFFS(0x78,buf)",%%mm7\n\t"
|
||||
/*%[ret] now holds the dword sum of the first accumulation.*/
"movd %%mm4,%[ret]\n\t"
|
||||
"movq "OC_MEM_OFFS(0x40,buf)",%%mm0\n\t"
|
||||
"movq "OC_MEM_OFFS(0x48,buf)",%%mm4\n\t"
|
||||
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||
"pmaddwd %%mm7,%%mm0\n\t"
|
||||
/*We assume that the DC coefficient is always positive (which is true,
|
||||
because the input to the INTRA transform was not a difference).*/
|
||||
"movzx %w[dc],%[dc]\n\t"
|
||||
"add %[ret],%[ret]\n\t"
|
||||
"sub %[dc],%[ret]\n\t"
|
||||
"movq %%mm0,%%mm4\n\t"
|
||||
"punpckhdq %%mm0,%%mm0\n\t"
|
||||
"paddd %%mm0,%%mm4\n\t"
|
||||
"movd %%mm4,%[ret2]\n\t"
|
||||
/*ret=(2*first sum-dc)+2*second sum-64.*/
"lea -64(%[ret],%[ret2],2),%[ret]\n\t"
|
||||
/*Although it looks like we're using 8 registers here, gcc can alias %[ret]
|
||||
and %[ret2] with some of the inputs, since for once we don't write to
|
||||
them until after we're done using everything but %[buf] (which is also
|
||||
listed as an output to ensure gcc _doesn't_ alias them against it).*/
|
||||
:[ret]"=a"(ret),[ret2]"=r"(ret2),[dc]"=r"(dc),
|
||||
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
|
||||
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
|
||||
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
|
||||
/*We have to use sub, so we actually clobber the condition codes for once
|
||||
(not to mention add).*/
|
||||
:"cc"
|
||||
);
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*Computes the residual between an 8x8 block of source pixels and its
   reference.
  Each byte is zero-extended to 16 bits before the subtraction, so
   _residue[8*y+x]=_src[y*_ystride+x]-_ref[y*_ystride+x] is stored as a signed
   16-bit value.
  _residue: The 64 output values, in row-major order.
  _src:     The source pixels.
  _ref:     The reference pixels.
  _ystride: The offset in bytes between two rows of both _src and _ref.*/
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
 const unsigned char *_src,const unsigned char *_ref,int _ystride){
  int i;
  /*Each iteration handles two rows.*/
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*mm7=0 (used to zero-extend bytes to words).
        This is done inside the same asm statement that consumes it: the
         compiler is free to schedule code between two separate asm
         statements and knows nothing about MMX state carried from one to the
         next, so each statement must be self-contained.*/
      "pxor %%mm7,%%mm7\n\t"
      /*mm0=[src]*/
      "movq (%[src]),%%mm0\n\t"
      /*mm1=[ref]*/
      "movq (%[ref]),%%mm1\n\t"
      /*mm4=[src+ystride]*/
      "movq (%[src],%[ystride]),%%mm4\n\t"
      /*mm5=[ref+ystride]*/
      "movq (%[ref],%[ystride]),%%mm5\n\t"
      /*Compute [src]-[ref].*/
      "movq %%mm0,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm0\n\t"
      "movq %%mm1,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm2\n\t"
      "punpcklbw %%mm7,%%mm1\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm1,%%mm0\n\t"
      "psubw %%mm3,%%mm2\n\t"
      /*Compute [src+ystride]-[ref+ystride].*/
      "movq %%mm4,%%mm1\n\t"
      "punpcklbw %%mm7,%%mm4\n\t"
      "movq %%mm5,%%mm3\n\t"
      "punpckhbw %%mm7,%%mm1\n\t"
      "lea (%[src],%[ystride],2),%[src]\n\t"
      "punpcklbw %%mm7,%%mm5\n\t"
      "lea (%[ref],%[ystride],2),%[ref]\n\t"
      "punpckhbw %%mm7,%%mm3\n\t"
      "psubw %%mm5,%%mm4\n\t"
      "psubw %%mm3,%%mm1\n\t"
      /*Write the answer out.*/
      "movq %%mm0,0x00(%[residue])\n\t"
      "movq %%mm2,0x08(%[residue])\n\t"
      "movq %%mm4,0x10(%[residue])\n\t"
      "movq %%mm1,0x18(%[residue])\n\t"
      /*Advance past the two rows (2*8 words=0x20 bytes) just written.*/
      "lea 0x20(%[residue]),%[residue]\n\t"
      :[residue]"+r"(_residue),[src]"+r"(_src),[ref]"+r"(_ref)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
|
||||
|
||||
/*Computes _src-128 for an 8x8 block of pixels.
  Each byte is zero-extended to 16 bits and has 128 subtracted from it; the 8
   results for row y are stored at byte offset 0x10*y of _residue.
  _residue: The 64 output values, in row-major order.
  _src:     The source pixels.
  _ystride: The offset in bytes between two source rows.*/
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
|
||||
const unsigned char *_src,int _ystride){
|
||||
/*Scratch register for 3*_ystride; it is only written inside the asm.*/
ptrdiff_t ystride3;
|
||||
__asm__ __volatile__(
|
||||
/*mm0=[src]*/
|
||||
"movq (%[src]),%%mm0\n\t"
|
||||
/*mm1=[src+ystride]*/
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t"
|
||||
/*mm6={-1}x4*/
|
||||
"pcmpeqw %%mm6,%%mm6\n\t"
|
||||
/*mm2=[src+2*ystride]*/
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t"
|
||||
/*[ystride3]=3*[ystride]*/
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t"
|
||||
/*mm6={0x8000}x4*/
|
||||
"psllw $15,%%mm6\n\t"
|
||||
/*mm3=[src+3*ystride]*/
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t"
|
||||
/*mm6={128}x4*/
|
||||
"psrlw $8,%%mm6\n\t"
|
||||
/*mm7=0*/
|
||||
"pxor %%mm7,%%mm7\n\t"
|
||||
/*[src]=[src]+4*[ystride]*/
|
||||
"lea (%[src],%[ystride],4),%[src]\n\t"
|
||||
/*Compute [src]-128 and [src+ystride]-128*/
|
||||
"movq %%mm0,%%mm4\n\t"
|
||||
"punpcklbw %%mm7,%%mm0\n\t"
|
||||
"movq %%mm1,%%mm5\n\t"
|
||||
"punpckhbw %%mm7,%%mm4\n\t"
|
||||
"psubw %%mm6,%%mm0\n\t"
|
||||
"punpcklbw %%mm7,%%mm1\n\t"
|
||||
"psubw %%mm6,%%mm4\n\t"
|
||||
"punpckhbw %%mm7,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"psubw %%mm6,%%mm5\n\t"
|
||||
/*Write the answer out.*/
|
||||
"movq %%mm0,0x00(%[residue])\n\t"
|
||||
"movq %%mm4,0x08(%[residue])\n\t"
|
||||
"movq %%mm1,0x10(%[residue])\n\t"
|
||||
"movq %%mm5,0x18(%[residue])\n\t"
|
||||
/*mm0=[src+4*ystride]*/
|
||||
"movq (%[src]),%%mm0\n\t"
|
||||
/*mm1=[src+5*ystride]*/
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t"
|
||||
/*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
|
||||
"movq %%mm2,%%mm4\n\t"
|
||||
"punpcklbw %%mm7,%%mm2\n\t"
|
||||
"movq %%mm3,%%mm5\n\t"
|
||||
"punpckhbw %%mm7,%%mm4\n\t"
|
||||
"psubw %%mm6,%%mm2\n\t"
|
||||
"punpcklbw %%mm7,%%mm3\n\t"
|
||||
"psubw %%mm6,%%mm4\n\t"
|
||||
"punpckhbw %%mm7,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm3\n\t"
|
||||
"psubw %%mm6,%%mm5\n\t"
|
||||
/*Write the answer out.*/
|
||||
"movq %%mm2,0x20(%[residue])\n\t"
|
||||
"movq %%mm4,0x28(%[residue])\n\t"
|
||||
"movq %%mm3,0x30(%[residue])\n\t"
|
||||
"movq %%mm5,0x38(%[residue])\n\t"
|
||||
/*mm2=[src+6*ystride]*/
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t"
|
||||
/*mm3=[src+7*ystride]*/
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t"
|
||||
/*Compute [src+4*ystride]-128 and [src+5*ystride]-128*/
|
||||
"movq %%mm0,%%mm4\n\t"
|
||||
"punpcklbw %%mm7,%%mm0\n\t"
|
||||
"movq %%mm1,%%mm5\n\t"
|
||||
"punpckhbw %%mm7,%%mm4\n\t"
|
||||
"psubw %%mm6,%%mm0\n\t"
|
||||
"punpcklbw %%mm7,%%mm1\n\t"
|
||||
"psubw %%mm6,%%mm4\n\t"
|
||||
"punpckhbw %%mm7,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"psubw %%mm6,%%mm5\n\t"
|
||||
/*Write the answer out.*/
|
||||
"movq %%mm0,0x40(%[residue])\n\t"
|
||||
"movq %%mm4,0x48(%[residue])\n\t"
|
||||
"movq %%mm1,0x50(%[residue])\n\t"
|
||||
"movq %%mm5,0x58(%[residue])\n\t"
|
||||
/*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
|
||||
"movq %%mm2,%%mm4\n\t"
|
||||
"punpcklbw %%mm7,%%mm2\n\t"
|
||||
"movq %%mm3,%%mm5\n\t"
|
||||
"punpckhbw %%mm7,%%mm4\n\t"
|
||||
"psubw %%mm6,%%mm2\n\t"
|
||||
"punpcklbw %%mm7,%%mm3\n\t"
|
||||
"psubw %%mm6,%%mm4\n\t"
|
||||
"punpckhbw %%mm7,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm3\n\t"
|
||||
"psubw %%mm6,%%mm5\n\t"
|
||||
/*Write the answer out.*/
|
||||
"movq %%mm2,0x60(%[residue])\n\t"
|
||||
"movq %%mm4,0x68(%[residue])\n\t"
|
||||
"movq %%mm3,0x70(%[residue])\n\t"
|
||||
"movq %%mm5,0x78(%[residue])\n\t"
|
||||
/*%[ystride3] is written while the inputs are still needed, so it is marked
   early-clobber to keep it out of the input registers.*/
:[src]"+r"(_src),[ystride3]"=&r"(ystride3)
|
||||
:[residue]"r"(_residue),[ystride]"r"((ptrdiff_t)_ystride)
|
||||
:"memory"
|
||||
);
|
||||
}
|
||||
|
||||
/*Public frag_copy2 entry point.
  The destination and both sources use the same row stride here, so _ystride
   is passed to oc_int_frag_copy2_mmxext() as both the destination stride and
   the source stride.*/
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  const int shared_ystride=_ystride;
  oc_int_frag_copy2_mmxext(_dst,shared_ystride,_src1,_src2,shared_ystride);
}
|
||||
|
||||
#endif
|
||||
678
engine/thirdparty/libtheora/x86/mmxfdct.c
vendored
Normal file
678
engine/thirdparty/libtheora/x86/mmxfdct.c
vendored
Normal file
|
|
@ -0,0 +1,678 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************/
|
||||
/*MMX fDCT implementation for x86_32*/
|
||||
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
||||
#include "x86enc.h"
|
||||
#include "x86zigzag.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Stage 1 of the fDCT butterflies, applied to the packed words of mm0...mm7.
  On input, mm0...mm7 hold t0...t7.
  On output, mm0=t0-t7, mm1=t1-t6, mm2=t2-t5, mm3=t3-t4, mm7=t0+t7, mm6=t1+t6,
   mm5=t2+t5, and mm4=t3+t4.
  Each sum is rebuilt from the difference as (a-b)+2*b, so no extra register
   is needed.*/
# define OC_FDCT_STAGE1_8x4 \
|
||||
"#OC_FDCT_STAGE1_8x4\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*mm0=t7'=t0-t7*/ \
|
||||
"psubw %%mm7,%%mm0\n\t" \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*mm1=t6'=t1-t6*/ \
|
||||
"psubw %%mm6,%%mm1\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
/*mm2=t5'=t2-t5*/ \
|
||||
"psubw %%mm5,%%mm2\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*mm3=t4'=t3-t4*/ \
|
||||
"psubw %%mm4,%%mm3\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
/*mm7=t0'=t0+t7*/ \
|
||||
"paddw %%mm0,%%mm7\n\t" \
|
||||
/*mm6=t1'=t1+t6*/ \
|
||||
"paddw %%mm1,%%mm6\n\t" \
|
||||
/*mm5=t2'=t2+t5*/ \
|
||||
"paddw %%mm2,%%mm5\n\t" \
|
||||
/*mm4=t3'=t3+t4*/ \
|
||||
"paddw %%mm3,%%mm4\n\t" \
|
||||
|
||||
/*Stages 2 through 4 of the fDCT, applied to the packed words of mm0...mm7.
  On input, the registers must hold the stage 1 outputs in the layout that
   OC_FDCT_STAGE1_8x4 produces (mm0=t7', mm1=t6', mm2=t5', mm3=t4', mm4=t3',
   mm5=t2', mm6=t1', mm7=t0').
  _r0..._r7 are string literals giving displacements from %[y]; these slots
   are used as scratch space during the transform and to hold results.
  The enclosing asm statement must provide two operands: %[y], the base
   address for those slots, and %[a], a general-purpose register that is
   overwritten with the constants loaded by the "mov $imm,%[a]" instructions.
  On output, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], and
   mm1=_y[7]; _y[0], _y[1], _y[3], _y[4], _y[5] and _y[7] have also been
   stored to the slots _r0, _r1, _r3, _r4, _r5 and _r7, respectively.*/
# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
|
||||
"#OC_FDCT8x4\n\t" \
|
||||
/*Stage 2:*/ \
|
||||
/*mm7=t3''=t0'-t3'*/ \
|
||||
"psubw %%mm4,%%mm7\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
/*mm6=t2''=t1'-t2'*/ \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
"movq %%mm7,"_r6"(%[y])\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*mm1=t5''=t6'-t5'*/ \
|
||||
"psubw %%mm2,%%mm1\n\t" \
|
||||
"movq %%mm6,"_r2"(%[y])\n\t" \
|
||||
/*mm4=t0''=t0'+t3'*/ \
|
||||
"paddw %%mm7,%%mm4\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
/*mm5=t1''=t1'+t2'*/ \
|
||||
"movq %%mm4,"_r0"(%[y])\n\t" \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*mm2=t6''=t6'+t5'*/ \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"movq %%mm5,"_r4"(%[y])\n\t" \
|
||||
/*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
|
||||
/*mm4, mm5, mm6, mm7 are free.*/ \
|
||||
/*Stage 3:*/ \
|
||||
/*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
|
||||
"mov $0x5A806A0A,%[a]\n\t" \
|
||||
"pcmpeqb %%mm6,%%mm6\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"psrlw $15,%%mm6\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
/*mm0=0, mm2={-1}x4 \
|
||||
mm5:mm4=t5''*27146+0xB500*/ \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
"movq %%mm2,"_r3"(%[y])\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"movq %%mm0,"_r7"(%[y])\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
"pxor %%mm0,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"pcmpeqb %%mm2,%%mm2\n\t" \
|
||||
/*mm2=t6'', mm1=t5''+(t5''!=0) \
|
||||
mm4=(t5''*27146+0xB500>>16)*/ \
|
||||
"pcmpeqw %%mm1,%%mm0\n\t" \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"psubw %%mm2,%%mm0\n\t" \
|
||||
"movq "_r3"(%[y]),%%mm2\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"paddw %%mm0,%%mm1\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
/*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
|
||||
"paddw %%mm1,%%mm4\n\t" \
|
||||
"movq "_r7"(%[y]),%%mm0\n\t" \
|
||||
"psraw $1,%%mm4\n\t" \
|
||||
"movq %%mm3,%%mm1\n\t" \
|
||||
/*mm3=t4''=t4'+s*/ \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
/*mm1=t5'''=t4'-s*/ \
|
||||
"psubw %%mm4,%%mm1\n\t" \
|
||||
/*mm1=0, mm3={-1}x4 \
|
||||
mm5:mm4=t6''*27146+0xB500*/ \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
"movq %%mm1,"_r5"(%[y])\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"movq %%mm3,"_r1"(%[y])\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
"pxor %%mm1,%%mm1\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"pcmpeqb %%mm3,%%mm3\n\t" \
|
||||
/*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"pcmpeqw %%mm2,%%mm1\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"psubw %%mm3,%%mm1\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
/*mm1=t1'' \
|
||||
mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
|
||||
"paddw %%mm2,%%mm4\n\t" \
|
||||
"movq "_r4"(%[y]),%%mm1\n\t" \
|
||||
"psraw $1,%%mm4\n\t" \
|
||||
"movq %%mm0,%%mm2\n\t" \
|
||||
/*mm7={54491-0x7FFF,0x7FFF}x2 \
|
||||
mm0=t7''=t7'+s*/ \
|
||||
"paddw %%mm4,%%mm0\n\t" \
|
||||
/*mm2=t6'''=t7'-s*/ \
|
||||
"psubw %%mm4,%%mm2\n\t" \
|
||||
/*Stage 4:*/ \
|
||||
/*mm0=0, mm2=t0'' \
|
||||
mm5:mm4=t1''*27146+0xB500*/ \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
"movq %%mm2,"_r3"(%[y])\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"movq "_r0"(%[y]),%%mm2\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
"movq %%mm0,"_r7"(%[y])\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"pxor %%mm0,%%mm0\n\t" \
|
||||
/*mm7={27146,0x4000>>1}x2 \
|
||||
mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"mov $0x20006A0A,%[a]\n\t" \
|
||||
"pcmpeqw %%mm1,%%mm0\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"psubw %%mm3,%%mm0\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"paddw %%mm1,%%mm0\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"paddw %%mm4,%%mm0\n\t" \
|
||||
/*mm6={0x00000E3D}x2 \
|
||||
mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
"mov $0x0E3D,%[a]\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
"movd %[a],%%mm6\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"pxor %%mm1,%%mm1\n\t" \
|
||||
"punpckldq %%mm6,%%mm6\n\t" \
|
||||
"pcmpeqw %%mm2,%%mm1\n\t" \
|
||||
/*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"psubw %%mm3,%%mm1\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"movq "_r5"(%[y]),%%mm1\n\t" \
|
||||
"paddw %%mm2,%%mm4\n\t" \
|
||||
/*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
|
||||
The naive implementation could cause overflow, so we use \
|
||||
u=(r&s)+((r^s)>>1).*/ \
|
||||
"movq "_r3"(%[y]),%%mm2\n\t" \
|
||||
"movq %%mm0,%%mm7\n\t" \
|
||||
"pxor %%mm4,%%mm0\n\t" \
|
||||
"pand %%mm4,%%mm7\n\t" \
|
||||
"psraw $1,%%mm0\n\t" \
|
||||
"mov $0x7FFF54DC,%[a]\n\t" \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
/*mm7={54491-0x7FFF,0x7FFF}x2 \
|
||||
mm4=_y[4]=v=r-u*/ \
|
||||
"psubw %%mm0,%%mm4\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"movq %%mm4,"_r4"(%[y])\n\t" \
|
||||
/*mm0=0, mm7={36410}x4 \
|
||||
mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"punpcklwd %%mm1,%%mm4\n\t" \
|
||||
"mov $0x8E3A8E3A,%[a]\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"movq %%mm0,"_r0"(%[y])\n\t" \
|
||||
"punpckhwd %%mm1,%%mm5\n\t" \
|
||||
"pxor %%mm0,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"pcmpeqw %%mm0,%%mm1\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"psubw %%mm3,%%mm1\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"paddd %%mm6,%%mm4\n\t" \
|
||||
"paddd %%mm6,%%mm5\n\t" \
|
||||
/*mm0=0 \
|
||||
mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
|
||||
"movq %%mm2,%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm3\n\t" \
|
||||
"pmulhw %%mm7,%%mm6\n\t" \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
"pmullw %%mm7,%%mm3\n\t" \
|
||||
"pxor %%mm0,%%mm0\n\t" \
|
||||
"paddw %%mm1,%%mm6\n\t" \
|
||||
"movq %%mm3,%%mm1\n\t" \
|
||||
"punpckhwd %%mm6,%%mm3\n\t" \
|
||||
"punpcklwd %%mm6,%%mm1\n\t" \
|
||||
/*mm3={-1}x4, mm6={1}x4 \
|
||||
mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
|
||||
"paddd %%mm3,%%mm5\n\t" \
|
||||
"paddd %%mm1,%%mm4\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"pcmpeqb %%mm3,%%mm3\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"psubw %%mm3,%%mm6\n\t" \
|
||||
/*mm1=t7'', mm7={26568,0x3400}x2 \
|
||||
mm2=s=t6'''-(36410*u>>16)*/ \
|
||||
"movq %%mm4,%%mm1\n\t" \
|
||||
"mov $0x340067C8,%[a]\n\t" \
|
||||
"pmulhw %%mm7,%%mm4\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"movq %%mm1,"_r5"(%[y])\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"paddw %%mm1,%%mm4\n\t" \
|
||||
"movq "_r7"(%[y]),%%mm1\n\t" \
|
||||
"psubw %%mm4,%%mm2\n\t" \
|
||||
/*mm6={0x00007B1B}x2 \
|
||||
mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
"pcmpeqw %%mm2,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"mov $0x7B1B,%[a]\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
"movd %[a],%%mm6\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"psubw %%mm3,%%mm0\n\t" \
|
||||
"punpckldq %%mm6,%%mm6\n\t" \
|
||||
/*mm7={64277-0x7FFF,0x7FFF}x2 \
|
||||
mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
|
||||
"psrad $17,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"psrad $17,%%mm5\n\t" \
|
||||
"mov $0x7FFF7B16,%[a]\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"paddw %%mm4,%%mm2\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
/*mm0=0, mm7={12785}x4 \
|
||||
mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"movq %%mm2,"_r3"(%[y])\n\t" \
|
||||
"punpcklwd %%mm1,%%mm4\n\t" \
|
||||
"movq "_r1"(%[y]),%%mm2\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"mov $0x31F131F1,%[a]\n\t" \
|
||||
"punpckhwd %%mm1,%%mm5\n\t" \
|
||||
"pxor %%mm0,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"pcmpeqw %%mm0,%%mm1\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"psubw %%mm3,%%mm1\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"paddd %%mm6,%%mm4\n\t" \
|
||||
"paddd %%mm6,%%mm5\n\t" \
|
||||
/*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
|
||||
"movq %%mm2,%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm3\n\t" \
|
||||
"pmulhw %%mm7,%%mm6\n\t" \
|
||||
"pmullw %%mm7,%%mm3\n\t" \
|
||||
"paddw %%mm1,%%mm6\n\t" \
|
||||
"movq %%mm3,%%mm1\n\t" \
|
||||
"punpckhwd %%mm6,%%mm3\n\t" \
|
||||
"punpcklwd %%mm6,%%mm1\n\t" \
|
||||
/*mm3={-1}x4, mm6={1}x4 \
|
||||
mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
|
||||
"paddd %%mm3,%%mm5\n\t" \
|
||||
"paddd %%mm1,%%mm4\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"pcmpeqb %%mm3,%%mm3\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"psubw %%mm3,%%mm6\n\t" \
|
||||
/*mm1=t3'', mm7={20539,0x3000}x2 \
|
||||
mm4=s=(12785*u>>16)-t4''*/ \
|
||||
"movq %%mm4,"_r1"(%[y])\n\t" \
|
||||
"pmulhw %%mm7,%%mm4\n\t" \
|
||||
"mov $0x3000503B,%[a]\n\t" \
|
||||
"movq "_r6"(%[y]),%%mm1\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"psubw %%mm2,%%mm4\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
/*mm6={0x00006CB7}x2 \
|
||||
mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"movq %%mm4,%%mm2\n\t" \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
"pcmpeqw %%mm2,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"mov $0x6CB7,%[a]\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
"movd %[a],%%mm6\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"psubw %%mm3,%%mm0\n\t" \
|
||||
"punpckldq %%mm6,%%mm6\n\t" \
|
||||
/*mm7={60547-0x7FFF,0x7FFF}x2 \
|
||||
mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
|
||||
"psrad $20,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"psrad $20,%%mm5\n\t" \
|
||||
"mov $0x7FFF6C84,%[a]\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"paddw %%mm4,%%mm2\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
/*mm0=0, mm7={25080}x4 \
|
||||
mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"movq %%mm2,"_r7"(%[y])\n\t" \
|
||||
"punpcklwd %%mm1,%%mm4\n\t" \
|
||||
"movq "_r2"(%[y]),%%mm2\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"mov $0x61F861F8,%[a]\n\t" \
|
||||
"punpckhwd %%mm1,%%mm5\n\t" \
|
||||
"pxor %%mm0,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm5\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"pcmpeqw %%mm0,%%mm1\n\t" \
|
||||
"psubw %%mm3,%%mm1\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"paddd %%mm6,%%mm4\n\t" \
|
||||
"paddd %%mm6,%%mm5\n\t" \
|
||||
/*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
|
||||
"movq %%mm2,%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm3\n\t" \
|
||||
"pmulhw %%mm7,%%mm6\n\t" \
|
||||
"pmullw %%mm7,%%mm3\n\t" \
|
||||
"paddw %%mm1,%%mm6\n\t" \
|
||||
"movq %%mm3,%%mm1\n\t" \
|
||||
"punpckhwd %%mm6,%%mm3\n\t" \
|
||||
"punpcklwd %%mm6,%%mm1\n\t" \
|
||||
/*mm1={-1}x4 \
|
||||
mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
|
||||
"paddd %%mm3,%%mm5\n\t" \
|
||||
"paddd %%mm1,%%mm4\n\t" \
|
||||
"psrad $16,%%mm5\n\t" \
|
||||
"mov $0x28005460,%[a]\n\t" \
|
||||
"psrad $16,%%mm4\n\t" \
|
||||
"pcmpeqb %%mm1,%%mm1\n\t" \
|
||||
"packssdw %%mm5,%%mm4\n\t" \
|
||||
/*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
|
||||
mm4=s=(25080*u>>16)-t2''*/ \
|
||||
"movq %%mm4,%%mm6\n\t" \
|
||||
"pmulhw %%mm7,%%mm4\n\t" \
|
||||
"pxor %%mm5,%%mm5\n\t" \
|
||||
"movd %[a],%%mm7\n\t" \
|
||||
"psubw %%mm1,%%mm5\n\t" \
|
||||
"punpckldq %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm2,%%mm4\n\t" \
|
||||
/*mm2=s+(s!=0) \
|
||||
mm4:mm3=s*21600+0x2800*/ \
|
||||
"movq %%mm4,%%mm3\n\t" \
|
||||
"movq %%mm4,%%mm2\n\t" \
|
||||
"punpckhwd %%mm5,%%mm4\n\t" \
|
||||
"pcmpeqw %%mm2,%%mm0\n\t" \
|
||||
"pmaddwd %%mm7,%%mm4\n\t" \
|
||||
"psubw %%mm1,%%mm0\n\t" \
|
||||
"punpcklwd %%mm5,%%mm3\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"pmaddwd %%mm7,%%mm3\n\t" \
|
||||
/*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
|
||||
mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
|
||||
"movq "_r4"(%[y]),%%mm0\n\t" \
|
||||
"psrad $18,%%mm4\n\t" \
|
||||
"movq "_r5"(%[y]),%%mm5\n\t" \
|
||||
"psrad $18,%%mm3\n\t" \
|
||||
"movq "_r7"(%[y]),%%mm1\n\t" \
|
||||
"packssdw %%mm4,%%mm3\n\t" \
|
||||
"movq "_r0"(%[y]),%%mm4\n\t" \
|
||||
"paddw %%mm2,%%mm3\n\t" \
|
||||
|
||||
/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].
  _y[1] and _y[3] are not taken from registers: they are loaded from the
   slots _r1 and _r3 (displacements from %[y]) once mm0 and mm5 are free.
  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and
   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].
  The "_y[4]" part of the output is stored to the slot _r4, since there is no
   register left to hold it.
  Only _r1, _r3 and _r4 are referenced by this macro; the remaining
   parameters are unused.*/
|
||||
# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \
|
||||
"#OC_TRANSPOSE8x4\n\t" \
|
||||
/*First 4x4 transpose:*/ \
|
||||
/*mm0 = e3 e2 e1 e0 \
|
||||
mm5 = f3 f2 f1 f0 \
|
||||
mm3 = g3 g2 g1 g0 \
|
||||
mm1 = h3 h2 h1 h0*/ \
|
||||
"movq %%mm0,%%mm2\n\t" \
|
||||
"punpcklwd %%mm5,%%mm0\n\t" \
|
||||
"punpckhwd %%mm5,%%mm2\n\t" \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"punpcklwd %%mm1,%%mm3\n\t" \
|
||||
"punpckhwd %%mm1,%%mm5\n\t" \
|
||||
/*mm0 = f1 e1 f0 e0 \
|
||||
mm2 = f3 e3 f2 e2 \
|
||||
mm3 = h1 g1 h0 g0 \
|
||||
mm5 = h3 g3 h2 g2*/ \
|
||||
"movq %%mm0,%%mm1\n\t" \
|
||||
"punpckldq %%mm3,%%mm0\n\t" \
|
||||
"movq %%mm0,"_r4"(%[y])\n\t" \
|
||||
"punpckhdq %%mm3,%%mm1\n\t" \
|
||||
"movq "_r1"(%[y]),%%mm0\n\t" \
|
||||
"movq %%mm2,%%mm3\n\t" \
|
||||
"punpckldq %%mm5,%%mm2\n\t" \
|
||||
"punpckhdq %%mm5,%%mm3\n\t" \
|
||||
"movq "_r3"(%[y]),%%mm5\n\t" \
|
||||
/*_y[4] = h0 g0 f0 e0 \
|
||||
mm1 = h1 g1 f1 e1 \
|
||||
mm2 = h2 g2 f2 e2 \
|
||||
mm3 = h3 g3 f3 e3*/ \
|
||||
/*Second 4x4 transpose:*/ \
|
||||
/*mm4 = a3 a2 a1 a0 \
|
||||
mm0 = b3 b2 b1 b0 \
|
||||
mm6 = c3 c2 c1 c0 \
|
||||
mm5 = d3 d2 d1 d0*/ \
|
||||
"movq %%mm4,%%mm7\n\t" \
|
||||
"punpcklwd %%mm0,%%mm4\n\t" \
|
||||
"punpckhwd %%mm0,%%mm7\n\t" \
|
||||
"movq %%mm6,%%mm0\n\t" \
|
||||
"punpcklwd %%mm5,%%mm6\n\t" \
|
||||
"punpckhwd %%mm5,%%mm0\n\t" \
|
||||
/*mm4 = b1 a1 b0 a0 \
|
||||
mm7 = b3 a3 b2 a2 \
|
||||
mm6 = d1 c1 d0 c0 \
|
||||
mm0 = d3 c3 d2 c2*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"punpckldq %%mm6,%%mm4\n\t" \
|
||||
"punpckhdq %%mm6,%%mm5\n\t" \
|
||||
"movq %%mm7,%%mm6\n\t" \
|
||||
"punpckhdq %%mm0,%%mm7\n\t" \
|
||||
"punpckldq %%mm0,%%mm6\n\t" \
|
||||
/*mm4 = d0 c0 b0 a0 \
|
||||
mm5 = d1 c1 b1 a1 \
|
||||
mm6 = d2 c2 b2 a2 \
|
||||
mm7 = d3 c3 b3 a3*/ \
|
||||
|
||||
/*MMX implementation of the fDCT.
  _y: Addressed through the [y] operand; used as scratch storage between the
       transform passes.
      The final result is produced by OC_TRANSPOSE_ZIG_ZAG_MMXEXT, which is
       defined outside this function (presumably it writes the transposed,
       zig-zag ordered coefficients to _y; confirm against its definition).
  _x: The 64 input values, read as eight rows of 16 bytes.
  The input is processed as two 8x4 halves (byte offsets 0x00 and 0x08 within
   each row).
  Every input word is shifted left by 2 for extra working precision.
  In the first half three biases are applied after the shift: the first word
   of row 0 gets +1, plus one more when the unshifted value is nonzero, the
   second word of row 0 gets +1, and the first word of row 1 gets -1.
  After the second transform pass, 2 is added to each value before an
   arithmetic right shift by 2, and the rounded rows are collected in the
   local array buf, which the final macro reads back through
   OC_ZZ_LOAD_ROW_LO/OC_ZZ_LOAD_ROW_HI.*/
|
||||
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
ptrdiff_t a;
|
||||
__asm__ __volatile__(
|
||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||
we could overflow.*/
|
||||
/*We also add biases to correct for some systematic error that remains in
|
||||
the full fDCT->iDCT round trip.*/
|
||||
"movq 0x00(%[x]),%%mm0\n\t"
|
||||
"movq 0x10(%[x]),%%mm1\n\t"
|
||||
"movq 0x20(%[x]),%%mm2\n\t"
|
||||
"movq 0x30(%[x]),%%mm3\n\t"
|
||||
"pcmpeqb %%mm4,%%mm4\n\t"
|
||||
"pxor %%mm7,%%mm7\n\t"
|
||||
"movq %%mm0,%%mm5\n\t"
|
||||
"psllw $2,%%mm0\n\t"
|
||||
"pcmpeqw %%mm7,%%mm5\n\t"
|
||||
"movq 0x70(%[x]),%%mm7\n\t"
|
||||
"psllw $2,%%mm1\n\t"
|
||||
"psubw %%mm4,%%mm5\n\t"
|
||||
"psllw $2,%%mm2\n\t"
|
||||
"mov $1,%[a]\n\t"
|
||||
"pslld $16,%%mm5\n\t"
|
||||
"movd %[a],%%mm6\n\t"
|
||||
"psllq $16,%%mm5\n\t"
|
||||
"mov $0x10001,%[a]\n\t"
|
||||
"psllw $2,%%mm3\n\t"
|
||||
"movd %[a],%%mm4\n\t"
|
||||
"punpckhwd %%mm6,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"movq 0x60(%[x]),%%mm6\n\t"
|
||||
"paddw %%mm5,%%mm0\n\t"
|
||||
"movq 0x50(%[x]),%%mm5\n\t"
|
||||
"paddw %%mm4,%%mm0\n\t"
|
||||
"movq 0x40(%[x]),%%mm4\n\t"
|
||||
/*We inline stage1 of the transform here so we can get better instruction
|
||||
scheduling with the shifts.*/
|
||||
/*mm0=t7'=t0-t7*/
|
||||
"psllw $2,%%mm7\n\t"
|
||||
"psubw %%mm7,%%mm0\n\t"
|
||||
"psllw $2,%%mm6\n\t"
|
||||
"paddw %%mm7,%%mm7\n\t"
|
||||
/*mm1=t6'=t1-t6*/
|
||||
"psllw $2,%%mm5\n\t"
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"psllw $2,%%mm4\n\t"
|
||||
"paddw %%mm6,%%mm6\n\t"
|
||||
/*mm2=t5'=t2-t5*/
|
||||
"psubw %%mm5,%%mm2\n\t"
|
||||
"paddw %%mm5,%%mm5\n\t"
|
||||
/*mm3=t4'=t3-t4*/
|
||||
"psubw %%mm4,%%mm3\n\t"
|
||||
"paddw %%mm4,%%mm4\n\t"
|
||||
/*mm7=t0'=t0+t7*/
|
||||
"paddw %%mm0,%%mm7\n\t"
|
||||
/*mm6=t1'=t1+t6*/
|
||||
"paddw %%mm1,%%mm6\n\t"
|
||||
/*mm5=t2'=t2+t5*/
|
||||
"paddw %%mm2,%%mm5\n\t"
|
||||
/*mm4=t3'=t3+t4*/
|
||||
"paddw %%mm3,%%mm4\n\t"
|
||||
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
|
||||
OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")
|
||||
/*Swap out this 8x4 block for the next one.*/
|
||||
"movq 0x08(%[x]),%%mm0\n\t"
|
||||
"movq %%mm7,0x30(%[y])\n\t"
|
||||
"movq 0x78(%[x]),%%mm7\n\t"
|
||||
"movq %%mm1,0x50(%[y])\n\t"
|
||||
"movq 0x18(%[x]),%%mm1\n\t"
|
||||
"movq %%mm6,0x20(%[y])\n\t"
|
||||
"movq 0x68(%[x]),%%mm6\n\t"
|
||||
"movq %%mm2,0x60(%[y])\n\t"
|
||||
"movq 0x28(%[x]),%%mm2\n\t"
|
||||
"movq %%mm5,0x10(%[y])\n\t"
|
||||
"movq 0x58(%[x]),%%mm5\n\t"
|
||||
"movq %%mm3,0x70(%[y])\n\t"
|
||||
"movq 0x38(%[x]),%%mm3\n\t"
|
||||
/*And increase its working precision, too.*/
|
||||
"psllw $2,%%mm0\n\t"
|
||||
"movq %%mm4,0x00(%[y])\n\t"
|
||||
"psllw $2,%%mm7\n\t"
|
||||
"movq 0x48(%[x]),%%mm4\n\t"
|
||||
/*We inline stage1 of the transform here so we can get better instruction
|
||||
scheduling with the shifts.*/
|
||||
/*mm0=t7'=t0-t7*/
|
||||
"psubw %%mm7,%%mm0\n\t"
|
||||
"psllw $2,%%mm1\n\t"
|
||||
"paddw %%mm7,%%mm7\n\t"
|
||||
"psllw $2,%%mm6\n\t"
|
||||
/*mm1=t6'=t1-t6*/
|
||||
"psubw %%mm6,%%mm1\n\t"
|
||||
"psllw $2,%%mm2\n\t"
|
||||
"paddw %%mm6,%%mm6\n\t"
|
||||
"psllw $2,%%mm5\n\t"
|
||||
/*mm2=t5'=t2-t5*/
|
||||
"psubw %%mm5,%%mm2\n\t"
|
||||
"psllw $2,%%mm3\n\t"
|
||||
"paddw %%mm5,%%mm5\n\t"
|
||||
"psllw $2,%%mm4\n\t"
|
||||
/*mm3=t4'=t3-t4*/
|
||||
"psubw %%mm4,%%mm3\n\t"
|
||||
"paddw %%mm4,%%mm4\n\t"
|
||||
/*mm7=t0'=t0+t7*/
|
||||
"paddw %%mm0,%%mm7\n\t"
|
||||
/*mm6=t1'=t1+t6*/
|
||||
"paddw %%mm1,%%mm6\n\t"
|
||||
/*mm5=t2'=t2+t5*/
|
||||
"paddw %%mm2,%%mm5\n\t"
|
||||
/*mm4=t3'=t3+t4*/
|
||||
"paddw %%mm3,%%mm4\n\t"
|
||||
OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
|
||||
OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place,
|
||||
so we only have to do half the stores and loads.*/
|
||||
"movq 0x00(%[y]),%%mm0\n\t"
|
||||
"movq %%mm1,0x58(%[y])\n\t"
|
||||
"movq 0x10(%[y]),%%mm1\n\t"
|
||||
"movq %%mm2,0x68(%[y])\n\t"
|
||||
"movq 0x20(%[y]),%%mm2\n\t"
|
||||
"movq %%mm3,0x78(%[y])\n\t"
|
||||
"movq 0x30(%[y]),%%mm3\n\t"
|
||||
OC_FDCT_STAGE1_8x4
|
||||
OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")
|
||||
/*mm2={-2}x4*/
|
||||
"pcmpeqw %%mm2,%%mm2\n\t"
|
||||
"paddw %%mm2,%%mm2\n\t"
|
||||
/*Round and store the results (no transpose).*/
|
||||
"movq 0x10(%[y]),%%mm7\n\t"
|
||||
"psubw %%mm2,%%mm4\n\t"
|
||||
"psubw %%mm2,%%mm6\n\t"
|
||||
"psraw $2,%%mm4\n\t"
|
||||
"psubw %%mm2,%%mm0\n\t"
|
||||
"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"
|
||||
"movq 0x30(%[y]),%%mm4\n\t"
|
||||
"psraw $2,%%mm6\n\t"
|
||||
"psubw %%mm2,%%mm5\n\t"
|
||||
"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"
|
||||
"psraw $2,%%mm0\n\t"
|
||||
"psubw %%mm2,%%mm3\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"
|
||||
"psraw $2,%%mm5\n\t"
|
||||
"psubw %%mm2,%%mm1\n\t"
|
||||
"movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"
|
||||
"psraw $2,%%mm3\n\t"
|
||||
"psubw %%mm2,%%mm7\n\t"
|
||||
"movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"
|
||||
"psraw $2,%%mm1\n\t"
|
||||
"psubw %%mm2,%%mm4\n\t"
|
||||
"movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"
|
||||
"psraw $2,%%mm7\n\t"
|
||||
"movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"
|
||||
"psraw $2,%%mm4\n\t"
|
||||
"movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"
|
||||
/*Load the next block.*/
|
||||
"movq 0x40(%[y]),%%mm0\n\t"
|
||||
"movq 0x78(%[y]),%%mm7\n\t"
|
||||
"movq 0x50(%[y]),%%mm1\n\t"
|
||||
"movq 0x68(%[y]),%%mm6\n\t"
|
||||
"movq 0x60(%[y]),%%mm2\n\t"
|
||||
"movq 0x58(%[y]),%%mm5\n\t"
|
||||
"movq 0x70(%[y]),%%mm3\n\t"
|
||||
"movq 0x48(%[y]),%%mm4\n\t"
|
||||
OC_FDCT_STAGE1_8x4
|
||||
OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")
|
||||
/*mm2={-2}x4*/
|
||||
"pcmpeqw %%mm2,%%mm2\n\t"
|
||||
"paddw %%mm2,%%mm2\n\t"
|
||||
/*Round and store the results (no transpose).*/
|
||||
"movq 0x50(%[y]),%%mm7\n\t"
|
||||
"psubw %%mm2,%%mm4\n\t"
|
||||
"psubw %%mm2,%%mm6\n\t"
|
||||
"psraw $2,%%mm4\n\t"
|
||||
"psubw %%mm2,%%mm0\n\t"
|
||||
"movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"
|
||||
"movq 0x70(%[y]),%%mm4\n\t"
|
||||
"psraw $2,%%mm6\n\t"
|
||||
"psubw %%mm2,%%mm5\n\t"
|
||||
"movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"
|
||||
"psraw $2,%%mm0\n\t"
|
||||
"psubw %%mm2,%%mm3\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"
|
||||
"psraw $2,%%mm5\n\t"
|
||||
"psubw %%mm2,%%mm1\n\t"
|
||||
"movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"
|
||||
"psraw $2,%%mm3\n\t"
|
||||
"psubw %%mm2,%%mm7\n\t"
|
||||
"movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"
|
||||
"psraw $2,%%mm1\n\t"
|
||||
"psubw %%mm2,%%mm4\n\t"
|
||||
"movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"
|
||||
"psraw $2,%%mm7\n\t"
|
||||
"movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"
|
||||
"psraw $2,%%mm4\n\t"
|
||||
"movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"
|
||||
/*Final transpose and zig-zag.*/
|
||||
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
|
||||
"movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \
|
||||
|
||||
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
|
||||
"movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \
|
||||
|
||||
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
|
||||
#undef OC_ZZ_LOAD_ROW_LO
|
||||
#undef OC_ZZ_LOAD_ROW_HI
|
||||
:[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))
|
||||
:[y]"r"(_y),[x]"r"(_x)
|
||||
:"memory"
|
||||
);
|
||||
}
|
||||
|
||||
#endif
|
||||
368
engine/thirdparty/libtheora/x86/mmxfrag.c
vendored
Normal file
368
engine/thirdparty/libtheora/x86/mmxfrag.c
vendored
Normal file
|
|
@ -0,0 +1,368 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*MMX acceleration of fragment reconstruction for motion compensation.
|
||||
Originally written by Rudolf Marek.
|
||||
Additional optimization by Nils Pipenbrinck.
|
||||
Note: Loops are unrolled for best performance.
|
||||
The iteration each instruction belongs to is marked in the comments as #i.*/
|
||||
#include <stddef.h>
|
||||
#include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The rows are moved as two groups of four 8-byte loads and stores through
   mm0...mm3; both local pointers are advanced by 4*_ystride between groups.
  _dst and _src are each evaluated once into local pointers, and _ystride is
   converted to ptrdiff_t before it is used as an index register.
  The expansion is a do{}while(0) statement, so the point of use supplies the
   trailing semicolon.*/
|
||||
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
|
||||
do{ \
|
||||
const unsigned char *src; \
|
||||
unsigned char *dst; \
|
||||
ptrdiff_t ystride3; \
|
||||
src=(_src); \
|
||||
dst=(_dst); \
|
||||
__asm__ __volatile__( \
|
||||
/*src+0*ystride*/ \
|
||||
"movq (%[src]),%%mm0\n\t" \
|
||||
/*src+1*ystride*/ \
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||
/*ystride3=ystride*3*/ \
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
||||
/*src+2*ystride*/ \
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||
/*src+3*ystride*/ \
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||
/*dst+0*ystride*/ \
|
||||
"movq %%mm0,(%[dst])\n\t" \
|
||||
/*dst+1*ystride*/ \
|
||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||
/*Pointer to next 4.*/ \
|
||||
"lea (%[src],%[ystride],4),%[src]\n\t" \
|
||||
/*dst+2*ystride*/ \
|
||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||
/*dst+3*ystride*/ \
|
||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||
/*Pointer to next 4.*/ \
|
||||
"lea (%[dst],%[ystride],4),%[dst]\n\t" \
|
||||
/*src+0*ystride*/ \
|
||||
"movq (%[src]),%%mm0\n\t" \
|
||||
/*src+1*ystride*/ \
|
||||
"movq (%[src],%[ystride]),%%mm1\n\t" \
|
||||
/*src+2*ystride*/ \
|
||||
"movq (%[src],%[ystride],2),%%mm2\n\t" \
|
||||
/*src+3*ystride*/ \
|
||||
"movq (%[src],%[ystride3]),%%mm3\n\t" \
|
||||
/*dst+0*ystride*/ \
|
||||
"movq %%mm0,(%[dst])\n\t" \
|
||||
/*dst+1*ystride*/ \
|
||||
"movq %%mm1,(%[dst],%[ystride])\n\t" \
|
||||
/*dst+2*ystride*/ \
|
||||
"movq %%mm2,(%[dst],%[ystride],2)\n\t" \
|
||||
/*dst+3*ystride*/ \
|
||||
"movq %%mm3,(%[dst],%[ystride3])\n\t" \
|
||||
:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
|
||||
:[ystride]"r"((ptrdiff_t)(_ystride)) \
|
||||
:"memory" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
/*Function form of OC_FRAG_COPY_MMX: copies one 8x8 block of pixels.
  _dst:     The first pixel of the destination block.
  _src:     The first pixel of the source block.
  _ystride: The number of bytes between rows, in both blocks.*/
void oc_frag_copy_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride){
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}
|
||||
|
||||
/*Copies a list of 8x8 fragments from one frame to another.
  Each entry of _fragis selects an offset from _frag_buf_offs; the block at
   that offset is copied from the source frame to the same offset in the
   destination frame.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  fragii=0;
  while(fragii<_nfragis){
    ptrdiff_t frag_off;
    /*Look up where this fragment lives inside both frames.*/
    frag_off=_frag_buf_offs[_fragis[fragii]];
    OC_FRAG_COPY_MMX(_dst_frame+frag_off,_src_frame+frag_off,_ystride);
    fragii++;
  }
}
|
||||
|
||||
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue){
|
||||
__asm__ __volatile__(
|
||||
/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
|
||||
"pcmpeqw %%mm0,%%mm0\n\t"
|
||||
/*#0 Load low residue.*/
|
||||
"movq 0*8(%[residue]),%%mm1\n\t"
|
||||
/*#0 Load high residue.*/
|
||||
"movq 1*8(%[residue]),%%mm2\n\t"
|
||||
/*Set mm0 to 0x8000800080008000.*/
|
||||
"psllw $15,%%mm0\n\t"
|
||||
/*#1 Load low residue.*/
|
||||
"movq 2*8(%[residue]),%%mm3\n\t"
|
||||
/*#1 Load high residue.*/
|
||||
"movq 3*8(%[residue]),%%mm4\n\t"
|
||||
/*Set mm0 to 0x0080008000800080.*/
|
||||
"psrlw $8,%%mm0\n\t"
|
||||
/*#2 Load low residue.*/
|
||||
"movq 4*8(%[residue]),%%mm5\n\t"
|
||||
/*#2 Load high residue.*/
|
||||
"movq 5*8(%[residue]),%%mm6\n\t"
|
||||
/*#0 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm1\n\t"
|
||||
/*#0 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm2\n\t"
|
||||
/*#0 Pack to byte.*/
|
||||
"packuswb %%mm2,%%mm1\n\t"
|
||||
/*#1 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm3\n\t"
|
||||
/*#1 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm4\n\t"
|
||||
/*#1 Pack to byte.*/
|
||||
"packuswb %%mm4,%%mm3\n\t"
|
||||
/*#2 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm5\n\t"
|
||||
/*#2 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm6\n\t"
|
||||
/*#2 Pack to byte.*/
|
||||
"packuswb %%mm6,%%mm5\n\t"
|
||||
/*#0 Write row.*/
|
||||
"movq %%mm1,(%[dst])\n\t"
|
||||
/*#1 Write row.*/
|
||||
"movq %%mm3,(%[dst],%[ystride])\n\t"
|
||||
/*#2 Write row.*/
|
||||
"movq %%mm5,(%[dst],%[ystride],2)\n\t"
|
||||
/*#3 Load low residue.*/
|
||||
"movq 6*8(%[residue]),%%mm1\n\t"
|
||||
/*#3 Load high residue.*/
|
||||
"movq 7*8(%[residue]),%%mm2\n\t"
|
||||
/*#4 Load high residue.*/
|
||||
"movq 8*8(%[residue]),%%mm3\n\t"
|
||||
/*#4 Load high residue.*/
|
||||
"movq 9*8(%[residue]),%%mm4\n\t"
|
||||
/*#5 Load high residue.*/
|
||||
"movq 10*8(%[residue]),%%mm5\n\t"
|
||||
/*#5 Load high residue.*/
|
||||
"movq 11*8(%[residue]),%%mm6\n\t"
|
||||
/*#3 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm1\n\t"
|
||||
/*#3 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm2\n\t"
|
||||
/*#3 Pack to byte.*/
|
||||
"packuswb %%mm2,%%mm1\n\t"
|
||||
/*#4 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm3\n\t"
|
||||
/*#4 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm4\n\t"
|
||||
/*#4 Pack to byte.*/
|
||||
"packuswb %%mm4,%%mm3\n\t"
|
||||
/*#5 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm5\n\t"
|
||||
/*#5 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm6\n\t"
|
||||
/*#5 Pack to byte.*/
|
||||
"packuswb %%mm6,%%mm5\n\t"
|
||||
/*#3 Write row.*/
|
||||
"movq %%mm1,(%[dst],%[ystride3])\n\t"
|
||||
/*#4 Write row.*/
|
||||
"movq %%mm3,(%[dst4])\n\t"
|
||||
/*#5 Write row.*/
|
||||
"movq %%mm5,(%[dst4],%[ystride])\n\t"
|
||||
/*#6 Load low residue.*/
|
||||
"movq 12*8(%[residue]),%%mm1\n\t"
|
||||
/*#6 Load high residue.*/
|
||||
"movq 13*8(%[residue]),%%mm2\n\t"
|
||||
/*#7 Load low residue.*/
|
||||
"movq 14*8(%[residue]),%%mm3\n\t"
|
||||
/*#7 Load high residue.*/
|
||||
"movq 15*8(%[residue]),%%mm4\n\t"
|
||||
/*#6 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm1\n\t"
|
||||
/*#6 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm2\n\t"
|
||||
/*#6 Pack to byte.*/
|
||||
"packuswb %%mm2,%%mm1\n\t"
|
||||
/*#7 Bias low residue.*/
|
||||
"paddsw %%mm0,%%mm3\n\t"
|
||||
/*#7 Bias high residue.*/
|
||||
"paddsw %%mm0,%%mm4\n\t"
|
||||
/*#7 Pack to byte.*/
|
||||
"packuswb %%mm4,%%mm3\n\t"
|
||||
/*#6 Write row.*/
|
||||
"movq %%mm1,(%[dst4],%[ystride],2)\n\t"
|
||||
/*#7 Write row.*/
|
||||
"movq %%mm3,(%[dst4],%[ystride3])\n\t"
|
||||
:
|
||||
:[residue]"r"(_residue),
|
||||
[dst]"r"(_dst),
|
||||
[dst4]"r"(_dst+(_ystride<<2)),
|
||||
[ystride]"r"((ptrdiff_t)_ystride),
|
||||
[ystride3]"r"((ptrdiff_t)_ystride*3)
|
||||
:"memory"
|
||||
);
|
||||
}
|
||||
|
||||
/*Reconstructs an 8x8 inter fragment: each source pixel is widened to 16
   bits, the matching residue value is added with signed saturation, and the
   sum is packed back to an unsigned byte with saturation.
  _dst:     The first pixel of the destination fragment.
  _src:     The first pixel of the predictor fragment.
  _ystride: The number of bytes between rows, for both _dst and _src.
  _residue: The 64 residue values, in row order.*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int pass;
  /*mm0 is cleared once here and used as the zero half of every unpack
     below.*/
  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
  /*Two rows are reconstructed per pass, so four passes cover the block.*/
  for(pass=0;pass<4;pass++){
    __asm__ __volatile__(
      /*#0 Fetch the source row.*/
      "movq (%[src]),%%mm3\n\t"
      /*#1 Fetch the source row.*/
      "movq (%[src],%[ystride]),%%mm7\n\t"
      /*#0 Duplicate the source row.*/
      "movq %%mm3,%%mm4\n\t"
      /*#0 Widen the upper four pixels.*/
      "punpckhbw %%mm0,%%mm4\n\t"
      /*#0 Widen the lower four pixels.*/
      "punpcklbw %%mm0,%%mm3\n\t"
      /*#0 Upper half += residue.*/
      "paddsw 8(%[residue]),%%mm4\n\t"
      /*#1 Duplicate the source row.*/
      "movq %%mm7,%%mm2\n\t"
      /*#0 Lower half += residue.*/
      "paddsw (%[residue]), %%mm3\n\t"
      /*#1 Widen the upper four pixels.*/
      "punpckhbw %%mm0,%%mm2\n\t"
      /*#0 Saturate and pack the finished row.*/
      "packuswb %%mm4,%%mm3\n\t"
      /*#1 Widen the lower four pixels.*/
      "punpcklbw %%mm0,%%mm7\n\t"
      /*#1 Lower half += residue.*/
      "paddsw 16(%[residue]),%%mm7\n\t"
      /*#1 Upper half += residue.*/
      "paddsw 24(%[residue]),%%mm2\n\t"
      /*Step residue forward by two rows (32 bytes).*/
      "lea 32(%[residue]),%[residue]\n\t"
      /*#1 Saturate and pack the finished row.*/
      "packuswb %%mm2,%%mm7\n\t"
      /*Step src forward by two rows.*/
      "lea (%[src],%[ystride],2),%[src]\n\t"
      /*#0 Store the row.*/
      "movq %%mm3,(%[dst])\n\t"
      /*#1 Store the row.*/
      "movq %%mm7,(%[dst],%[ystride])\n\t"
      /*Step dst forward by two rows.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
|
||||
|
||||
/*Reconstructs an 8x8 inter fragment from two predictors: the two source
   pixels are widened to 16 bits, summed, and halved with an arithmetic shift;
   the residue is then added with signed saturation and the result is packed
   to an unsigned byte with saturation.
  _dst:     The first pixel of the destination fragment.
  _src1:    The first pixel of the first predictor fragment.
  _src2:    The first pixel of the second predictor fragment.
  _ystride: The number of bytes between rows, for all three buffers.
  _residue: The 64 residue values, in row order.*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int pass;
  /*mm7 is cleared once here and used as the zero half of every unpack
     below.*/
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  /*Two rows are reconstructed per pass, so four passes cover the block.*/
  for(pass=0;pass<4;pass++){
    __asm__ __volatile__(
      /*#0 Fetch the src1 row.*/
      "movq (%[src1]),%%mm0\n\t"
      /*#0 Fetch the src2 row.*/
      "movq (%[src2]),%%mm2\n\t"
      /*#0 Duplicate src1.*/
      "movq %%mm0,%%mm1\n\t"
      /*#0 Duplicate src2.*/
      "movq %%mm2,%%mm3\n\t"
      /*#1 Fetch the src1 row.*/
      "movq (%[src1],%[ystride]),%%mm4\n\t"
      /*#0 Widen the lower half of src1.*/
      "punpcklbw %%mm7,%%mm0\n\t"
      /*#1 Fetch the src2 row.*/
      "movq (%[src2],%[ystride]),%%mm5\n\t"
      /*#0 Widen the upper half of src1.*/
      "punpckhbw %%mm7,%%mm1\n\t"
      /*#0 Widen the lower half of src2.*/
      "punpcklbw %%mm7,%%mm2\n\t"
      /*#0 Widen the upper half of src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*Step src1 forward by two rows.*/
      "lea (%[src1],%[ystride],2),%[src1]\n\t"
      /*Step src2 forward by two rows.*/
      "lea (%[src2],%[ystride],2),%[src2]\n\t"
      /*#0 Lower half: src1+src2.*/
      "paddsw %%mm2,%%mm0\n\t"
      /*#0 Upper half: src1+src2.*/
      "paddsw %%mm3,%%mm1\n\t"
      /*#1 Duplicate src1.*/
      "movq %%mm4,%%mm2\n\t"
      /*#0 Halve the lower sums.*/
      "psraw $1,%%mm0\n\t"
      /*#1 Duplicate src2.*/
      "movq %%mm5,%%mm3\n\t"
      /*#1 Widen the lower half of src1.*/
      "punpcklbw %%mm7,%%mm4\n\t"
      /*#0 Halve the upper sums.*/
      "psraw $1,%%mm1\n\t"
      /*#1 Widen the upper half of src1.*/
      "punpckhbw %%mm7,%%mm2\n\t"
      /*#0 Lower half += residue.*/
      "paddsw (%[residue]),%%mm0\n\t"
      /*#1 Widen the lower half of src2.*/
      "punpcklbw %%mm7,%%mm5\n\t"
      /*#0 Upper half += residue.*/
      "paddsw 8(%[residue]),%%mm1\n\t"
      /*#1 Widen the upper half of src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*#1 Lower half: src1+src2.*/
      "paddsw %%mm4,%%mm5\n\t"
      /*#0 Saturate and pack the finished row.*/
      "packuswb %%mm1,%%mm0\n\t"
      /*#1 Upper half: src1+src2.*/
      "paddsw %%mm2,%%mm3\n\t"
      /*#0 Store the row.*/
      "movq %%mm0,(%[dst])\n\t"
      /*#1 Halve the lower sums.*/
      "psraw $1,%%mm5\n\t"
      /*#1 Halve the upper sums.*/
      "psraw $1,%%mm3\n\t"
      /*#1 Lower half += residue.*/
      "paddsw 16(%[residue]),%%mm5\n\t"
      /*#1 Upper half += residue.*/
      "paddsw 24(%[residue]),%%mm3\n\t"
      /*#1 Saturate and pack the finished row.*/
      "packuswb %%mm3,%%mm5\n\t"
      /*#1 Store the row.*/
      "movq %%mm5,(%[dst],%[ystride])\n\t"
      /*Step residue forward by two rows (32 bytes).*/
      "add $32,%[residue]\n\t"
      /*Step dst forward by two rows.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
       [src1]"+r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}
|
||||
|
||||
/*Issues a single emms instruction, which empties the MMX state so that x87
   floating-point code can run again after the MMX routines in this file.*/
void oc_restore_fpu_mmx(void){
  __asm__ __volatile__(
    "emms\n\t"
  );
}
|
||||
#endif
|
||||
558
engine/thirdparty/libtheora/x86/mmxidct.c
vendored
Normal file
558
engine/thirdparty/libtheora/x86/mmxidct.c
vendored
Normal file
|
|
@ -0,0 +1,558 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*MMX acceleration of Theora's iDCT.
|
||||
Originally written by Rudolf Marek, based on code from On2's VP3.*/
|
||||
#include "x86int.h"
|
||||
#include "../dct.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*These are offsets into the table of constants below.*/
|
||||
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
|
||||
#define OC_COSINE_OFFSET (0)
|
||||
/*A row of 8's.*/
|
||||
#define OC_EIGHT_OFFSET (56)
|
||||
|
||||
|
||||
|
||||
/*The opening stages of the iDCT, expanded by both OC_ROW_IDCT and
   OC_COLUMN_IDCT.
  Inputs are read from _x through OC_I(0...3,_x) and OC_J(4...7,_x); those
   two addressing macros must be defined at the point of expansion.
  The multipliers are read from the [c] operand at offsets 0x10...0x70.
  OC_I(1,_y) and OC_I(2,_y) are written as spill slots, and OC_I(1,_y) is
   reloaded into mm0 just before the macro ends; all other intermediate
   values are left in mm1...mm7 for the expanding macro to finish.
  38 cycles*/
|
||||
#define OC_IDCT_BEGIN(_y,_x) \
|
||||
"#OC_IDCT_BEGIN\n\t" \
|
||||
"movq "OC_I(3,_x)",%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq "OC_J(5,_x)",%%mm7\n\t" \
|
||||
"pmulhw %%mm6,%%mm4\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
|
||||
"pmulhw %%mm7,%%mm6\n\t" \
|
||||
"movq %%mm1,%%mm5\n\t" \
|
||||
"pmulhw %%mm2,%%mm1\n\t" \
|
||||
"movq "OC_I(1,_x)",%%mm3\n\t" \
|
||||
"pmulhw %%mm7,%%mm5\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
|
||||
"paddw %%mm2,%%mm4\n\t" \
|
||||
"paddw %%mm7,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"movq "OC_J(7,_x)",%%mm1\n\t" \
|
||||
"paddw %%mm5,%%mm7\n\t" \
|
||||
"movq %%mm0,%%mm5\n\t" \
|
||||
"pmulhw %%mm3,%%mm0\n\t" \
|
||||
"paddw %%mm7,%%mm4\n\t" \
|
||||
"pmulhw %%mm1,%%mm5\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm3,%%mm0\n\t" \
|
||||
"pmulhw %%mm7,%%mm3\n\t" \
|
||||
"movq "OC_I(2,_x)",%%mm2\n\t" \
|
||||
"pmulhw %%mm1,%%mm7\n\t" \
|
||||
"paddw %%mm1,%%mm5\n\t" \
|
||||
"movq %%mm2,%%mm1\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
|
||||
"psubw %%mm5,%%mm3\n\t" \
|
||||
"movq "OC_J(6,_x)",%%mm5\n\t" \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm5,%%mm7\n\t" \
|
||||
"psubw %%mm4,%%mm0\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"psubw %%mm6,%%mm3\n\t" \
|
||||
"paddw %%mm7,%%mm5\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
|
||||
"paddw %%mm3,%%mm6\n\t" \
|
||||
"movq %%mm4,"OC_I(1,_y)"\n\t" \
|
||||
"psubw %%mm5,%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"pmulhw %%mm4,%%mm3\n\t" \
|
||||
"paddw %%mm2,%%mm7\n\t" \
|
||||
"movq %%mm6,"OC_I(2,_y)"\n\t" \
|
||||
"movq %%mm0,%%mm2\n\t" \
|
||||
"movq "OC_I(0,_x)",%%mm6\n\t" \
|
||||
"pmulhw %%mm4,%%mm0\n\t" \
|
||||
"paddw %%mm3,%%mm5\n\t" \
|
||||
"movq "OC_J(4,_x)",%%mm3\n\t" \
|
||||
"psubw %%mm1,%%mm5\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"psubw %%mm3,%%mm6\n\t" \
|
||||
"movq %%mm6,%%mm0\n\t" \
|
||||
"pmulhw %%mm4,%%mm6\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"paddw %%mm0,%%mm3\n\t" \
|
||||
"paddw %%mm5,%%mm1\n\t" \
|
||||
"pmulhw %%mm3,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm6\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"movq "OC_I(1,_y)",%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm2\n\t" \
|
||||
"paddw %%mm3,%%mm4\n\t" \
|
||||
"psubw %%mm1,%%mm2\n\t" \
|
||||
"#end OC_IDCT_BEGIN\n\t" \
|
||||
|
||||
/*Expands OC_IDCT_BEGIN and then the closing butterflies of the row
   transform.
  Per the annotations below, the results are left in registers (R0 in mm0 and
   R3...R7 in mm3...mm7), except R1, which is stored to OC_I(1,_y).
  No rounding or shifting is applied here.
  38+8=46 cycles.*/
|
||||
#define OC_ROW_IDCT(_y,_x) \
|
||||
"#OC_ROW_IDCT\n" \
|
||||
OC_IDCT_BEGIN(_y,_x) \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r4=E'=E-G*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
/*r7=G+G*/ \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*r1=R1=A''+H'*/ \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
/*r7=G'=E+G*/ \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
/*r6=R6=F'-B''*/ \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*r3=R3=E'+D'*/ \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
/*r5=R5=F'+B''*/ \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*Save R1.*/ \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"#end OC_ROW_IDCT\n\t" \
|
||||
|
||||
/*The following macro does two 4x4 transposes in place.
|
||||
At entry, we assume:
|
||||
r0 = a3 a2 a1 a0
|
||||
I(1) = b3 b2 b1 b0
|
||||
r2 = c3 c2 c1 c0
|
||||
r3 = d3 d2 d1 d0
|
||||
|
||||
r4 = e3 e2 e1 e0
|
||||
r5 = f3 f2 f1 f0
|
||||
r6 = g3 g2 g1 g0
|
||||
r7 = h3 h2 h1 h0
|
||||
|
||||
At exit, we have:
|
||||
I(0) = d0 c0 b0 a0
|
||||
I(1) = d1 c1 b1 a1
|
||||
I(2) = d2 c2 b2 a2
|
||||
I(3) = d3 c3 b3 a3
|
||||
|
||||
J(4) = h0 g0 f0 e0
|
||||
J(5) = h1 g1 f1 e1
|
||||
J(6) = h2 g2 f2 e2
|
||||
J(7) = h3 g3 f3 e3
|
||||
|
||||
I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
|
||||
J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
|
||||
|
||||
Since r1 is free at entry, we calculate the Js first.*/
|
||||
/*mm1 serves as the first temporary, and OC_I(0,_y) temporarily holds mm0
   while the J rows are produced; that slot is reloaded into mm4, and
   OC_I(1,_y) into mm5, before the I rows are produced.
  All eight results are stored back through OC_I()/OC_J(), which must be
   defined at the point of expansion.
  19 cycles.*/
|
||||
#define OC_TRANSPOSE(_y) \
|
||||
"#OC_TRANSPOSE\n\t" \
|
||||
"movq %%mm4,%%mm1\n\t" \
|
||||
"punpcklwd %%mm5,%%mm4\n\t" \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"punpckhwd %%mm5,%%mm1\n\t" \
|
||||
"movq %%mm6,%%mm0\n\t" \
|
||||
"punpcklwd %%mm7,%%mm6\n\t" \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"punpckldq %%mm6,%%mm4\n\t" \
|
||||
"punpckhdq %%mm6,%%mm5\n\t" \
|
||||
"movq %%mm1,%%mm6\n\t" \
|
||||
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||
"punpckhwd %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||
"punpckhdq %%mm0,%%mm6\n\t" \
|
||||
"movq "OC_I(0,_y)",%%mm4\n\t" \
|
||||
"punpckldq %%mm0,%%mm1\n\t" \
|
||||
"movq "OC_I(1,_y)",%%mm5\n\t" \
|
||||
"movq %%mm4,%%mm0\n\t" \
|
||||
"movq %%mm6,"OC_J(7,_y)"\n\t" \
|
||||
"punpcklwd %%mm5,%%mm0\n\t" \
|
||||
"movq %%mm1,"OC_J(6,_y)"\n\t" \
|
||||
"punpckhwd %%mm5,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklwd %%mm3,%%mm2\n\t" \
|
||||
"movq %%mm0,%%mm1\n\t" \
|
||||
"punpckldq %%mm2,%%mm0\n\t" \
|
||||
"punpckhdq %%mm2,%%mm1\n\t" \
|
||||
"movq %%mm4,%%mm2\n\t" \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"punpckhwd %%mm3,%%mm5\n\t" \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
"punpckhdq %%mm5,%%mm4\n\t" \
|
||||
"punpckldq %%mm5,%%mm2\n\t" \
|
||||
"movq %%mm4,"OC_I(3,_y)"\n\t" \
|
||||
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||
"#end OC_TRANSPOSE\n\t" \
|
||||
|
||||
/*Expands OC_IDCT_BEGIN with _y as both input and output, then finishes the
   column transform in place.
  The row of [c] at offset 0x00 is added into every result (presumably the
   rounding constant; its value is defined outside this view) before each
   result is shifted right arithmetically by 4 and stored back through
   OC_I()/OC_J().
  38+19=57 cycles.*/
|
||||
#define OC_COLUMN_IDCT(_y) \
|
||||
"#OC_COLUMN_IDCT\n" \
|
||||
OC_IDCT_BEGIN(_y,_y) \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
/*r1=R1=A''+H'*/ \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
/*r2=NR2*/ \
|
||||
"psraw $4,%%mm2\n\t" \
|
||||
/*r4=E'=E-G*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
/*r1=NR1*/ \
|
||||
"psraw $4,%%mm1\n\t" \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r7=G+G*/ \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*Store NR2 at I(2).*/ \
|
||||
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||
/*r7=G'=E+G*/ \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
/*Store NR1 at I(1).*/ \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
|
||||
/*r3=D'+D'*/ \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
/*r3=R3=E'+D'*/ \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
/*r4=NR4*/ \
|
||||
"psraw $4,%%mm4\n\t" \
|
||||
/*r6=R6=F'-B''*/ \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*r3=NR3*/ \
|
||||
"psraw $4,%%mm3\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
|
||||
/*r5=B''+B''*/ \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*r5=R5=F'+B''*/ \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*r6=NR6*/ \
|
||||
"psraw $4,%%mm6\n\t" \
|
||||
/*Store NR4 at J(4).*/ \
|
||||
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||
/*r5=NR5*/ \
|
||||
"psraw $4,%%mm5\n\t" \
|
||||
/*Store NR3 at I(3).*/ \
|
||||
"movq %%mm3,"OC_I(3,_y)"\n\t" \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
|
||||
/*r0=C'+C'*/ \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
/*r7=NR7*/ \
|
||||
"psraw $4,%%mm7\n\t" \
|
||||
/*Store NR6 at J(6).*/ \
|
||||
"movq %%mm6,"OC_J(6,_y)"\n\t" \
|
||||
/*r0=NR0*/ \
|
||||
"psraw $4,%%mm0\n\t" \
|
||||
/*Store NR5 at J(5).*/ \
|
||||
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||
/*Store NR7 at J(7).*/ \
|
||||
"movq %%mm7,"OC_J(7,_y)"\n\t" \
|
||||
/*Store NR0 at I(0).*/ \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"#end OC_COLUMN_IDCT\n\t" \
|
||||
|
||||
/*Performs the full 8x8 iDCT of _x into _y, then clears _x.
  The first asm statement runs two row passes (each followed by a transpose)
   and two column passes; OC_I and OC_J are redefined before each pass to
   select the rows that pass reads and writes.
  _y: The 64 output values.
  _x: The 64 input coefficients, in the partially transposed layout described
       in the body.
      All 64 values are overwritten with zeros before the function returns
       (four iterations, each storing 32 zero bytes).*/
static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
int i;
|
||||
/*This routine accepts an 8x8 matrix, but in partially transposed form.
|
||||
Every 4x4 block is transposed.*/
|
||||
__asm__ __volatile__(
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
|
||||
OC_ROW_IDCT(y,x)
|
||||
OC_TRANSPOSE(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
|
||||
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
|
||||
OC_ROW_IDCT(y,x)
|
||||
OC_TRANSPOSE(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
|
||||
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
|
||||
);
|
||||
__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
|
||||
for(i=0;i<4;i++){
|
||||
__asm__ __volatile__(
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
|
||||
:[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/*A reduced form of OC_IDCT_BEGIN that reads only OC_I(0,_x)...OC_I(3,_x);
   no OC_J() input is loaded.
  NOTE(review): the _10 suffix presumably refers to blocks with at most 10
   nonzero coefficients; confirm against the caller.
  As in OC_IDCT_BEGIN, OC_I(1,_y) and OC_I(2,_y) are written as spill slots
   and OC_I(1,_y) is reloaded into mm0 near the end.
  25 cycles.*/
|
||||
#define OC_IDCT_BEGIN_10(_y,_x) \
|
||||
"#OC_IDCT_BEGIN_10\n\t" \
|
||||
"movq "OC_I(3,_x)",%%mm2\n\t" \
|
||||
"nop\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
|
||||
"pmulhw %%mm6,%%mm4\n\t" \
|
||||
"movq "OC_I(1,_x)",%%mm3\n\t" \
|
||||
"pmulhw %%mm2,%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
|
||||
"paddw %%mm2,%%mm4\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm2\n\t" \
|
||||
"movq "OC_I(2,_x)",%%mm5\n\t" \
|
||||
"pmulhw %%mm3,%%mm0\n\t" \
|
||||
"movq %%mm5,%%mm1\n\t" \
|
||||
"paddw %%mm3,%%mm0\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
|
||||
"psubw %%mm4,%%mm0\n\t" \
|
||||
"movq "OC_I(2,_x)",%%mm7\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
|
||||
"psubw %%mm6,%%mm3\n\t" \
|
||||
"movq %%mm4,"OC_I(1,_y)"\n\t" \
|
||||
"paddw %%mm6,%%mm6\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||
"paddw %%mm3,%%mm6\n\t" \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"pmulhw %%mm4,%%mm3\n\t" \
|
||||
"movq %%mm6,"OC_I(2,_y)"\n\t" \
|
||||
"movq %%mm0,%%mm2\n\t" \
|
||||
"movq "OC_I(0,_x)",%%mm6\n\t" \
|
||||
"pmulhw %%mm4,%%mm0\n\t" \
|
||||
"paddw %%mm3,%%mm5\n\t" \
|
||||
"paddw %%mm0,%%mm2\n\t" \
|
||||
"psubw %%mm1,%%mm5\n\t" \
|
||||
"pmulhw %%mm4,%%mm6\n\t" \
|
||||
"paddw "OC_I(0,_x)",%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"movq %%mm6,%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm1\n\t" \
|
||||
"psubw %%mm2,%%mm6\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"movq "OC_I(1,_y)",%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm2\n\t" \
|
||||
"psubw %%mm1,%%mm2\n\t" \
|
||||
"nop\n\t" \
|
||||
"#end OC_IDCT_BEGIN_10\n\t" \
|
||||
|
||||
/*25+8=33 cycles.*/
|
||||
#define OC_ROW_IDCT_10(_y,_x) \
|
||||
"#OC_ROW_IDCT_10\n\t" \
|
||||
OC_IDCT_BEGIN_10(_y,_x) \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r4=E'=E-G*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
/*r7=G+G*/ \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*r1=R1=A''+H'*/ \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
/*r7=G'=E+G*/ \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
/*r6=R6=F'-B''*/ \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*r3=R3=E'+D'*/ \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
/*r5=R5=F'+B''*/ \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*Save R1.*/ \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"#end OC_ROW_IDCT_10\n\t" \
|
||||
|
||||
/*25+19=44 cycles.*/
|
||||
#define OC_COLUMN_IDCT_10(_y) \
|
||||
"#OC_COLUMN_IDCT_10\n\t" \
|
||||
OC_IDCT_BEGIN_10(_y,_y) \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
|
||||
/*r1=H'+H'*/ \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
/*r1=R1=A''+H'*/ \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
/*r2=NR2*/ \
|
||||
"psraw $4,%%mm2\n\t" \
|
||||
/*r4=E'=E-G*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
/*r1=NR1*/ \
|
||||
"psraw $4,%%mm1\n\t" \
|
||||
/*r3=D'*/ \
|
||||
"movq "OC_I(2,_y)",%%mm3\n\t" \
|
||||
/*r7=G+G*/ \
|
||||
"paddw %%mm7,%%mm7\n\t" \
|
||||
/*Store NR2 at I(2).*/ \
|
||||
"movq %%mm2,"OC_I(2,_y)"\n\t" \
|
||||
/*r7=G'=E+G*/ \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
/*Store NR1 at I(1).*/ \
|
||||
"movq %%mm1,"OC_I(1,_y)"\n\t" \
|
||||
/*r4=R4=E'-D'*/ \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
|
||||
/*r3=D'+D'*/ \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
/*r3=R3=E'+D'*/ \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
/*r4=NR4*/ \
|
||||
"psraw $4,%%mm4\n\t" \
|
||||
/*r6=R6=F'-B''*/ \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*r3=NR3*/ \
|
||||
"psraw $4,%%mm3\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
|
||||
/*r5=B''+B''*/ \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
/*r5=R5=F'+B''*/ \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*r6=NR6*/ \
|
||||
"psraw $4,%%mm6\n\t" \
|
||||
/*Store NR4 at J(4).*/ \
|
||||
"movq %%mm4,"OC_J(4,_y)"\n\t" \
|
||||
/*r5=NR5*/ \
|
||||
"psraw $4,%%mm5\n\t" \
|
||||
/*Store NR3 at I(3).*/ \
|
||||
"movq %%mm3,"OC_I(3,_y)"\n\t" \
|
||||
/*r7=R7=G'-C'*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
|
||||
/*r0=C'+C'*/ \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
/*r0=R0=G'+C'*/ \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
/*r7=NR7*/ \
|
||||
"psraw $4,%%mm7\n\t" \
|
||||
/*Store NR6 at J(6).*/ \
|
||||
"movq %%mm6,"OC_J(6,_y)"\n\t" \
|
||||
/*r0=NR0*/ \
|
||||
"psraw $4,%%mm0\n\t" \
|
||||
/*Store NR5 at J(5).*/ \
|
||||
"movq %%mm5,"OC_J(5,_y)"\n\t" \
|
||||
/*Store NR7 at J(7).*/ \
|
||||
"movq %%mm7,"OC_J(7,_y)"\n\t" \
|
||||
/*Store NR0 at I(0).*/ \
|
||||
"movq %%mm0,"OC_I(0,_y)"\n\t" \
|
||||
"#end OC_COLUMN_IDCT_10\n\t" \
|
||||
|
||||
/*Reduced MMX inverse DCT, selected by oc_idct8x8_mmx() when _last_zzi<=10.
  A single asm statement runs OC_ROW_IDCT_10 and OC_TRANSPOSE once, then
   OC_COLUMN_IDCT_10 twice (with OC_I/OC_J based at byte offsets 0 and 8).
  The row pass only loads OC_I(0)...OC_I(3) of _x, i.e., 8 bytes at byte
   offsets 0x00, 0x10, 0x20, and 0x30, and exactly those bytes are cleared
   afterwards.
  _y: The output buffer (the "=m" operand of the transform).
  _x: The input coefficients; the entries listed above are zeroed.*/
static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
|
||||
__asm__ __volatile__(
|
||||
/*Row pass: OC_I(k) addresses byte k*16, OC_J(k) byte (k-4)*16+8.*/
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
|
||||
/*Done with dequant, descramble, and partial transpose.
|
||||
Now do the iDCT itself.*/
|
||||
OC_ROW_IDCT_10(y,x)
|
||||
OC_TRANSPOSE(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
/*Column pass 1: OC_I and OC_J both address byte k*16.*/
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT_10(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
/*Column pass 2: the same addressing shifted by 8 bytes.*/
#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
|
||||
#define OC_J(_k,_y) OC_I(_k,_y)
|
||||
OC_COLUMN_IDCT_10(y)
|
||||
#undef OC_I
|
||||
#undef OC_J
|
||||
:[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
|
||||
:[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
|
||||
[c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
|
||||
);
|
||||
/*Zero the four 8-byte groups of _x that the row pass read.
  The operand spans 28 entries (56 bytes), which reaches the end of the last
   store at byte offset 0x30.*/
__asm__ __volatile__(
|
||||
"pxor %%mm0,%%mm0\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
|
||||
"movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
|
||||
:[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
|
||||
);
|
||||
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
decoded.
|
||||
In most cases this is an EOB token (the continuation of an EOB run from a
|
||||
previous block counts), and so this is the same as the coefficient count.
|
||||
However, in the case that the last token was NOT an EOB token, but filled
|
||||
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
|
||||
Provided the last token was not a pure zero run, the minimum value it can
|
||||
be is 46, and so that doesn't affect any of the cases in this routine.
|
||||
However, if the last token WAS a pure zero run of length 63, then _last_zzi
|
||||
will be 1 while the number of coefficients decoded is 64.
|
||||
Thus, we will trigger the following special case, where the real
|
||||
coefficient count would not.
|
||||
Note also that a zero run of length 64 will give _last_zzi a value of 0,
|
||||
but we still process the DC coefficient, which might have a non-zero value
|
||||
due to DC prediction.
|
||||
Although convoluted, this is arguably the correct behavior: it allows us to
|
||||
use a smaller transform when the block ends with a long zero run instead
|
||||
of a normal EOB token.
|
||||
It could be smarter... multiple separate zero runs at the end of a block
|
||||
will fool it, but an encoder that generates these really deserves what it
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Then perform the iDCT.*/
|
||||
if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
|
||||
else oc_idct8x8_slow_mmx(_y,_x);
|
||||
}
|
||||
|
||||
#endif
|
||||
318
engine/thirdparty/libtheora/x86/mmxloop.h
vendored
Normal file
318
engine/thirdparty/libtheora/x86/mmxloop.h
vendored
Normal file
|
|
@ -0,0 +1,318 @@
|
|||
#if !defined(_x86_mmxloop_H)
|
||||
# define _x86_mmxloop_H (1)
|
||||
# include <stddef.h>
|
||||
# include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
|
||||
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
|
||||
#define OC_LOOP_FILTER8_MMX \
|
||||
"#OC_LOOP_FILTER8_MMX\n\t" \
|
||||
/*mm7=0*/ \
|
||||
"pxor %%mm7,%%mm7\n\t" \
|
||||
/*mm6:mm0={a0,...,a7}*/ \
|
||||
"movq %%mm0,%%mm6\n\t" \
|
||||
"punpcklbw %%mm7,%%mm0\n\t" \
|
||||
"punpckhbw %%mm7,%%mm6\n\t" \
|
||||
/*mm3:mm5={d0,...,d7}*/ \
|
||||
"movq %%mm3,%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm3\n\t" \
|
||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||
/*mm6:mm0={a0-d0,...,a7-d7}*/ \
|
||||
"psubw %%mm3,%%mm0\n\t" \
|
||||
"psubw %%mm5,%%mm6\n\t" \
|
||||
/*mm3:mm1={b0,...,b7}*/ \
|
||||
"movq %%mm1,%%mm3\n\t" \
|
||||
"punpcklbw %%mm7,%%mm1\n\t" \
|
||||
"movq %%mm2,%%mm4\n\t" \
|
||||
"punpckhbw %%mm7,%%mm3\n\t" \
|
||||
/*mm5:mm4={c0,...,c7}*/ \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"punpcklbw %%mm7,%%mm4\n\t" \
|
||||
"punpckhbw %%mm7,%%mm5\n\t" \
|
||||
/*mm7={3}x4 \
|
||||
mm5:mm4={c0-b0,...,c7-b7}*/ \
|
||||
"pcmpeqw %%mm7,%%mm7\n\t" \
|
||||
"psubw %%mm1,%%mm4\n\t" \
|
||||
"psrlw $14,%%mm7\n\t" \
|
||||
"psubw %%mm3,%%mm5\n\t" \
|
||||
/*Scale by 3.*/ \
|
||||
"pmullw %%mm7,%%mm4\n\t" \
|
||||
"pmullw %%mm7,%%mm5\n\t" \
|
||||
/*mm7={4}x4 \
|
||||
mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)}*/ \
|
||||
"psrlw $1,%%mm7\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"psllw $2,%%mm7\n\t" \
|
||||
"movq (%[ll]),%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm5\n\t" \
|
||||
/*R_i has the range [-127,128], so we compute -R_i instead. \
|
||||
mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
"psubw %%mm7,%%mm5\n\t" \
|
||||
"psraw $3,%%mm4\n\t" \
|
||||
"psraw $3,%%mm5\n\t" \
|
||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||
"packsswb %%mm5,%%mm4\n\t" \
|
||||
"pxor %%mm6,%%mm6\n\t" \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"packuswb %%mm3,%%mm1\n\t" \
|
||||
/*Now compute lflim of -mm4 cf. Section 7.10 of the spec.*/ \
|
||||
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||
we have to split things by sign (the other option is to work in 16 bits, \
|
||||
but working in 8 bits gives much better parallelism). \
|
||||
We compute abs(R_i), but save a mask of which terms were negative in mm6. \
|
||||
Then we compute mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
|
||||
Finally, we split mm4 into positive and negative pieces using the mask in \
|
||||
mm6, and add and subtract them as appropriate.*/ \
|
||||
/*mm4=abs(-R_i)*/ \
|
||||
/*mm7=255-2*L*/ \
|
||||
"pcmpgtb %%mm4,%%mm6\n\t" \
|
||||
"psubb %%mm0,%%mm7\n\t" \
|
||||
"pxor %%mm6,%%mm4\n\t" \
|
||||
"psubb %%mm0,%%mm7\n\t" \
|
||||
"psubb %%mm6,%%mm4\n\t" \
|
||||
/*mm7=255-max(2*L-abs(R_i),0)*/ \
|
||||
"paddusb %%mm4,%%mm7\n\t" \
|
||||
/*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
|
||||
"paddusb %%mm7,%%mm4\n\t" \
|
||||
"psubusb %%mm7,%%mm4\n\t" \
|
||||
/*Now split mm4 by the original sign of -R_i.*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"pand %%mm6,%%mm4\n\t" \
|
||||
"pandn %%mm5,%%mm6\n\t" \
|
||||
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||
"paddusb %%mm4,%%mm1\n\t" \
|
||||
"psubusb %%mm4,%%mm2\n\t" \
|
||||
"psubusb %%mm6,%%mm1\n\t" \
|
||||
"paddusb %%mm6,%%mm2\n\t" \
|
||||
|
||||
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
|
||||
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}.
|
||||
All other MMX registers are clobbered.*/
|
||||
#define OC_LOOP_FILTER8_MMXEXT \
|
||||
"#OC_LOOP_FILTER8_MMXEXT\n\t" \
|
||||
/*R_i=(a_i-3*b_i+3*c_i-d_i+4>>3) has the range [-127,128], so we compute \
|
||||
-R_i=(-a_i+3*b_i-3*c_i+d_i+3>>3) instead.*/ \
|
||||
/*This first part is based on the transformation \
|
||||
f = -(3*(c-b)+a-d+4>>3) \
|
||||
= -(3*(c+255-b)+(a+255-d)+4-1020>>3) \
|
||||
= -(3*(c+~b)+(a+~d)-1016>>3) \
|
||||
= 127-(3*(c+~b)+(a+~d)>>3) \
|
||||
= 128+~(3*(c+~b)+(a+~d)>>3) (mod 256). \
|
||||
Although pavgb(a,b) = (a+b+1>>1) (biased up), we rely heavily on the \
|
||||
fact that ~pavgb(~a,~b) = (a+b>>1) (biased down). \
|
||||
Using this, the last expression above can be computed in 8 bits of working \
|
||||
precision via: \
|
||||
u = ~pavgb(~b,c); \
|
||||
v = pavgb(b,~c); \
|
||||
This mask is 0 or 0xFF, and controls whether t is biased up or down: \
|
||||
m = u-v; \
|
||||
t = m^pavgb(m^~a,m^d); \
|
||||
f = 128+pavgb(pavgb(t,u),v); \
|
||||
This required some careful analysis to ensure that carries are propagated \
|
||||
correctly in all cases, but has been checked exhaustively.*/ \
|
||||
/*input (a, b, c, d, ., ., ., .)*/ \
|
||||
/*ff=0xFF; \
|
||||
u=b; \
|
||||
v=c; \
|
||||
ll=255-2*L;*/ \
|
||||
"pcmpeqb %%mm7,%%mm7\n\t" \
|
||||
"movq %%mm1,%%mm4\n\t" \
|
||||
"movq %%mm2,%%mm5\n\t" \
|
||||
"movq (%[ll]),%%mm6\n\t" \
|
||||
/*allocated u, v, ll, ff: (a, b, c, d, u, v, ll, ff)*/ \
|
||||
/*u^=ff; \
|
||||
v^=ff;*/ \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"pxor %%mm7,%%mm5\n\t" \
|
||||
/*allocated ll: (a, b, c, d, u, v, ll, ff)*/ \
|
||||
/*u=pavgb(u,c); \
|
||||
v=pavgb(v,b);*/ \
|
||||
"pavgb %%mm2,%%mm4\n\t" \
|
||||
"pavgb %%mm1,%%mm5\n\t" \
|
||||
/*u^=ff; \
|
||||
a^=ff;*/ \
|
||||
"pxor %%mm7,%%mm4\n\t" \
|
||||
"pxor %%mm7,%%mm0\n\t" \
|
||||
/*m=u-v;*/ \
|
||||
"psubb %%mm5,%%mm4\n\t" \
|
||||
/*freed u, allocated m: (a, b, c, d, m, v, ll, ff)*/ \
|
||||
/*a^=m; \
|
||||
d^=m;*/ \
|
||||
"pxor %%mm4,%%mm0\n\t" \
|
||||
"pxor %%mm4,%%mm3\n\t" \
|
||||
/*t=pavgb(a,d);*/ \
|
||||
"pavgb %%mm3,%%mm0\n\t" \
|
||||
"psllw $7,%%mm7\n\t" \
|
||||
/*freed a, d, ff, allocated t, of: (t, b, c, ., m, v, ll, of)*/ \
|
||||
/*t^=m; \
|
||||
u=m+v;*/ \
|
||||
"pxor %%mm4,%%mm0\n\t" \
|
||||
"paddb %%mm5,%%mm4\n\t" \
|
||||
/*freed t, m, allocated f, u: (f, b, c, ., u, v, ll, of)*/ \
|
||||
/*f=pavgb(f,u); \
|
||||
of=128;*/ \
|
||||
"pavgb %%mm4,%%mm0\n\t" \
|
||||
"packsswb %%mm7,%%mm7\n\t" \
|
||||
/*freed u, ff, allocated ll: (f, b, c, ., ll, v, ll, of)*/ \
|
||||
/*f=pavgb(f,v);*/ \
|
||||
"pavgb %%mm5,%%mm0\n\t" \
|
||||
"movq %%mm7,%%mm3\n\t" \
|
||||
"movq %%mm6,%%mm4\n\t" \
|
||||
/*freed v, allocated of: (f, b, c, of, ll, ., ll, of)*/ \
|
||||
/*Now compute lflim of R_i=-(128+mm0) cf. Section 7.10 of the spec.*/ \
|
||||
/*There's no unsigned byte+signed byte with unsigned saturation op code, so \
|
||||
we have to split things by sign (the other option is to work in 16 bits, \
|
||||
but staying in 8 bits gives much better parallelism).*/ \
|
||||
/*Instead of adding the offset of 128 in mm3, we use it to split mm0. \
|
||||
This is the same number of instructions as computing a mask and splitting \
|
||||
after the lflim computation, but has shorter dependency chains.*/ \
|
||||
/*mm0=R_i<0?-R_i:0 (denoted abs(R_i<0))\
|
||||
mm3=R_i>0?R_i:0 (denoted abs(R_i>0))*/ \
|
||||
"psubusb %%mm0,%%mm3\n\t" \
|
||||
"psubusb %%mm7,%%mm0\n\t" \
|
||||
/*mm6=255-max(2*L-abs(R_i<0),0) \
|
||||
mm4=255-max(2*L-abs(R_i>0),0)*/ \
|
||||
"paddusb %%mm3,%%mm4\n\t" \
|
||||
"paddusb %%mm0,%%mm6\n\t" \
|
||||
/*mm0=min(abs(R_i<0),max(2*L-abs(R_i<0),0)) \
|
||||
mm3=min(abs(R_i>0),max(2*L-abs(R_i>0),0))*/ \
|
||||
"paddusb %%mm4,%%mm3\n\t" \
|
||||
"paddusb %%mm6,%%mm0\n\t" \
|
||||
"psubusb %%mm4,%%mm3\n\t" \
|
||||
"psubusb %%mm6,%%mm0\n\t" \
|
||||
/*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)}*/ \
|
||||
/*mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
|
||||
"paddusb %%mm3,%%mm1\n\t" \
|
||||
"psubusb %%mm3,%%mm2\n\t" \
|
||||
"psubusb %%mm0,%%mm1\n\t" \
|
||||
"paddusb %%mm0,%%mm2\n\t" \
|
||||
|
||||
#define OC_LOOP_FILTER_V(_filter,_pix,_ystride,_ll) \
|
||||
do{ \
|
||||
ptrdiff_t ystride3__; \
|
||||
__asm__ __volatile__( \
|
||||
/*mm0={a0,...,a7}*/ \
|
||||
"movq (%[pix]),%%mm0\n\t" \
|
||||
/*ystride3=_ystride*3*/ \
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
||||
/*mm3={d0,...,d7}*/ \
|
||||
"movq (%[pix],%[ystride3]),%%mm3\n\t" \
|
||||
/*mm1={b0,...,b7}*/ \
|
||||
"movq (%[pix],%[ystride]),%%mm1\n\t" \
|
||||
/*mm2={c0,...,c7}*/ \
|
||||
"movq (%[pix],%[ystride],2),%%mm2\n\t" \
|
||||
_filter \
|
||||
/*Write it back out.*/ \
|
||||
"movq %%mm1,(%[pix],%[ystride])\n\t" \
|
||||
"movq %%mm2,(%[pix],%[ystride],2)\n\t" \
|
||||
:[ystride3]"=&r"(ystride3__) \
|
||||
:[pix]"r"(_pix-_ystride*2),[ystride]"r"((ptrdiff_t)(_ystride)), \
|
||||
[ll]"r"(_ll) \
|
||||
:"memory" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
#define OC_LOOP_FILTER_H(_filter,_pix,_ystride,_ll) \
|
||||
do{ \
|
||||
unsigned char *pix__; \
|
||||
ptrdiff_t ystride3__; \
|
||||
ptrdiff_t d__; \
|
||||
pix__=(_pix)-2; \
|
||||
__asm__ __volatile__( \
|
||||
/*x x x x d0 c0 b0 a0*/ \
|
||||
"movd (%[pix]),%%mm0\n\t" \
|
||||
/*x x x x d1 c1 b1 a1*/ \
|
||||
"movd (%[pix],%[ystride]),%%mm1\n\t" \
|
||||
/*ystride3=_ystride*3*/ \
|
||||
"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
|
||||
/*x x x x d2 c2 b2 a2*/ \
|
||||
"movd (%[pix],%[ystride],2),%%mm2\n\t" \
|
||||
/*x x x x d3 c3 b3 a3*/ \
|
||||
"lea (%[pix],%[ystride],4),%[d]\n\t" \
|
||||
"movd (%[pix],%[ystride3]),%%mm3\n\t" \
|
||||
/*x x x x d4 c4 b4 a4*/ \
|
||||
"movd (%[d]),%%mm4\n\t" \
|
||||
/*x x x x d5 c5 b5 a5*/ \
|
||||
"movd (%[d],%[ystride]),%%mm5\n\t" \
|
||||
/*x x x x d6 c6 b6 a6*/ \
|
||||
"movd (%[d],%[ystride],2),%%mm6\n\t" \
|
||||
/*x x x x d7 c7 b7 a7*/ \
|
||||
"movd (%[d],%[ystride3]),%%mm7\n\t" \
|
||||
/*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
|
||||
"punpcklbw %%mm1,%%mm0\n\t" \
|
||||
/*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
|
||||
"punpcklbw %%mm3,%%mm2\n\t" \
|
||||
/*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
|
||||
"movq %%mm0,%%mm3\n\t" \
|
||||
/*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
|
||||
"punpcklwd %%mm2,%%mm0\n\t" \
|
||||
/*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
|
||||
"punpckhwd %%mm2,%%mm3\n\t" \
|
||||
/*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
|
||||
"movq %%mm0,%%mm1\n\t" \
|
||||
/*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
|
||||
"punpcklbw %%mm5,%%mm4\n\t" \
|
||||
/*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
|
||||
"punpcklbw %%mm7,%%mm6\n\t" \
|
||||
/*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
/*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
|
||||
"punpcklwd %%mm6,%%mm4\n\t" \
|
||||
/*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" \
|
||||
/*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
|
||||
"movq %%mm3,%%mm2\n\t" \
|
||||
/*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||
"punpckldq %%mm4,%%mm0\n\t" \
|
||||
/*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
|
||||
"punpckhdq %%mm4,%%mm1\n\t" \
|
||||
/*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
|
||||
"punpckldq %%mm5,%%mm2\n\t" \
|
||||
/*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
|
||||
"punpckhdq %%mm5,%%mm3\n\t" \
|
||||
_filter \
|
||||
/*mm0={b0+R_0'',...,b7+R_7''}*/ \
|
||||
"movq %%mm1,%%mm0\n\t" \
|
||||
/*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
|
||||
"punpcklbw %%mm2,%%mm1\n\t" \
|
||||
/*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
|
||||
"punpckhbw %%mm2,%%mm0\n\t" \
|
||||
/*[d]=c1 b1 c0 b0*/ \
|
||||
"movd %%mm1,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix])\n\t" \
|
||||
"psrlq $32,%%mm1\n\t" \
|
||||
"shr $16,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix],%[ystride])\n\t" \
|
||||
/*[d]=c3 b3 c2 b2*/ \
|
||||
"movd %%mm1,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix],%[ystride],2)\n\t" \
|
||||
"shr $16,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix],%[ystride3])\n\t" \
|
||||
"lea (%[pix],%[ystride],4),%[pix]\n\t" \
|
||||
/*[d]=c5 b5 c4 b4*/ \
|
||||
"movd %%mm0,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix])\n\t" \
|
||||
"psrlq $32,%%mm0\n\t" \
|
||||
"shr $16,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix],%[ystride])\n\t" \
|
||||
/*[d]=c7 b7 c6 b6*/ \
|
||||
"movd %%mm0,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix],%[ystride],2)\n\t" \
|
||||
"shr $16,%[d]\n\t" \
|
||||
"movw %w[d],1(%[pix],%[ystride3])\n\t" \
|
||||
:[pix]"+r"(pix__),[ystride3]"=&r"(ystride3__),[d]"=&r"(d__) \
|
||||
:[ystride]"r"((ptrdiff_t)(_ystride)),[ll]"r"(_ll) \
|
||||
:"memory" \
|
||||
); \
|
||||
} \
|
||||
while(0)
|
||||
|
||||
# endif
|
||||
#endif
|
||||
226
engine/thirdparty/libtheora/x86/mmxstate.c
vendored
Normal file
226
engine/thirdparty/libtheora/x86/mmxstate.c
vendored
Normal file
|
|
@ -0,0 +1,226 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*MMX acceleration of complete fragment reconstruction algorithm.
|
||||
Originally written by Rudolf Marek.*/
|
||||
#include <string.h>
|
||||
#include "x86int.h"
|
||||
#include "mmxloop.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Reconstructs one fragment.
  The residual is first produced in _dct_coeffs[64...127], either by
   replicating the rounded, dequantized DC value (when _last_zzi<2) or by
   dequantizing the DC coefficient and running oc_idct8x8().
  It is then passed to the intra or inter reconstruction routine chosen by
   the fragment's reference frame index.
  _state:      The decoder/encoder state holding the frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane the fragment belongs to.
  _dct_coeffs: Coefficients in the first 64 entries, residual workspace in
                the last 64.
  _last_zzi:   The zig-zag index passed on to oc_idct8x8().
  _dc_quant:   The quantizer applied to the DC coefficient.*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  const unsigned char *ref;
  unsigned char       *dst;
  ptrdiff_t            frag_buf_off;
  int                  mvoffsets[2];
  int                  ystride;
  int                  refi;
  /*Apply the inverse transform.*/
  if(_last_zzi>=2){
    /*Dequantize the DC coefficient, then run the iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  else{
    /*Only a DC component is present.
      The value is held in an unsigned type so that the __asm__ block does not
       sign-extend it when it is placed in a register.*/
    ogg_uint16_t dc;
    int          blk;
    /*This dequant product is rounded (unlike the others) because no iDCT
       rounding will be applied to it.*/
    dc=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
    /*Broadcast the value to all four words of mm0.*/
    __asm__ __volatile__(
      /*mm0=0000 0000 0000 AAAA*/
      "movd %[p],%%mm0\n\t"
      /*mm0=0000 0000 AAAA AAAA*/
      "punpcklwd %%mm0,%%mm0\n\t"
      /*mm0=AAAA AAAA AAAA AAAA*/
      "punpckldq %%mm0,%%mm0\n\t"
      :
      :[p]"r"((unsigned)dc)
    );
    /*Fill the residual half of _dct_coeffs, 16 entries per iteration.*/
    for(blk=0;blk<4;blk++){
      __asm__ __volatile__(
        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*blk,16)
      );
    }
  }
  /*Fill in the target buffer.*/
  frag_buf_off=_state->frag_buf_offs[_fragi];
  refi=_state->frags[_fragi].refi;
  ystride=_state->ref_ystride[_pli];
  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  if(refi==OC_FRAME_SELF){
    oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
    return;
  }
  ref=_state->ref_frame_data[refi]+frag_buf_off;
  /*More than one motion vector offset selects the two-reference routine.*/
  if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
   _state->frag_mvs[_fragi])>1){
    oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
     _dct_coeffs+64);
  }
  else{
    oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
  }
}
|
||||
|
||||
/*We copy this entire function to inline the actual MMX routines so that we
|
||||
use only a single indirect call.*/
|
||||
|
||||
/*Sets the first 8 bytes of _bv to _flimit (converted to a byte).
  The remaining entries of _bv are left untouched.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  unsigned char *bytes;
  unsigned char  fill;
  int            bi;
  bytes=(unsigned char *)_bv;
  fill=(unsigned char)_flimit;
  for(bi=0;bi<8;bi++)bytes[bi]=fill;
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
The filter may be run on the bottom edge, affecting pixels in the next row of
|
||||
fragments, so this row also needs to be available.
|
||||
_bv: The bounding values array.
|
||||
_refi: The index of the frame buffer to filter.
|
||||
_pli: The color plane to filter.
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
OC_ALIGN8(unsigned char ll[8]);
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
unsigned char *ref_frame_data;
|
||||
ptrdiff_t fragi_top;
|
||||
ptrdiff_t fragi_bot;
|
||||
ptrdiff_t fragi0;
|
||||
ptrdiff_t fragi0_end;
|
||||
int ystride;
|
||||
int nhfrags;
|
||||
memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
|
||||
fplane=_state->fplanes+_pli;
|
||||
nhfrags=fplane->nhfrags;
|
||||
fragi_top=fplane->froffset;
|
||||
fragi_bot=fragi_top+fplane->nfrags;
|
||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frags=_state->frags;
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
ref_frame_data=_state->ref_frame_data[_refi];
|
||||
/*The following loops are constructed somewhat non-intuitively on purpose.
|
||||
The main idea is: if a block boundary has at least one coded fragment on
|
||||
it, the filter is applied to it.
|
||||
However, the order that the filters are applied in matters, and VP3 chose
|
||||
the somewhat strange ordering used below.*/
|
||||
while(fragi0<fragi0_end){
|
||||
ptrdiff_t fragi;
|
||||
ptrdiff_t fragi_end;
|
||||
fragi=fragi0;
|
||||
fragi_end=fragi+nhfrags;
|
||||
while(fragi<fragi_end){
|
||||
if(frags[fragi].coded){
|
||||
unsigned char *ref;
|
||||
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||
if(fragi>fragi0){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
|
||||
}
|
||||
if(fragi0>fragi_top){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
|
||||
}
|
||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
|
||||
}
|
||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
|
||||
}
|
||||
}
|
||||
fragi++;
|
||||
}
|
||||
fragi0+=nhfrags;
|
||||
}
|
||||
}
|
||||
|
||||
/*Sets the first 8 bytes of _bv to ~(2*_flimit) (converted to a byte), i.e.,
   255-2*_flimit for limits that fit.
  The remaining entries of _bv are left untouched.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
  unsigned char *bytes;
  unsigned char  fill;
  int            bi;
  bytes=(unsigned char *)_bv;
  fill=(unsigned char)~(_flimit<<1);
  for(bi=0;bi<8;bi++)bytes[bi]=fill;
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
The filter may be run on the bottom edge, affecting pixels in the next row of
|
||||
fragments, so this row also needs to be available.
|
||||
_bv: The bounding values array.
|
||||
_refi: The index of the frame buffer to filter.
|
||||
_pli: The color plane to filter.
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
unsigned char *ref_frame_data;
|
||||
ptrdiff_t fragi_top;
|
||||
ptrdiff_t fragi_bot;
|
||||
ptrdiff_t fragi0;
|
||||
ptrdiff_t fragi0_end;
|
||||
int ystride;
|
||||
int nhfrags;
|
||||
fplane=_state->fplanes+_pli;
|
||||
nhfrags=fplane->nhfrags;
|
||||
fragi_top=fplane->froffset;
|
||||
fragi_bot=fragi_top+fplane->nfrags;
|
||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frags=_state->frags;
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
ref_frame_data=_state->ref_frame_data[_refi];
|
||||
/*The following loops are constructed somewhat non-intuitively on purpose.
|
||||
The main idea is: if a block boundary has at least one coded fragment on
|
||||
it, the filter is applied to it.
|
||||
However, the order that the filters are applied in matters, and VP3 chose
|
||||
the somewhat strange ordering used below.*/
|
||||
while(fragi0<fragi0_end){
|
||||
ptrdiff_t fragi;
|
||||
ptrdiff_t fragi_end;
|
||||
fragi=fragi0;
|
||||
fragi_end=fragi+nhfrags;
|
||||
while(fragi<fragi_end){
|
||||
if(frags[fragi].coded){
|
||||
unsigned char *ref;
|
||||
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||
if(fragi>fragi0){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
|
||||
}
|
||||
if(fragi0>fragi_top){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
|
||||
}
|
||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||
OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
|
||||
}
|
||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||
OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
|
||||
}
|
||||
}
|
||||
fragi++;
|
||||
}
|
||||
fragi0+=nhfrags;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
501
engine/thirdparty/libtheora/x86/sse2encfrag.c
vendored
Normal file
501
engine/thirdparty/libtheora/x86/sse2encfrag.c
vendored
Normal file
|
|
@ -0,0 +1,501 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
|
||||
|
||||
********************************************************************/
|
||||
#include <stddef.h>
|
||||
#include "x86enc.h"
|
||||
#include "sse2trans.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Load a 4x8 array of pixel values from %[src] and %[ref] and compute their
   16-bit differences.
  On output, these are stored in _m0, xmm1, xmm2, and xmm3.
  xmm4 and xmm5 are clobbered.
  %[src] and %[ref] are each advanced by four rows, so the enclosing asm
   statement must declare both as read-write operands.*/
#define OC_LOAD_SUB_4x8(_m0) \
 "#OC_LOAD_SUB_4x8\n\t" \
 /*Load the first three rows.*/ \
 "movq (%[src]),"_m0"\n\t" \
 "movq (%[ref]),%%xmm4\n\t" \
 "movq (%[src],%[ystride]),%%xmm1\n\t" \
 "movq (%[ref],%[ystride]),%%xmm3\n\t" \
 "movq (%[src],%[ystride],2),%%xmm2\n\t" \
 "movq (%[ref],%[ystride],2),%%xmm5\n\t" \
 /*Unpack and subtract.*/ \
 "punpcklbw %%xmm4,"_m0"\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "punpcklbw %%xmm3,%%xmm1\n\t" \
 "punpcklbw %%xmm3,%%xmm3\n\t" \
 "psubw %%xmm4,"_m0"\n\t" \
 "psubw %%xmm3,%%xmm1\n\t" \
 /*Load the last row.*/ \
 "movq (%[src],%[ystride3]),%%xmm3\n\t" \
 "movq (%[ref],%[ystride3]),%%xmm4\n\t" \
 /*Unpack, subtract, and advance the pointers.*/ \
 "punpcklbw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm5,%%xmm5\n\t" \
 "lea (%[src],%[ystride],4),%[src]\n\t" \
 "psubw %%xmm5,%%xmm2\n\t" \
 "punpcklbw %%xmm4,%%xmm3\n\t" \
 "punpcklbw %%xmm4,%%xmm4\n\t" \
 "lea (%[ref],%[ystride],4),%[ref]\n\t" \
 "psubw %%xmm4,%%xmm3\n\t" \

/*Square and accumulate four rows of differences in _m0, xmm1, xmm2, and xmm3.
  On output, _m0 contains the sum of the first two rows (_m0 and xmm1), and
   the other two (xmm2 and xmm3) are added to xmm7.*/
#define OC_SSD_4x8(_m0) \
 "pmaddwd "_m0","_m0"\n\t" \
 "pmaddwd %%xmm1,%%xmm1\n\t" \
 "pmaddwd %%xmm2,%%xmm2\n\t" \
 "pmaddwd %%xmm3,%%xmm3\n\t" \
 "paddd %%xmm1,"_m0"\n\t" \
 "paddd %%xmm3,%%xmm2\n\t" \
 "paddd %%xmm2,%%xmm7\n\t" \

/*Computes the sum of squared differences between two 8x8 blocks of pixels.
  _src:     The first pixel of the source block.
  _ref:     The first pixel of the reference block.
  _ystride: The offset in bytes between rows; used for both blocks.
  Return: The sum over all 64 pixels of (src-ref)**2.*/
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned ret;
  __asm__ __volatile__(
    /*Rows 0...3: the first SSD pass initializes the accumulator in xmm7.*/
    OC_LOAD_SUB_4x8("%%xmm7")
    OC_SSD_4x8("%%xmm7")
    /*Rows 4...7.*/
    OC_LOAD_SUB_4x8("%%xmm0")
    OC_SSD_4x8("%%xmm0")
    "paddd %%xmm0,%%xmm7\n\t"
    /*Horizontally add the four 32-bit partial sums.*/
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    /*OC_LOAD_SUB_4x8 advances %[src] and %[ref], so they must be read-write
       operands: input-only operands may not be modified by the asm.*/
    :[ret]"=a"(ret),[src]"+r"(_src),[ref]"+r"(_ref)
    :[ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    /*The pixels are read through the pointers rather than through memory
       operands, so tell the compiler that memory is accessed.*/
    :"memory"
  );
  return ret;
}
|
||||
|
||||
/*One single-bit selector per byte: entry i is 1<<i.
  oc_enc_frag_border_ssd_sse2() loads these 8 bytes into xmm6, where
   OC_LOAD_SUB_MASK_2x8 ANDs them with a broadcast mask byte and compares the
   result against them to expand each mask bit into a full byte.*/
static const unsigned char __attribute__((aligned(16))) OC_MASK_CONSTS[8]={
|
||||
0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80
|
||||
};
|
||||
|
||||
/*Load a 2x8 array of pixels values from %[src] and %[ref] and compute their
|
||||
16-bit differences subject to a mask.
|
||||
%%xmm6 must contain OC_MASK_CONSTS[0...7]; the differences are left in %%xmm0 and %%xmm1, while %%xmm2...%%xmm4 and %[m] are clobbered.*/
|
||||
#define OC_LOAD_SUB_MASK_2x8 \
|
||||
"#OC_LOAD_SUB_MASK_2x8\n\t" \
|
||||
/*Start the loads and expand the next 8 bits of the mask.*/ \
|
||||
"shl $8,%[m]\n\t" \
|
||||
"movq (%[src]),%%xmm0\n\t" \
|
||||
"mov %h[m],%b[m]\n\t" \
|
||||
"movq (%[ref]),%%xmm2\n\t" \
|
||||
"movd %[m],%%xmm4\n\t" \
|
||||
"shr $8,%[m]\n\t" \
|
||||
"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
|
||||
"mov %h[m],%b[m]\n\t" \
|
||||
"pand %%xmm6,%%xmm4\n\t" \
|
||||
"pcmpeqb %%xmm6,%%xmm4\n\t" \
|
||||
/*Perform the masking.*/ \
|
||||
"pand %%xmm4,%%xmm0\n\t" \
|
||||
"pand %%xmm4,%%xmm2\n\t" \
|
||||
/*Finish the loads while unpacking the first set of rows, and expand the next
|
||||
8 bits of the mask.*/ \
|
||||
"movd %[m],%%xmm4\n\t" \
|
||||
"movq (%[src],%[ystride]),%%xmm1\n\t" \
|
||||
"pshuflw $0x00,%%xmm4,%%xmm4\n\t" \
|
||||
"movq (%[ref],%[ystride]),%%xmm3\n\t" \
|
||||
"pand %%xmm6,%%xmm4\n\t" \
|
||||
"punpcklbw %%xmm2,%%xmm0\n\t" \
|
||||
"pcmpeqb %%xmm6,%%xmm4\n\t" \
|
||||
"punpcklbw %%xmm2,%%xmm2\n\t" \
|
||||
/*Mask and unpack the second set of rows.*/ \
|
||||
"pand %%xmm4,%%xmm1\n\t" \
|
||||
"pand %%xmm4,%%xmm3\n\t" \
|
||||
"punpcklbw %%xmm3,%%xmm1\n\t" \
|
||||
"punpcklbw %%xmm3,%%xmm3\n\t" \
|
||||
"psubw %%xmm2,%%xmm0\n\t" \
|
||||
"psubw %%xmm3,%%xmm1\n\t" \
|
||||
|
||||
/*Computes the sum of squared differences between two 8x8 blocks of pixels,
   restricted to the pixels selected by a bit mask.
  The mask is consumed 16 bits (two rows) at a time, starting from the least
   significant bits; within each group the low byte applies to the first row
   and bit k of a byte applies to pixel k of its row.
  A set bit keeps the pixel, a clear bit zeroes it in both blocks.
  Row pairs whose 16 mask bits are all clear are skipped entirely.
  The three asm statements communicate through xmm6 (the bit selectors) and
   xmm7 (the running dword sums), which must survive between them.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows (the same for both blocks).
  _mask:    The 64-bit pixel selection mask.
  Return: The sum of the squared differences of the selected pixels.*/
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,ogg_int64_t _mask){
  ptrdiff_t row_step;
  unsigned  ssd;
  int       pairs_left;
  row_step=_ystride;
  /*Clear the accumulator and load the per-byte bit selectors.*/
  __asm__ __volatile__(
    "pxor %%xmm7,%%xmm7\n\t"
    "movq %[c],%%xmm6\n\t"
    :
    :[c]"m"(OC_CONST_ARRAY_OPERAND(unsigned char,OC_MASK_CONSTS,8))
  );
  for(pairs_left=4;pairs_left>0;pairs_left--){
    unsigned bits;
    /*Peel off the mask bits for the next two rows.*/
    bits=_mask&0xFFFF;
    _mask>>=16;
    if(bits!=0){
      __asm__ __volatile__(
        OC_LOAD_SUB_MASK_2x8
        "pmaddwd %%xmm0,%%xmm0\n\t"
        "pmaddwd %%xmm1,%%xmm1\n\t"
        "paddd %%xmm0,%%xmm7\n\t"
        "paddd %%xmm1,%%xmm7\n\t"
        :[src]"+r"(_src),[ref]"+r"(_ref),[ystride]"+r"(row_step),[m]"+Q"(bits)
      );
    }
    /*The macro does not advance the pointers, so step over both rows here.*/
    _src+=2*row_step;
    _ref+=2*row_step;
  }
  /*Fold the four dword partial sums in xmm7 down to a single dword.*/
  __asm__ __volatile__(
    "movdqa %%xmm7,%%xmm6\n\t"
    "punpckhqdq %%xmm7,%%xmm7\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "pshufd $1,%%xmm7,%%xmm6\n\t"
    "paddd %%xmm6,%%xmm7\n\t"
    "movd %%xmm7,%[ret]\n\t"
    :[ret]"=a"(ssd)
  );
  return ssd;
}
|
||||
|
||||
|
||||
/*Load an 8x8 array of pixel values from %[src] and %[ref] and compute their
|
||||
16-bit difference in %%xmm0...%%xmm7.*/
|
||||
#define OC_LOAD_SUB_8x8 \
|
||||
"#OC_LOAD_SUB_8x8\n\t" \
|
||||
"movq (%[src]),%%xmm0\n\t" \
|
||||
"movq (%[ref]),%%xmm4\n\t" \
|
||||
"movq (%[src],%[src_ystride]),%%xmm1\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"movq (%[ref],%[ref_ystride]),%%xmm5\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"movq (%[src]),%%xmm2\n\t" \
|
||||
"movq (%[ref]),%%xmm7\n\t" \
|
||||
"movq (%[src],%[src_ystride]),%%xmm3\n\t" \
|
||||
"movq (%[ref],%[ref_ystride]),%%xmm6\n\t" \
|
||||
"punpcklbw %%xmm4,%%xmm0\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"punpcklbw %%xmm4,%%xmm4\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"psubw %%xmm4,%%xmm0\n\t" \
|
||||
"movq (%[src]),%%xmm4\n\t" \
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
"movq (%[ref]),%%xmm0\n\t" \
|
||||
"punpcklbw %%xmm5,%%xmm1\n\t" \
|
||||
"punpcklbw %%xmm5,%%xmm5\n\t" \
|
||||
"psubw %%xmm5,%%xmm1\n\t" \
|
||||
"movq (%[src],%[src_ystride]),%%xmm5\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm2\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm7\n\t" \
|
||||
"psubw %%xmm7,%%xmm2\n\t" \
|
||||
"movq (%[ref],%[ref_ystride]),%%xmm7\n\t" \
|
||||
"punpcklbw %%xmm6,%%xmm3\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"punpcklbw %%xmm6,%%xmm6\n\t" \
|
||||
"psubw %%xmm6,%%xmm3\n\t" \
|
||||
"movq (%[src]),%%xmm6\n\t" \
|
||||
"punpcklbw %%xmm0,%%xmm4\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"punpcklbw %%xmm0,%%xmm0\n\t" \
|
||||
"lea (%[src],%[src_ystride],2),%[src]\n\t" \
|
||||
"psubw %%xmm0,%%xmm4\n\t" \
|
||||
"movq (%[ref]),%%xmm0\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm5\n\t" \
|
||||
"neg %[src_ystride]\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm7\n\t" \
|
||||
"psubw %%xmm7,%%xmm5\n\t" \
|
||||
"movq (%[src],%[src_ystride]),%%xmm7\n\t" \
|
||||
"punpcklbw %%xmm0,%%xmm6\n\t" \
|
||||
"lea (%[ref],%[ref_ystride],2),%[ref]\n\t" \
|
||||
"punpcklbw %%xmm0,%%xmm0\n\t" \
|
||||
"neg %[ref_ystride]\n\t" \
|
||||
"psubw %%xmm0,%%xmm6\n\t" \
|
||||
"movq (%[ref],%[ref_ystride]),%%xmm0\n\t" \
|
||||
"punpcklbw %%xmm0,%%xmm7\n\t" \
|
||||
"punpcklbw %%xmm0,%%xmm0\n\t" \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
|
||||
|
||||
/*Load an 8x8 array of pixel values from %[src] into %%xmm0...%%xmm7.*/
|
||||
#define OC_LOAD_8x8 \
|
||||
"#OC_LOAD_8x8\n\t" \
|
||||
"movq (%[src]),%%xmm0\n\t" \
|
||||
"movq (%[src],%[ystride]),%%xmm1\n\t" \
|
||||
"movq (%[src],%[ystride],2),%%xmm2\n\t" \
|
||||
"pxor %%xmm7,%%xmm7\n\t" \
|
||||
"movq (%[src],%[ystride3]),%%xmm3\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm0\n\t" \
|
||||
"movq (%[src4]),%%xmm4\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm1\n\t" \
|
||||
"movq (%[src4],%[ystride]),%%xmm5\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm2\n\t" \
|
||||
"movq (%[src4],%[ystride],2),%%xmm6\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm3\n\t" \
|
||||
"movq (%[src4],%[ystride3]),%%xmm7\n\t" \
|
||||
"punpcklbw %%xmm4,%%xmm4\n\t" \
|
||||
"punpcklbw %%xmm5,%%xmm5\n\t" \
|
||||
"psrlw $8,%%xmm4\n\t" \
|
||||
"psrlw $8,%%xmm5\n\t" \
|
||||
"punpcklbw %%xmm6,%%xmm6\n\t" \
|
||||
"punpcklbw %%xmm7,%%xmm7\n\t" \
|
||||
"psrlw $8,%%xmm6\n\t" \
|
||||
"psrlw $8,%%xmm7\n\t" \
|
||||
|
||||
/*Performs the first two stages of an 8-point 1-D Hadamard transform in place.
|
||||
Outputs 1, 3, 4, and 5 from the second stage are negated (which allows us to
|
||||
perform this stage in place with no temporary registers).*/
|
||||
#define OC_HADAMARD_AB_8x8 \
|
||||
"#OC_HADAMARD_AB_8x8\n\t" \
|
||||
/*Stage A:*/ \
|
||||
"paddw %%xmm5,%%xmm1\n\t" \
|
||||
"paddw %%xmm6,%%xmm2\n\t" \
|
||||
"paddw %%xmm5,%%xmm5\n\t" \
|
||||
"paddw %%xmm6,%%xmm6\n\t" \
|
||||
"psubw %%xmm1,%%xmm5\n\t" \
|
||||
"psubw %%xmm2,%%xmm6\n\t" \
|
||||
"paddw %%xmm7,%%xmm3\n\t" \
|
||||
"paddw %%xmm4,%%xmm0\n\t" \
|
||||
"paddw %%xmm7,%%xmm7\n\t" \
|
||||
"paddw %%xmm4,%%xmm4\n\t" \
|
||||
"psubw %%xmm3,%%xmm7\n\t" \
|
||||
"psubw %%xmm0,%%xmm4\n\t" \
|
||||
/*Stage B:*/ \
|
||||
"paddw %%xmm2,%%xmm0\n\t" \
|
||||
"paddw %%xmm3,%%xmm1\n\t" \
|
||||
"paddw %%xmm6,%%xmm4\n\t" \
|
||||
"paddw %%xmm7,%%xmm5\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm3\n\t" \
|
||||
"paddw %%xmm6,%%xmm6\n\t" \
|
||||
"paddw %%xmm7,%%xmm7\n\t" \
|
||||
"psubw %%xmm0,%%xmm2\n\t" \
|
||||
"psubw %%xmm1,%%xmm3\n\t" \
|
||||
"psubw %%xmm4,%%xmm6\n\t" \
|
||||
"psubw %%xmm5,%%xmm7\n\t" \
|
||||
|
||||
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
||||
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
||||
place with no temporary registers).*/
|
||||
#define OC_HADAMARD_C_8x8 \
|
||||
"#OC_HADAMARD_C_8x8\n\t" \
|
||||
/*Stage C:*/ \
|
||||
"paddw %%xmm1,%%xmm0\n\t" \
|
||||
"paddw %%xmm3,%%xmm2\n\t" \
|
||||
"paddw %%xmm5,%%xmm4\n\t" \
|
||||
"paddw %%xmm7,%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm1\n\t" \
|
||||
"paddw %%xmm3,%%xmm3\n\t" \
|
||||
"paddw %%xmm5,%%xmm5\n\t" \
|
||||
"paddw %%xmm7,%%xmm7\n\t" \
|
||||
"psubw %%xmm0,%%xmm1\n\t" \
|
||||
"psubw %%xmm2,%%xmm3\n\t" \
|
||||
"psubw %%xmm4,%%xmm5\n\t" \
|
||||
"psubw %%xmm6,%%xmm7\n\t" \
|
||||
|
||||
/*Performs an 8-point 1-D Hadamard transform in place.
|
||||
Outputs 1, 2, 4, and 7 are negated (which allows us to perform the transform
|
||||
in place with no temporary registers).*/
|
||||
#define OC_HADAMARD_8x8 \
|
||||
OC_HADAMARD_AB_8x8 \
|
||||
OC_HADAMARD_C_8x8 \
|
||||
|
||||
/*Performs the first part of the final stage of the Hadamard transform and
|
||||
summing of absolute values.
|
||||
At the end of this part, %%xmm1 will contain the DC coefficient of the
|
||||
transform.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
|
||||
/*We use the fact that \
|
||||
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
|
||||
to merge the final butterfly with the abs and the first stage of \
|
||||
accumulation. \
|
||||
Thus we can avoid using pabsw, which is not available until SSSE3. \
|
||||
Emulating pabsw takes 3 instructions, so the straightforward SSE2 \
|
||||
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
|
||||
registers). \
|
||||
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
|
||||
This implementation is only 26 (+4 for spilling registers).*/ \
|
||||
"#OC_HADAMARD_C_ABS_ACCUM_A_8x8\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
"movdqa %%xmm6,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*xmm7={0x7FFF}x8 \
|
||||
xmm4=max(abs(xmm4),abs(xmm5))-0x7FFF*/ \
|
||||
"pcmpeqb %%xmm7,%%xmm7\n\t" \
|
||||
"movdqa %%xmm4,%%xmm6\n\t" \
|
||||
"psrlw $1,%%xmm7\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"pmaxsw %%xmm5,%%xmm4\n\t" \
|
||||
"paddsw %%xmm7,%%xmm6\n\t" \
|
||||
"psubw %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm2=max(abs(xmm2),abs(xmm3))-0x7FFF \
|
||||
xmm0=max(abs(xmm0),abs(xmm1))-0x7FFF*/ \
|
||||
"movdqa %%xmm2,%%xmm6\n\t" \
|
||||
"movdqa %%xmm0,%%xmm5\n\t" \
|
||||
"pmaxsw %%xmm3,%%xmm2\n\t" \
|
||||
"pmaxsw %%xmm1,%%xmm0\n\t" \
|
||||
"paddw %%xmm3,%%xmm6\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm3\n\t" \
|
||||
"paddw %%xmm5,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm5\n\t" \
|
||||
|
||||
/*Performs the second part of the final stage of the Hadamard transform and
|
||||
summing of absolute values.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
|
||||
"#OC_HADAMARD_C_ABS_ACCUM_B_8x8\n\t" \
|
||||
"paddsw %%xmm7,%%xmm6\n\t" \
|
||||
"paddsw %%xmm7,%%xmm1\n\t" \
|
||||
"psubw %%xmm6,%%xmm2\n\t" \
|
||||
"psubw %%xmm1,%%xmm0\n\t" \
|
||||
/*xmm7={1}x8 (needed for the horizontal add that follows) \
|
||||
xmm0+=xmm2+xmm4+max(abs(xmm3),abs(xmm5))-0x7FFF*/ \
|
||||
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||
"pmaxsw %%xmm5,%%xmm3\n\t" \
|
||||
"paddw %%xmm2,%%xmm0\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm0\n\t" \
|
||||
"paddsw %%xmm7,%%xmm6\n\t" \
|
||||
"paddw %%xmm3,%%xmm0\n\t" \
|
||||
"psrlw $14,%%xmm7\n\t" \
|
||||
"psubw %%xmm6,%%xmm0\n\t" \
|
||||
|
||||
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
|
||||
absolute value of each component, and accumulates everything into xmm0.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_8x8 \
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x8 \
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x8 \
|
||||
|
||||
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
|
||||
component, and accumulates everything into xmm0.
|
||||
Note that xmm0 will have an extra 4 added to each column, and that after
|
||||
removing this value, the remainder will be half the conventional value.*/
|
||||
#define OC_HADAMARD_ABS_ACCUM_8x8 \
|
||||
OC_HADAMARD_AB_8x8 \
|
||||
OC_HADAMARD_C_ABS_ACCUM_8x8
|
||||
|
||||
static unsigned oc_int_frag_satd_sse2(int *_dc,
|
||||
const unsigned char *_src,int _src_ystride,
|
||||
const unsigned char *_ref,int _ref_ystride){
|
||||
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||
unsigned ret;
|
||||
unsigned ret2;
|
||||
int dc;
|
||||
__asm__ __volatile__(
|
||||
OC_LOAD_SUB_8x8
|
||||
OC_HADAMARD_8x8
|
||||
OC_TRANSPOSE_8x8
|
||||
/*We split out the stages here so we can save the DC coefficient in the
|
||||
middle.*/
|
||||
OC_HADAMARD_AB_8x8
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x8
|
||||
"movd %%xmm1,%[dc]\n\t"
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x8
|
||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||
Now we finally have to promote things to dwords.
|
||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x8 to hide the long
|
||||
latency of pmaddwd by starting to compute abs(dc) here.*/
|
||||
"pmaddwd %%xmm7,%%xmm0\n\t"
|
||||
"movsx %w[dc],%[dc]\n\t"
|
||||
"cdq\n\t"
|
||||
"movdqa %%xmm0,%%xmm1\n\t"
|
||||
"punpckhqdq %%xmm0,%%xmm0\n\t"
|
||||
"paddd %%xmm1,%%xmm0\n\t"
|
||||
"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
|
||||
"paddd %%xmm1,%%xmm0\n\t"
|
||||
"movd %%xmm0,%[ret]\n\t"
|
||||
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x8 each have an extra 4
|
||||
added to them, a factor of two removed, and the DC value included;
|
||||
correct the final sum here.*/
|
||||
"lea -64(%[ret2],%[ret],2),%[ret]\n\t"
|
||||
"xor %[dc],%[ret2]\n\t"
|
||||
"sub %[ret2],%[ret]\n\t"
|
||||
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
|
||||
and %[dc] with some of the inputs, since for once we don't write to
|
||||
them until after we're done using everything but %[buf].*/
|
||||
/*Note that _src_ystride and _ref_ystride must be given non-overlapping
|
||||
constraints, otherewise if gcc can prove they're equal it will allocate
|
||||
them to the same register (which is bad); _src and _ref face a similar
|
||||
problem.
|
||||
All four are destructively modified, but if we list them as output
|
||||
constraints, gcc can't alias them with other outputs.*/
|
||||
:[ret]"=r"(ret),[ret2]"=d"(ret2),[dc]"=a"(dc),
|
||||
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
|
||||
:[src]"S"(_src),[src_ystride]"c"((ptrdiff_t)_src_ystride),
|
||||
[ref]"a"(_ref),[ref_ystride]"d"((ptrdiff_t)_ref_ystride)
|
||||
/*We have to use neg, so we actually clobber the condition codes for once
|
||||
(not to mention sub, and add).*/
|
||||
:"cc"
|
||||
);
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*Public entry point for the case where the source and reference blocks share
   one row stride; forwards to oc_int_frag_satd_sse2() with _ystride used for
   both.
  _dc:      Passed through to receive the DC coefficient.
  _src:     The first row of the source block.
  _ref:     The first row of the reference block.
  _ystride: The offset in bytes between rows of both blocks.
  Return: The value returned by oc_int_frag_satd_sse2().*/
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_sse2(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
|
||||
|
||||
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||
OC_ALIGN8(unsigned char ref[64]);
|
||||
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||
return oc_int_frag_satd_sse2(_dc,_src,_ystride,ref,8);
|
||||
}
|
||||
|
||||
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||
const unsigned char *_src,int _ystride){
|
||||
OC_ALIGN16(ogg_int16_t buf[16]);
|
||||
unsigned ret;
|
||||
int dc;
|
||||
__asm__ __volatile__(
|
||||
OC_LOAD_8x8
|
||||
OC_HADAMARD_8x8
|
||||
OC_TRANSPOSE_8x8
|
||||
/*We split out the stages here so we can save the DC coefficient in the
|
||||
middle.*/
|
||||
OC_HADAMARD_AB_8x8
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x8
|
||||
"movd %%xmm1,%[dc]\n\t"
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x8
|
||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||
Now we finally have to promote things to dwords.*/
|
||||
"pmaddwd %%xmm7,%%xmm0\n\t"
|
||||
/*We assume that the DC coefficient is always positive (which is true,
|
||||
because the input to the INTRA transform was not a difference).*/
|
||||
"movzx %w[dc],%[dc]\n\t"
|
||||
"movdqa %%xmm0,%%xmm1\n\t"
|
||||
"punpckhqdq %%xmm0,%%xmm0\n\t"
|
||||
"paddd %%xmm1,%%xmm0\n\t"
|
||||
"pshuflw $0xE,%%xmm0,%%xmm1\n\t"
|
||||
"paddd %%xmm1,%%xmm0\n\t"
|
||||
"movd %%xmm0,%[ret]\n\t"
|
||||
"lea -64(%[ret],%[ret]),%[ret]\n\t"
|
||||
"sub %[dc],%[ret]\n\t"
|
||||
/*Although it looks like we're using 7 registers here, gcc can alias %[ret]
|
||||
and %[dc] with some of the inputs, since for once we don't write to
|
||||
them until after we're done using everything but %[buf].*/
|
||||
:[ret]"=a"(ret),[dc]"=r"(dc),
|
||||
[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16))
|
||||
:[src]"r"(_src),[src4]"r"(_src+4*_ystride),
|
||||
[ystride]"r"((ptrdiff_t)_ystride),[ystride3]"r"((ptrdiff_t)3*_ystride)
|
||||
/*We have to use sub, so we actually clobber the condition codes for once.*/
|
||||
:"cc"
|
||||
);
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
#endif
|
||||
452
engine/thirdparty/libtheora/x86/sse2fdct.c
vendored
Normal file
452
engine/thirdparty/libtheora/x86/sse2fdct.c
vendored
Normal file
|
|
@ -0,0 +1,452 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************/
|
||||
/*SSE2 fDCT implementation for x86_64.*/
|
||||
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
||||
#include <stddef.h>
|
||||
#include "x86enc.h"
|
||||
#include "x86zigzag.h"
|
||||
#include "sse2trans.h"
|
||||
|
||||
#if defined(OC_X86_64_ASM)
|
||||
|
||||
# define OC_FDCT_8x8 \
|
||||
/*Note: xmm15={0}x8 and xmm14={-1}x8.*/ \
|
||||
"#OC_FDCT_8x8\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
"movdqa %%xmm0,%%xmm11\n\t" \
|
||||
"movdqa %%xmm1,%%xmm10\n\t" \
|
||||
"movdqa %%xmm2,%%xmm9\n\t" \
|
||||
"movdqa %%xmm3,%%xmm8\n\t" \
|
||||
/*xmm11=t7'=t0-t7*/ \
|
||||
"psubw %%xmm7,%%xmm11\n\t" \
|
||||
/*xmm10=t6'=t1-t6*/ \
|
||||
"psubw %%xmm6,%%xmm10\n\t" \
|
||||
/*xmm9=t5'=t2-t5*/ \
|
||||
"psubw %%xmm5,%%xmm9\n\t" \
|
||||
/*xmm8=t4'=t3-t4*/ \
|
||||
"psubw %%xmm4,%%xmm8\n\t" \
|
||||
/*xmm0=t0'=t0+t7*/ \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
/*xmm1=t1'=t1+t6*/ \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
/*xmm5=t2'=t2+t5*/ \
|
||||
"paddw %%xmm2,%%xmm5\n\t" \
|
||||
/*xmm4=t3'=t3+t4*/ \
|
||||
"paddw %%xmm3,%%xmm4\n\t" \
|
||||
/*xmm2,3,6,7 are now free.*/ \
|
||||
/*Stage 2:*/ \
|
||||
"movdqa %%xmm0,%%xmm3\n\t" \
|
||||
"mov $0x5A806A0A,%[a]\n\t" \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"movdqa %%xmm10,%%xmm6\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
/*xmm2=t2''=t1'-t2'*/ \
|
||||
"psubw %%xmm5,%%xmm2\n\t" \
|
||||
"pxor %%xmm12,%%xmm12\n\t" \
|
||||
/*xmm3=t3''=t0'-t3'*/ \
|
||||
"psubw %%xmm4,%%xmm3\n\t" \
|
||||
"psubw %%xmm14,%%xmm12\n\t" \
|
||||
/*xmm10=t5''=t6'-t5'*/ \
|
||||
"psubw %%xmm9,%%xmm10\n\t" \
|
||||
"paddw %%xmm12,%%xmm12\n\t" \
|
||||
/*xmm4=t0''=t0'+t3'*/ \
|
||||
"paddw %%xmm0,%%xmm4\n\t" \
|
||||
/*xmm1=t1''=t1'+t2'*/ \
|
||||
"paddw %%xmm5,%%xmm1\n\t" \
|
||||
/*xmm6=t6''=t6'+t5'*/ \
|
||||
"paddw %%xmm9,%%xmm6\n\t" \
|
||||
/*xmm0,xmm5,xmm9 are now free.*/ \
|
||||
/*Stage 3:*/ \
|
||||
/*xmm10:xmm5=t5''*27146+0xB500 \
|
||||
xmm0=t5''*/ \
|
||||
"movdqa %%xmm10,%%xmm5\n\t" \
|
||||
"movdqa %%xmm10,%%xmm0\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm10\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm10\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm5\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm5\n\t" \
|
||||
/*xmm5=(t5''*27146+0xB500>>16)+t5''*/ \
|
||||
"psrad $16,%%xmm10\n\t" \
|
||||
"psrad $16,%%xmm5\n\t" \
|
||||
"packssdw %%xmm10,%%xmm5\n\t" \
|
||||
"paddw %%xmm0,%%xmm5\n\t" \
|
||||
/*xmm0=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
|
||||
"pcmpeqw %%xmm15,%%xmm0\n\t" \
|
||||
"psubw %%xmm14,%%xmm0\n\t" \
|
||||
"paddw %%xmm5,%%xmm0\n\t" \
|
||||
"movdqa %%xmm8,%%xmm5\n\t" \
|
||||
"psraw $1,%%xmm0\n\t" \
|
||||
/*xmm5=t5'''=t4'-s*/ \
|
||||
"psubw %%xmm0,%%xmm5\n\t" \
|
||||
/*xmm8=t4''=t4'+s*/ \
|
||||
"paddw %%xmm0,%%xmm8\n\t" \
|
||||
/*xmm0,xmm7,xmm9,xmm10 are free.*/ \
|
||||
/*xmm7:xmm9=t6''*27146+0xB500*/ \
|
||||
"movdqa %%xmm6,%%xmm7\n\t" \
|
||||
"movdqa %%xmm6,%%xmm9\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm7\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm7\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm9\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm9\n\t" \
|
||||
/*xmm9=(t6''*27146+0xB500>>16)+t6''*/ \
|
||||
"psrad $16,%%xmm7\n\t" \
|
||||
"psrad $16,%%xmm9\n\t" \
|
||||
"packssdw %%xmm7,%%xmm9\n\t" \
|
||||
"paddw %%xmm6,%%xmm9\n\t" \
|
||||
/*xmm9=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
|
||||
"pcmpeqw %%xmm15,%%xmm6\n\t" \
|
||||
"psubw %%xmm14,%%xmm6\n\t" \
|
||||
"paddw %%xmm6,%%xmm9\n\t" \
|
||||
"movdqa %%xmm11,%%xmm7\n\t" \
|
||||
"psraw $1,%%xmm9\n\t" \
|
||||
/*xmm7=t6'''=t7'-s*/ \
|
||||
"psubw %%xmm9,%%xmm7\n\t" \
|
||||
/*xmm9=t7''=t7'+s*/ \
|
||||
"paddw %%xmm11,%%xmm9\n\t" \
|
||||
/*xmm0,xmm6,xmm10,xmm11 are free.*/ \
|
||||
/*Stage 4:*/ \
|
||||
/*xmm10:xmm0=t1''*27146+0xB500*/ \
|
||||
"movdqa %%xmm1,%%xmm0\n\t" \
|
||||
"movdqa %%xmm1,%%xmm10\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm0\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm0\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm10\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm10\n\t" \
|
||||
/*xmm0=(t1''*27146+0xB500>>16)+t1''*/ \
|
||||
"psrad $16,%%xmm0\n\t" \
|
||||
"psrad $16,%%xmm10\n\t" \
|
||||
"mov $0x20006A0A,%[a]\n\t" \
|
||||
"packssdw %%xmm10,%%xmm0\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"paddw %%xmm1,%%xmm0\n\t" \
|
||||
/*xmm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
|
||||
"pcmpeqw %%xmm15,%%xmm1\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"psubw %%xmm14,%%xmm1\n\t" \
|
||||
"paddw %%xmm1,%%xmm0\n\t" \
|
||||
/*xmm10:xmm4=t0''*27146+0x4000*/ \
|
||||
"movdqa %%xmm4,%%xmm1\n\t" \
|
||||
"movdqa %%xmm4,%%xmm10\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm4\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm4\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm10\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm10\n\t" \
|
||||
/*xmm4=(t0''*27146+0x4000>>16)+t0''*/ \
|
||||
"psrad $16,%%xmm4\n\t" \
|
||||
"psrad $16,%%xmm10\n\t" \
|
||||
"mov $0x6CB7,%[a]\n\t" \
|
||||
"packssdw %%xmm10,%%xmm4\n\t" \
|
||||
"movd %[a],%%xmm12\n\t" \
|
||||
"paddw %%xmm1,%%xmm4\n\t" \
|
||||
/*xmm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
|
||||
"pcmpeqw %%xmm15,%%xmm1\n\t" \
|
||||
"pshufd $00,%%xmm12,%%xmm12\n\t" \
|
||||
"psubw %%xmm14,%%xmm1\n\t" \
|
||||
"mov $0x7FFF6C84,%[a]\n\t" \
|
||||
"paddw %%xmm1,%%xmm4\n\t" \
|
||||
/*xmm0=_y[0]=u=r+s>>1 \
|
||||
The naive implementation could cause overflow, so we use \
|
||||
u=(r&s)+((r^s)>>1).*/ \
|
||||
"movdqa %%xmm0,%%xmm6\n\t" \
|
||||
"pxor %%xmm4,%%xmm0\n\t" \
|
||||
"pand %%xmm4,%%xmm6\n\t" \
|
||||
"psraw $1,%%xmm0\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"paddw %%xmm6,%%xmm0\n\t" \
|
||||
/*xmm4=_y[4]=v=r-u*/ \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"psubw %%xmm0,%%xmm4\n\t" \
|
||||
/*xmm1,xmm6,xmm10,xmm11 are free.*/ \
|
||||
/*xmm6:xmm10=60547*t3''+0x6CB7*/ \
|
||||
"movdqa %%xmm3,%%xmm10\n\t" \
|
||||
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||
"punpcklwd %%xmm3,%%xmm10\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm10\n\t" \
|
||||
"mov $0x61F861F8,%[a]\n\t" \
|
||||
"punpckhwd %%xmm3,%%xmm6\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm6\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"paddd %%xmm12,%%xmm10\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"paddd %%xmm12,%%xmm6\n\t" \
|
||||
/*xmm1:xmm2=25080*t2'' \
|
||||
xmm12=t2''*/ \
|
||||
"movdqa %%xmm2,%%xmm11\n\t" \
|
||||
"movdqa %%xmm2,%%xmm12\n\t" \
|
||||
"pmullw %%xmm13,%%xmm2\n\t" \
|
||||
"pmulhw %%xmm13,%%xmm11\n\t" \
|
||||
"movdqa %%xmm2,%%xmm1\n\t" \
|
||||
"punpcklwd %%xmm11,%%xmm2\n\t" \
|
||||
"punpckhwd %%xmm11,%%xmm1\n\t" \
|
||||
/*xmm10=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
|
||||
"paddd %%xmm2,%%xmm10\n\t" \
|
||||
"paddd %%xmm1,%%xmm6\n\t" \
|
||||
"psrad $16,%%xmm10\n\t" \
|
||||
"pcmpeqw %%xmm15,%%xmm3\n\t" \
|
||||
"psrad $16,%%xmm6\n\t" \
|
||||
"psubw %%xmm14,%%xmm3\n\t" \
|
||||
"packssdw %%xmm6,%%xmm10\n\t" \
|
||||
"paddw %%xmm3,%%xmm10\n\t" \
|
||||
/*xmm2=_y[2]=u \
|
||||
xmm10=s=(25080*u>>16)-t2''*/ \
|
||||
"movdqa %%xmm10,%%xmm2\n\t" \
|
||||
"pmulhw %%xmm13,%%xmm10\n\t" \
|
||||
"psubw %%xmm12,%%xmm10\n\t" \
|
||||
/*xmm1:xmm6=s*21600+0x2800*/ \
|
||||
"pxor %%xmm12,%%xmm12\n\t" \
|
||||
"psubw %%xmm14,%%xmm12\n\t" \
|
||||
"mov $0x28005460,%[a]\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"movdqa %%xmm10,%%xmm6\n\t" \
|
||||
"movdqa %%xmm10,%%xmm1\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm6\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm6\n\t" \
|
||||
"mov $0x0E3D,%[a]\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm1\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm1\n\t" \
|
||||
/*xmm6=(s*21600+0x2800>>18)+s*/ \
|
||||
"psrad $18,%%xmm6\n\t" \
|
||||
"psrad $18,%%xmm1\n\t" \
|
||||
"movd %[a],%%xmm12\n\t" \
|
||||
"packssdw %%xmm1,%%xmm6\n\t" \
|
||||
"pshufd $00,%%xmm12,%%xmm12\n\t" \
|
||||
"paddw %%xmm10,%%xmm6\n\t" \
|
||||
/*xmm6=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
|
||||
"mov $0x7FFF54DC,%[a]\n\t" \
|
||||
"pcmpeqw %%xmm15,%%xmm10\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"psubw %%xmm14,%%xmm10\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"paddw %%xmm10,%%xmm6\n\t " \
|
||||
/*xmm1,xmm3,xmm10,xmm11 are free.*/ \
|
||||
/*xmm11:xmm10=54491*t5'''+0x0E3D*/ \
|
||||
"movdqa %%xmm5,%%xmm10\n\t" \
|
||||
"movdqa %%xmm5,%%xmm11\n\t" \
|
||||
"punpcklwd %%xmm5,%%xmm10\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm10\n\t" \
|
||||
"mov $0x8E3A8E3A,%[a]\n\t" \
|
||||
"punpckhwd %%xmm5,%%xmm11\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm11\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"paddd %%xmm12,%%xmm10\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"paddd %%xmm12,%%xmm11\n\t" \
|
||||
/*xmm7:xmm12=36410*t6''' \
|
||||
xmm1=t6'''*/ \
|
||||
"movdqa %%xmm7,%%xmm3\n\t" \
|
||||
"movdqa %%xmm7,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm13,%%xmm3\n\t" \
|
||||
"pmullw %%xmm13,%%xmm7\n\t" \
|
||||
"paddw %%xmm1,%%xmm3\n\t" \
|
||||
"movdqa %%xmm7,%%xmm12\n\t" \
|
||||
"punpckhwd %%xmm3,%%xmm7\n\t" \
|
||||
"punpcklwd %%xmm3,%%xmm12\n\t" \
|
||||
/*xmm10=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
|
||||
"paddd %%xmm12,%%xmm10\n\t" \
|
||||
"paddd %%xmm7,%%xmm11\n\t" \
|
||||
"psrad $16,%%xmm10\n\t" \
|
||||
"pcmpeqw %%xmm15,%%xmm5\n\t" \
|
||||
"psrad $16,%%xmm11\n\t" \
|
||||
"psubw %%xmm14,%%xmm5\n\t" \
|
||||
"packssdw %%xmm11,%%xmm10\n\t" \
|
||||
"pxor %%xmm12,%%xmm12\n\t" \
|
||||
"paddw %%xmm5,%%xmm10\n\t" \
|
||||
/*xmm5=_y[5]=u \
|
||||
xmm1=s=t6'''-(36410*u>>16)*/ \
|
||||
"psubw %%xmm14,%%xmm12\n\t" \
|
||||
"movdqa %%xmm10,%%xmm5\n\t" \
|
||||
"mov $0x340067C8,%[a]\n\t" \
|
||||
"pmulhw %%xmm13,%%xmm10\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"paddw %%xmm5,%%xmm10\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"psubw %%xmm10,%%xmm1\n\t" \
|
||||
/*xmm11:xmm3=s*26568+0x3400*/ \
|
||||
"movdqa %%xmm1,%%xmm3\n\t" \
|
||||
"movdqa %%xmm1,%%xmm11\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm3\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm3\n\t" \
|
||||
"mov $0x7B1B,%[a]\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm11\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm11\n\t" \
|
||||
/*xmm3=(s*26568+0x3400>>17)+s*/ \
|
||||
"psrad $17,%%xmm3\n\t" \
|
||||
"psrad $17,%%xmm11\n\t" \
|
||||
"movd %[a],%%xmm12\n\t" \
|
||||
"packssdw %%xmm11,%%xmm3\n\t" \
|
||||
"pshufd $00,%%xmm12,%%xmm12\n\t" \
|
||||
"paddw %%xmm1,%%xmm3\n\t" \
|
||||
/*xmm3=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
|
||||
"mov $0x7FFF7B16,%[a]\n\t" \
|
||||
"pcmpeqw %%xmm15,%%xmm1\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"psubw %%xmm14,%%xmm1\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"paddw %%xmm1,%%xmm3\n\t " \
|
||||
/*xmm1,xmm7,xmm10,xmm11 are free.*/ \
|
||||
/*xmm11:xmm10=64277*t7''+0x7B1B*/ \
|
||||
"movdqa %%xmm9,%%xmm10\n\t" \
|
||||
"movdqa %%xmm9,%%xmm11\n\t" \
|
||||
"punpcklwd %%xmm9,%%xmm10\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm10\n\t" \
|
||||
"mov $0x31F131F1,%[a]\n\t" \
|
||||
"punpckhwd %%xmm9,%%xmm11\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm11\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"paddd %%xmm12,%%xmm10\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
"paddd %%xmm12,%%xmm11\n\t" \
|
||||
/*xmm12:xmm7=12785*t4''*/ \
|
||||
"movdqa %%xmm8,%%xmm7\n\t" \
|
||||
"movdqa %%xmm8,%%xmm1\n\t" \
|
||||
"pmullw %%xmm13,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm13,%%xmm1\n\t" \
|
||||
"movdqa %%xmm7,%%xmm12\n\t" \
|
||||
"punpcklwd %%xmm1,%%xmm7\n\t" \
|
||||
"punpckhwd %%xmm1,%%xmm12\n\t" \
|
||||
/*xmm10=u=(12785*t4''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
|
||||
"paddd %%xmm7,%%xmm10\n\t" \
|
||||
"paddd %%xmm12,%%xmm11\n\t" \
|
||||
"psrad $16,%%xmm10\n\t" \
|
||||
"pcmpeqw %%xmm15,%%xmm9\n\t" \
|
||||
"psrad $16,%%xmm11\n\t" \
|
||||
"psubw %%xmm14,%%xmm9\n\t" \
|
||||
"packssdw %%xmm11,%%xmm10\n\t" \
|
||||
"pxor %%xmm12,%%xmm12\n\t" \
|
||||
"paddw %%xmm9,%%xmm10\n\t" \
|
||||
/*xmm1=_y[1]=u \
|
||||
xmm10=s=(12785*u>>16)-t4''*/ \
|
||||
"psubw %%xmm14,%%xmm12\n\t" \
|
||||
"movdqa %%xmm10,%%xmm1\n\t" \
|
||||
"mov $0x3000503B,%[a]\n\t" \
|
||||
"pmulhw %%xmm13,%%xmm10\n\t" \
|
||||
"movd %[a],%%xmm13\n\t" \
|
||||
"psubw %%xmm8,%%xmm10\n\t" \
|
||||
"pshufd $00,%%xmm13,%%xmm13\n\t" \
|
||||
/*xmm8:xmm7=s*20539+0x3000*/ \
|
||||
"movdqa %%xmm10,%%xmm7\n\t" \
|
||||
"movdqa %%xmm10,%%xmm8\n\t" \
|
||||
"punpcklwd %%xmm12,%%xmm7\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm7\n\t" \
|
||||
"punpckhwd %%xmm12,%%xmm8\n\t" \
|
||||
"pmaddwd %%xmm13,%%xmm8\n\t" \
|
||||
/*xmm7=(s*20539+0x3000>>20)+s*/ \
|
||||
"psrad $20,%%xmm7\n\t" \
|
||||
"psrad $20,%%xmm8\n\t" \
|
||||
"packssdw %%xmm8,%%xmm7\n\t" \
|
||||
"paddw %%xmm10,%%xmm7\n\t" \
|
||||
/*xmm7=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
|
||||
"pcmpeqw %%xmm15,%%xmm10\n\t" \
|
||||
"psubw %%xmm14,%%xmm10\n\t" \
|
||||
"paddw %%xmm10,%%xmm7\n\t " \
|
||||
|
||||
/*SSE2 implementation of the fDCT for x86-64 only.
|
||||
Because of the 8 extra XMM registers on x86-64, this version can operate
|
||||
without any temporary stack access at all.
_x: The 64 input values, read as eight 16-byte rows with movdqa (the aligned
     load instruction).
_y: The output buffer; it is only passed to the asm as %[y] and is
     referenced by OC_TRANSPOSE_ZIG_ZAG_MMXEXT, which is defined outside this
     function.*/
|
||||
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
/*Scratch general-purpose register, used below to move small immediate
   constants into XMM registers.*/
ptrdiff_t a;
|
||||
__asm__ __volatile__(
|
||||
/*Load the input.*/
|
||||
"movdqa 0x00(%[x]),%%xmm0\n\t"
|
||||
"movdqa 0x10(%[x]),%%xmm1\n\t"
|
||||
"movdqa 0x20(%[x]),%%xmm2\n\t"
|
||||
"movdqa 0x30(%[x]),%%xmm3\n\t"
|
||||
"movdqa 0x40(%[x]),%%xmm4\n\t"
|
||||
"movdqa 0x50(%[x]),%%xmm5\n\t"
|
||||
"movdqa 0x60(%[x]),%%xmm6\n\t"
|
||||
"movdqa 0x70(%[x]),%%xmm7\n\t"
|
||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||
we could overflow.
The psllw $2 instructions that do this are interleaved with the bias
computation below.*/
|
||||
/*We also add a few biases to correct for some systematic error that
|
||||
remains in the full fDCT->iDCT round trip.*/
|
||||
/*xmm15={0}x8*/
|
||||
"pxor %%xmm15,%%xmm15\n\t"
|
||||
/*xmm14={-1}x8*/
|
||||
"pcmpeqb %%xmm14,%%xmm14\n\t"
|
||||
"psllw $2,%%xmm0\n\t"
|
||||
/*xmm8=xmm0*/
|
||||
"movdqa %%xmm0,%%xmm8\n\t"
|
||||
"psllw $2,%%xmm1\n\t"
|
||||
/*xmm8={_x[7...0]==0}*/
|
||||
"pcmpeqw %%xmm15,%%xmm8\n\t"
|
||||
"psllw $2,%%xmm2\n\t"
|
||||
/*xmm8={_x[7...0]!=0}*/
|
||||
"psubw %%xmm14,%%xmm8\n\t"
|
||||
"psllw $2,%%xmm3\n\t"
|
||||
/*%[a]=1*/
|
||||
"mov $1,%[a]\n\t"
|
||||
/*xmm8={_x[6]!=0,0,_x[4]!=0,0,_x[2]!=0,0,_x[0]!=0,0}*/
|
||||
"pslld $16,%%xmm8\n\t"
|
||||
"psllw $2,%%xmm4\n\t"
|
||||
/*xmm9={0,0,0,0,0,0,0,1}*/
|
||||
"movd %[a],%%xmm9\n\t"
|
||||
/*xmm8={0,0,0,0,_x[2]!=0,0,_x[0]!=0,0}*/
|
||||
"pshufhw $0x00,%%xmm8,%%xmm8\n\t"
|
||||
"psllw $2,%%xmm5\n\t"
|
||||
/*%[a]={1}x2*/
|
||||
"mov $0x10001,%[a]\n\t"
|
||||
/*xmm8={0,0,0,0,0,0,0,_x[0]!=0}*/
|
||||
"pshuflw $0x01,%%xmm8,%%xmm8\n\t"
|
||||
"psllw $2,%%xmm6\n\t"
|
||||
/*xmm10={0,0,0,0,0,0,1,1}*/
|
||||
"movd %[a],%%xmm10\n\t"
|
||||
/*xmm0=_x[7...0]+{0,0,0,0,0,0,0,_x[0]!=0}*/
|
||||
"paddw %%xmm8,%%xmm0\n\t"
|
||||
"psllw $2,%%xmm7\n\t"
|
||||
/*xmm0=_x[7...0]+{0,0,0,0,0,0,1,(_x[0]!=0)+1}*/
|
||||
"paddw %%xmm10,%%xmm0\n\t"
|
||||
/*xmm1=_x[15...8]-{0,0,0,0,0,0,0,1}*/
|
||||
"psubw %%xmm9,%%xmm1\n\t"
|
||||
/*Transform columns.*/
|
||||
OC_FDCT_8x8
|
||||
/*Transform rows.*/
|
||||
OC_TRANSPOSE_8x8
|
||||
OC_FDCT_8x8
|
||||
/*Round and remove the two extra bits of precision added above:
   subtracting -2 adds 2, so each value becomes (v+2)>>2.*/
/*xmm14={-2,-2,-2,-2,-2,-2,-2,-2}*/
|
||||
"paddw %%xmm14,%%xmm14\n\t"
|
||||
"psubw %%xmm14,%%xmm0\n\t"
|
||||
"psubw %%xmm14,%%xmm1\n\t"
|
||||
"psraw $2,%%xmm0\n\t"
|
||||
"psubw %%xmm14,%%xmm2\n\t"
|
||||
"psraw $2,%%xmm1\n\t"
|
||||
"psubw %%xmm14,%%xmm3\n\t"
|
||||
"psraw $2,%%xmm2\n\t"
|
||||
"psubw %%xmm14,%%xmm4\n\t"
|
||||
"psraw $2,%%xmm3\n\t"
|
||||
"psubw %%xmm14,%%xmm5\n\t"
|
||||
"psraw $2,%%xmm4\n\t"
|
||||
"psubw %%xmm14,%%xmm6\n\t"
|
||||
"psraw $2,%%xmm5\n\t"
|
||||
"psubw %%xmm14,%%xmm7\n\t"
|
||||
"psraw $2,%%xmm6\n\t"
|
||||
"psraw $2,%%xmm7\n\t"
|
||||
/*Transpose, zig-zag, and store the result.*/
|
||||
/*We could probably do better using SSSE3's palignr, but re-using MMXEXT
|
||||
version will do for now.*/
|
||||
/*The two helper macros below feed OC_TRANSPOSE_ZIG_ZAG_MMXEXT from the XMM
   rows: _LO copies the low 64 bits of xmm<_row> into the MMX register _reg;
   _HI first duplicates the high 64 bits into the low half (overwriting it)
   and then does the same copy.*/
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
|
||||
"movdq2q %%xmm"#_row","_reg"\n\t" \
|
||||
|
||||
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
|
||||
"punpckhqdq %%xmm"#_row",%%xmm"#_row"\n\t" \
|
||||
"movdq2q %%xmm"#_row","_reg"\n\t" \
|
||||
|
||||
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
|
||||
#undef OC_ZZ_LOAD_ROW_LO
|
||||
#undef OC_ZZ_LOAD_ROW_HI
|
||||
/*Output: the scratch register (early-clobber, so it never shares a
   register with an input).*/
:[a]"=&r"(a)
|
||||
/*Inputs: the output and input pointers, each held in a register.*/
:[y]"r"(_y),[x]"r"(_x)
|
||||
/*Memory is accessed only through the pointers above, so it is declared
   clobbered as a whole.*/
:"memory"
|
||||
);
|
||||
}
|
||||
#endif
|
||||
456
engine/thirdparty/libtheora/x86/sse2idct.c
vendored
Normal file
456
engine/thirdparty/libtheora/x86/sse2idct.c
vendored
Normal file
|
|
@ -0,0 +1,456 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*SSE2 acceleration of Theora's iDCT.*/
|
||||
#include "x86int.h"
|
||||
#include "sse2trans.h"
|
||||
#include "../dct.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*A table of constants used by the SSE2 and MMX iDCT macros in this file,
   which address it through the %[c] asm operand.
  It holds eight rows of eight 16-bit values (16 bytes per row), so row k
   starts at byte offset 0x10*k.
  Row 0 is the value 8 that OC_IDCT_8x8_D_STORE adds before its final shift
   right by 4; rows 1 through 7 are OC_C1S7 through OC_C7S1, each repeated
   across the whole row.*/
|
||||
const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={
|
||||
8, 8, 8, 8, 8, 8, 8, 8,
|
||||
OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,
|
||||
OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,
|
||||
OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,
|
||||
OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,
|
||||
OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,
|
||||
OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,
|
||||
OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1
|
||||
};
|
||||
|
||||
|
||||
/*Performs the first three stages of the iDCT.
|
||||
xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input
|
||||
(accessed in that order).
|
||||
The remaining rows (0, 1, 4, and 7) must be in _x at their corresponding
locations; they are loaded from offsets 0x00, 0x10, 0x40, and 0x70.
|
||||
The cosine constants are read through %[c], and two intermediate rows are
spilled to %[buf] at offsets 0x00 and 0x10 (32 bytes of scratch).
|
||||
On output, xmm7 down to xmm4 contain the intermediate terms t[0] through
t[3], and xmm3 down to xmm0 contain t[4] through t[7] (see the stage 3
|
||||
comment below); this is the layout the stage 4 butterflies expect.*/
|
||||
#define OC_IDCT_8x8_ABC(_x) \
|
||||
"#OC_IDCT_8x8_ABC\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*2-3 rotation by 6pi/16. \
|
||||
xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
|
||||
"movdqa %%xmm1,%%xmm0\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm1\n\t" \
|
||||
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm0\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||
"paddw %%xmm6,%%xmm0\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm2\n\t" \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
"paddw %%xmm4,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
/*5-6 rotation by 3pi/16. \
|
||||
xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
|
||||
"movdqa %%xmm4,%%xmm2\n\t" \
|
||||
"movdqa %%xmm6,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm3,%%xmm4\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm3,%%xmm6\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm4\n\t" \
|
||||
"paddw %%xmm5,%%xmm3\n\t" \
|
||||
"paddw %%xmm6,%%xmm3\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
|
||||
"paddw %%xmm5,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
|
||||
"paddw %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
|
||||
"psubw %%xmm4,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
|
||||
/*4-7 rotation by 7pi/16. \
|
||||
xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
|
||||
"movdqa %%xmm3,%%xmm0\n\t" \
|
||||
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm3\n\t" \
|
||||
"pmulhw %%xmm5,%%xmm7\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm0\n\t" \
|
||||
"paddw %%xmm6,%%xmm4\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
|
||||
"paddw %%xmm5,%%xmm7\n\t" \
|
||||
"psubw %%xmm4,%%xmm3\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
|
||||
/*0-1 butterfly. \
|
||||
xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
|
||||
"paddw %%xmm7,%%xmm6\n\t" \
|
||||
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm4\n\t" \
|
||||
"paddw %%xmm7,%%xmm7\n\t" \
|
||||
"psubw %%xmm6,%%xmm7\n\t" \
|
||||
"paddw %%xmm6,%%xmm4\n\t" \
|
||||
/*Stage 2:*/ \
|
||||
/*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
|
||||
7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
|
||||
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm3\n\t" \
|
||||
"psubw %%xmm1,%%xmm6\n\t" \
|
||||
"movdqa %%xmm5,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm7,%%xmm5\n\t" \
|
||||
"paddw %%xmm7,%%xmm5\n\t" \
|
||||
"movdqa %%xmm0,%%xmm7\n\t" \
|
||||
"paddw %%xmm2,%%xmm0\n\t" \
|
||||
"psubw %%xmm2,%%xmm7\n\t" \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm7,%%xmm2\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
|
||||
"paddw %%xmm7,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
|
||||
/*Stage 3: \
|
||||
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
|
||||
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
|
||||
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
|
||||
"paddw %%xmm2,%%xmm1\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm7\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm4,%%xmm4\n\t" \
|
||||
"paddw %%xmm5,%%xmm5\n\t" \
|
||||
"psubw %%xmm1,%%xmm2\n\t" \
|
||||
"psubw %%xmm7,%%xmm4\n\t" \
|
||||
"psubw %%xmm6,%%xmm5\n\t" \
|
||||
|
||||
/*Performs the last stage of the iDCT, leaving the result in registers.
|
||||
On input, xmm7 down to xmm4 contain the intermediate terms t[0] through t[3],
|
||||
and xmm3 down to xmm0 contain t[4] through t[7] (as listed in the stage 4
comment below).
|
||||
On output, xmm0 through xmm7 contain the corresponding rows.
No memory operands are used, and no rounding or scaling is applied.*/
|
||||
#define OC_IDCT_8x8_D \
|
||||
"#OC_IDCT_8x8_D\n\t" \
|
||||
/*Stage 4: \
|
||||
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
|
||||
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
|
||||
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
|
||||
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"psubw %%xmm1,%%xmm6\n\t" \
|
||||
"psubw %%xmm2,%%xmm5\n\t" \
|
||||
"psubw %%xmm3,%%xmm4\n\t" \
|
||||
"paddw %%xmm0,%%xmm0\n\t" \
|
||||
"paddw %%xmm1,%%xmm1\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm3\n\t" \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"paddw %%xmm5,%%xmm2\n\t" \
|
||||
"paddw %%xmm4,%%xmm3\n\t" \
|
||||
|
||||
/*Performs the last stage of the iDCT, then rounds, scales, and stores the
result.
|
||||
On input, xmm7 down to xmm4 contain the intermediate terms t[0] through t[3],
|
||||
and xmm3 down to xmm0 contain t[4] through t[7] (as listed in the stage 4
comment below).
|
||||
The bias of 8 is loaded from the first row of %[c], and the slot at offset
0x40 of %[y] is used as scratch before its final value is written.
|
||||
On output, each row has had 8 added and been shifted right by 4; row k is
stored to %[y] at offset 0x10*k and is also left in xmm<k>.*/
|
||||
#define OC_IDCT_8x8_D_STORE \
|
||||
"#OC_IDCT_8x8_D_STORE\n\t" \
|
||||
/*Stage 4: \
|
||||
0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
|
||||
1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
|
||||
2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
|
||||
3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
|
||||
"psubw %%xmm3,%%xmm4\n\t" \
|
||||
/*Spill t[3]-t[4] so that xmm4 can hold the rounding bias.*/ \
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
|
||||
/*xmm4={8}x8*/ \
"movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
|
||||
"psubw %%xmm0,%%xmm7\n\t" \
|
||||
"psubw %%xmm1,%%xmm6\n\t" \
|
||||
"psubw %%xmm2,%%xmm5\n\t" \
|
||||
/*Bias the differences; the sums below inherit the bias from them.*/ \
"paddw %%xmm4,%%xmm7\n\t" \
|
||||
"paddw %%xmm4,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm5\n\t" \
|
||||
/*xmm4=t[3]-t[4]+8*/ \
"paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
|
||||
"paddw %%xmm0,%%xmm0\n\t" \
|
||||
"paddw %%xmm1,%%xmm1\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm3,%%xmm3\n\t" \
|
||||
"paddw %%xmm7,%%xmm0\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"psraw $4,%%xmm0\n\t" \
|
||||
"paddw %%xmm5,%%xmm2\n\t" \
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
|
||||
"psraw $4,%%xmm1\n\t" \
|
||||
"paddw %%xmm4,%%xmm3\n\t" \
|
||||
"movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
|
||||
"psraw $4,%%xmm2\n\t" \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
|
||||
"psraw $4,%%xmm3\n\t" \
|
||||
"movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
|
||||
"psraw $4,%%xmm4\n\t" \
|
||||
"movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
|
||||
"psraw $4,%%xmm5\n\t" \
|
||||
"movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
|
||||
"psraw $4,%%xmm6\n\t" \
|
||||
"movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
|
||||
"psraw $4,%%xmm7\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \
|
||||
|
||||
/*Performs a full 8x8 iDCT using SSE2.
  _y: The 64 output values, written through the %[y] operand.
  _x: The 64 input coefficients, pre-transposed.
      They are read through the %[x] operand and then all zeroed before
       returning, ready for the next block.*/
static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  /*Scratch space for OC_IDCT_8x8_ABC (and for OC_TRANSPOSE_8x8 when it has
     to spill); both use offsets 0x00 and 0x10 only.*/
  OC_ALIGN16(ogg_int16_t buf[16]);
  /*This routine accepts an 8x8 matrix pre-transposed.*/
  __asm__ __volatile__(
    /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/
    "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t"
    "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t"
    "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t"
    "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t"
    /*First pass; rows 0, 1, 4, and 7 are loaded from x by the macro.*/
    OC_IDCT_8x8_ABC(x)
    OC_IDCT_8x8_D
    OC_TRANSPOSE_8x8
    /*Write rows 0, 1, 4, and 7 out to y, because the second pass loads those
       rows from memory; rows 2, 3, 5, and 6 are already in the registers it
       expects.*/
    "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"
    "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t"
    "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t"
    /*Second pass; the final stage rounds, scales, and stores to y.*/
    OC_IDCT_8x8_ABC(y)
    OC_IDCT_8x8_D_STORE
    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
     /*OC_IDCT_CONSTS has 64 entries, so describe exactly that many.*/
     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
  );
  /*Clear input data for next block (decoder only).
    The zeroing of xmm0 lives in the same asm statement as the stores that
     use it: the compiler makes no promise that a register keeps its value
     from one asm statement to the next.*/
  __asm__ __volatile__(
    "pxor %%xmm0,%%xmm0\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x40,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x50,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x60,x)"\n\t"
    "movdqa %%xmm0,"OC_MEM_OFFS(0x70,x)"\n\t"
    :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,64))
  );
}
|
||||
|
||||
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
|
||||
need to work with four columns at a time.
|
||||
Doing this in MMX is faster on processors with a 64-bit data path.
|
||||
On input, mm0 through mm3 must contain X0 through X3; X4 through X7 are
treated as zero, so their terms are simply omitted.
The constants are read through %[c], and %[buf] is used as scratch at
offsets 0x00 and 0x10.
All four stages are performed: on output mm0 through mm3 hold the stage 4
sums and mm4 through mm7 the stage 4 differences, as listed in the stage 4
comment below.*/
|
||||
#define OC_IDCT_8x8_10_MMX \
|
||||
"#OC_IDCT_8x8_10_MMX\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*2-3 rotation by 6pi/16. \
|
||||
mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
|
||||
"movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
|
||||
"pmulhw %%mm2,%%mm6\n\t" \
|
||||
"pmulhw %%mm2,%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
|
||||
"paddw %%mm6,%%mm2\n\t" \
|
||||
"movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
|
||||
"movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*5-6 rotation by 3pi/16. \
|
||||
mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
|
||||
"pmulhw %%mm3,%%mm5\n\t" \
|
||||
"pmulhw %%mm3,%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
|
||||
"paddw %%mm3,%%mm5\n\t" \
|
||||
"paddw %%mm3,%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
|
||||
/*4-7 rotation by 7pi/16. \
|
||||
mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
|
||||
"pmulhw %%mm1,%%mm3\n\t" \
|
||||
"pmulhw %%mm1,%%mm7\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
|
||||
"movq %%mm3,%%mm6\n\t" \
|
||||
"paddw %%mm1,%%mm7\n\t" \
|
||||
/*0-1 butterfly. \
|
||||
mm4=C4, mm0=X0, X4=0.*/ \
|
||||
/*Stage 2:*/ \
|
||||
/*4-5 butterfly: mm3=t[4], mm5=t[5] \
|
||||
7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
|
||||
"psubw %%mm5,%%mm3\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"movq %%mm4,%%mm1\n\t" \
|
||||
"pmulhw %%mm0,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm4\n\t" \
|
||||
"movq %%mm7,%%mm0\n\t" \
|
||||
"movq %%mm4,%%mm5\n\t" \
|
||||
"paddw %%mm2,%%mm0\n\t" \
|
||||
"psubw %%mm2,%%mm7\n\t" \
|
||||
"movq %%mm1,%%mm2\n\t" \
|
||||
"pmulhw %%mm6,%%mm1\n\t" \
|
||||
"pmulhw %%mm7,%%mm2\n\t" \
|
||||
"paddw %%mm6,%%mm1\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
|
||||
"paddw %%mm7,%%mm2\n\t" \
|
||||
"movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
|
||||
/*Stage 3: \
|
||||
6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
|
||||
0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
|
||||
1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
|
||||
"paddw %%mm2,%%mm1\n\t" \
|
||||
"paddw %%mm5,%%mm6\n\t" \
|
||||
"paddw %%mm4,%%mm7\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"paddw %%mm4,%%mm4\n\t" \
|
||||
"paddw %%mm5,%%mm5\n\t" \
|
||||
"psubw %%mm1,%%mm2\n\t" \
|
||||
"psubw %%mm7,%%mm4\n\t" \
|
||||
"psubw %%mm6,%%mm5\n\t" \
|
||||
/*Stage 4: \
|
||||
0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
|
||||
1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
|
||||
2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
|
||||
3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
|
||||
"psubw %%mm0,%%mm7\n\t" \
|
||||
"psubw %%mm1,%%mm6\n\t" \
|
||||
"psubw %%mm2,%%mm5\n\t" \
|
||||
"psubw %%mm3,%%mm4\n\t" \
|
||||
"paddw %%mm0,%%mm0\n\t" \
|
||||
"paddw %%mm1,%%mm1\n\t" \
|
||||
"paddw %%mm2,%%mm2\n\t" \
|
||||
"paddw %%mm3,%%mm3\n\t" \
|
||||
"paddw %%mm7,%%mm0\n\t" \
|
||||
"paddw %%mm6,%%mm1\n\t" \
|
||||
"paddw %%mm5,%%mm2\n\t" \
|
||||
"paddw %%mm4,%%mm3\n\t" \
|
||||
|
||||
/*Performs the first three stages of the 10-coefficient iDCT on full-width
   SSE2 rows.
  On input, xmm0 through xmm3 must contain X0 through X3; X4 through X7 are
   treated as zero, so their terms are simply omitted.
  The constants are read through %[c], and %[buf] is used as scratch at
   offsets 0x00 and 0x10.
  On output, xmm7 down to xmm4 contain the intermediate terms t[0] through
   t[3], and xmm3 down to xmm0 contain t[4] through t[7] (see the stage 2 and
   stage 3 comments below).*/
#define OC_IDCT_8x8_10_ABC \
|
||||
"#OC_IDCT_8x8_10_ABC\n\t" \
|
||||
/*Stage 1:*/ \
|
||||
/*2-3 rotation by 6pi/16. \
|
||||
xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm6\n\t" \
|
||||
"pmulhw %%xmm2,%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
|
||||
"paddw %%xmm6,%%xmm2\n\t" \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
|
||||
"movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*5-6 rotation by 3pi/16. \
|
||||
xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
|
||||
"pmulhw %%xmm3,%%xmm5\n\t" \
|
||||
"pmulhw %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
|
||||
"paddw %%xmm3,%%xmm5\n\t" \
|
||||
"paddw %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
|
||||
/*4-7 rotation by 7pi/16. \
|
||||
xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
|
||||
"pmulhw %%xmm1,%%xmm3\n\t" \
|
||||
"pmulhw %%xmm1,%%xmm7\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
|
||||
"movdqa %%xmm3,%%xmm6\n\t" \
|
||||
"paddw %%xmm1,%%xmm7\n\t" \
|
||||
/*0-1 butterfly. \
|
||||
xmm4=C4, xmm0=X0, X4=0.*/ \
|
||||
/*Stage 2:*/ \
|
||||
/*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
|
||||
7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
|
||||
"psubw %%xmm5,%%xmm3\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"movdqa %%xmm4,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm0,%%xmm4\n\t" \
|
||||
"paddw %%xmm0,%%xmm4\n\t" \
|
||||
"movdqa %%xmm7,%%xmm0\n\t" \
|
||||
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||
"paddw %%xmm2,%%xmm0\n\t" \
|
||||
"psubw %%xmm2,%%xmm7\n\t" \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
"pmulhw %%xmm6,%%xmm1\n\t" \
|
||||
"pmulhw %%xmm7,%%xmm2\n\t" \
|
||||
"paddw %%xmm6,%%xmm1\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
|
||||
"paddw %%xmm7,%%xmm2\n\t" \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
|
||||
/*Stage 3: \
|
||||
6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
|
||||
0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
|
||||
1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
|
||||
"paddw %%xmm2,%%xmm1\n\t" \
|
||||
"paddw %%xmm5,%%xmm6\n\t" \
|
||||
"paddw %%xmm4,%%xmm7\n\t" \
|
||||
"paddw %%xmm2,%%xmm2\n\t" \
|
||||
"paddw %%xmm4,%%xmm4\n\t" \
|
||||
"paddw %%xmm5,%%xmm5\n\t" \
|
||||
"psubw %%xmm1,%%xmm2\n\t" \
|
||||
"psubw %%xmm7,%%xmm4\n\t" \
|
||||
"psubw %%xmm6,%%xmm5\n\t" \
|
||||
|
||||
/*Performs the 10-coefficient version of the 8x8 iDCT.
  Only the first four values of rows 0 through 3 of the input are read; the
   first pass runs on them in MMX, and the second pass runs in SSE2.
  _y: The 64 output values, written through the %[y] operand.
  _x: The input coefficients, pre-transposed.
      The values that were read are zeroed before returning, ready for the
       next block.*/
static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  /*Scratch space for the two iDCT passes, which use offsets 0x00 and 0x10
     only.*/
  OC_ALIGN16(ogg_int16_t buf[16]);
  /*This routine accepts an 8x8 matrix pre-transposed.*/
  __asm__ __volatile__(
    /*Load the first four values of rows 0 through 3.*/
    "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t"
    "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t"
    "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t"
    "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t"
    OC_IDCT_8x8_10_MMX
    /*Move the eight 4-value MMX results into xmm0 through xmm3.*/
    OC_TRANSPOSE_8x4_MMX2SSE
    OC_IDCT_8x8_10_ABC
    OC_IDCT_8x8_D_STORE
    /*buf is declared as ogg_int16_t, so describe it with the same type.*/
    :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)),
     [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64))
    /*Every operand expression is parenthesized explicitly, rather than
       relying on the macro expansion to supply the parentheses.*/
    :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)),
     /*OC_IDCT_CONSTS has 64 entries, so describe exactly that many.*/
     [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64))
  );
  /*Clear input data for next block (decoder only).
    Only the values loaded above are cleared; the last store ends at element
     27, hence the 28-element operand.*/
  __asm__ __volatile__(
    "pxor %%mm0,%%mm0\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
    "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
    :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28))
  );
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
|
||||
/*_last_zzi is subtly different from an actual count of the number of
|
||||
coefficients we decoded for this block.
|
||||
It contains the value of zzi BEFORE the final token in the block was
|
||||
decoded.
|
||||
In most cases this is an EOB token (the continuation of an EOB run from a
|
||||
previous block counts), and so this is the same as the coefficient count.
|
||||
However, in the case that the last token was NOT an EOB token, but filled
|
||||
the block up with exactly 64 coefficients, _last_zzi will be less than 64.
|
||||
Provided the last token was not a pure zero run, the minimum value it can
|
||||
be is 46, and so that doesn't affect any of the cases in this routine.
|
||||
However, if the last token WAS a pure zero run of length 63, then _last_zzi
|
||||
will be 1 while the number of coefficients decoded is 64.
|
||||
Thus, we will trigger the following special case, where the real
|
||||
coefficient count would not.
|
||||
Note also that a zero run of length 64 will give _last_zzi a value of 0,
|
||||
but we still process the DC coefficient, which might have a non-zero value
|
||||
due to DC prediction.
|
||||
Although convoluted, this is arguably the correct behavior: it allows us to
|
||||
use a smaller transform when the block ends with a long zero run instead
|
||||
of a normal EOB token.
|
||||
It could be smarter... multiple separate zero runs at the end of a block
|
||||
will fool it, but an encoder that generates these really deserves what it
|
||||
gets.
|
||||
Needless to say we inherited this approach from VP3.*/
|
||||
/*Then perform the iDCT.*/
|
||||
if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x);
|
||||
else oc_idct8x8_slow_sse2(_y,_x);
|
||||
}
|
||||
|
||||
#endif
|
||||
242
engine/thirdparty/libtheora/x86/sse2trans.h
vendored
Normal file
242
engine/thirdparty/libtheora/x86/sse2trans.h
vendored
Normal file
|
|
@ -0,0 +1,242 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_sse2trans_H)
|
||||
# define _x86_sse2trans_H (1)
|
||||
# include "x86int.h"
|
||||
|
||||
# if defined(OC_X86_64_ASM)
|
||||
/*On x86-64 we can transpose in-place without spilling registers.
|
||||
By clever choices of the order to apply the butterflies and the order of
|
||||
their outputs, we can take the rows in order and output the columns in order
|
||||
without any extra operations and using just one temporary register.
|
||||
In the comments below, rows a through h are the contents of xmm0 through
xmm7 on input, and the digit is the position within the row.
xmm8 is the temporary register; its previous contents are lost.
On output, xmm<k> holds position k of every row (h in the highest word, a in
the lowest).*/
|
||||
# define OC_TRANSPOSE_8x8 \
|
||||
"#OC_TRANSPOSE_8x8\n\t" \
|
||||
"movdqa %%xmm4,%%xmm8\n\t" \
|
||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
||||
"punpckhwd %%xmm5,%%xmm8\n\t" \
|
||||
/*xmm5 is free.*/ \
|
||||
"movdqa %%xmm0,%%xmm5\n\t" \
|
||||
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
||||
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
||||
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm6,%%xmm1\n\t" \
|
||||
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
||||
/*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
||||
"punpckhwd %%xmm7,%%xmm1\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm2,%%xmm7\n\t" \
|
||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||
"punpcklwd %%xmm3,%%xmm7\n\t" \
|
||||
/*xmm3 is free.*/ \
|
||||
"movdqa %%xmm0,%%xmm3\n\t" \
|
||||
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||
"punpckldq %%xmm7,%%xmm0\n\t" \
|
||||
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||
"punpckhdq %%xmm7,%%xmm3\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm7\n\t" \
|
||||
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
||||
"punpckldq %%xmm2,%%xmm5\n\t" \
|
||||
/*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
||||
"punpckhdq %%xmm2,%%xmm7\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm4,%%xmm2\n\t" \
|
||||
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||
"punpckldq %%xmm6,%%xmm2\n\t" \
|
||||
/*xmm6 is free.*/ \
|
||||
"movdqa %%xmm8,%%xmm6\n\t" \
|
||||
/*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
||||
"punpckldq %%xmm1,%%xmm6\n\t" \
|
||||
/*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"punpckhdq %%xmm1,%%xmm8\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"punpcklqdq %%xmm2,%%xmm0\n\t" \
|
||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||
"punpckhqdq %%xmm2,%%xmm1\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
||||
/*xmm4 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
||||
"punpckhqdq %%xmm6,%%xmm5\n\t" \
|
||||
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
||||
"punpcklqdq %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm6 is free.*/ \
|
||||
"movdqa %%xmm7,%%xmm6\n\t" \
|
||||
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
||||
"punpckhqdq %%xmm8,%%xmm7\n\t" \
|
||||
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
||||
"punpcklqdq %%xmm8,%%xmm6\n\t" \
|
||||
/*xmm8 is free.*/ \
|
||||
|
||||
# else
|
||||
/*Otherwise, we need to spill some values to %[buf] temporarily.
|
||||
Again, the butterflies are carefully arranged to get the columns to come out
|
||||
in order, minimizing register spills and maximizing the delay between a load
|
||||
and when the value loaded is actually used.
|
||||
In the comments below, rows a through h are the contents of xmm0 through
xmm7 on input, and buf[0] and buf[1] are the 16-byte slots of %[buf] at
offsets 0x00 and 0x10, so the expansion needs a 32-byte buf operand.
On output, xmm<k> holds position k of every row (h in the highest word, a in
the lowest).*/
|
||||
# define OC_TRANSPOSE_8x8 \
|
||||
"#OC_TRANSPOSE_8x8\n\t" \
|
||||
/*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||
"movdqa %%xmm0,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*xmm0 is free.*/ \
|
||||
"movdqa %%xmm2,%%xmm0\n\t" \
|
||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"punpckhwd %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||
"punpcklwd %%xmm3,%%xmm0\n\t" \
|
||||
/*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm3\n\t" \
|
||||
/*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm6,%%xmm2\n\t" \
|
||||
/*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||
"punpcklwd %%xmm7,%%xmm6\n\t" \
|
||||
/*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
|
||||
"punpckhwd %%xmm7,%%xmm2\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm4,%%xmm7\n\t" \
|
||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
|
||||
"punpckhwd %%xmm5,%%xmm7\n\t" \
|
||||
/*xmm5 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm5\n\t" \
|
||||
/*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||
"punpcklwd %%xmm1,%%xmm3\n\t" \
|
||||
/*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
|
||||
"punpckhwd %%xmm1,%%xmm5\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm7,%%xmm1\n\t" \
|
||||
/*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
|
||||
"punpckldq %%xmm2,%%xmm7\n\t" \
|
||||
/*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"punpckhdq %%xmm2,%%xmm1\n\t" \
|
||||
/*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm2\n\t" \
|
||||
/*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"movdqa %%xmm1,"OC_MEM_OFFS(0x00,buf)"\n\t" \
|
||||
/*xmm1 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm1\n\t" \
|
||||
/*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||
"punpckhdq %%xmm0,%%xmm3\n\t" \
|
||||
/*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||
"punpckldq %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm0 is free.*/ \
|
||||
"movdqa %%xmm4,%%xmm0\n\t" \
|
||||
/*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||
"punpckhdq %%xmm6,%%xmm4\n\t" \
|
||||
/*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||
"punpckldq %%xmm6,%%xmm0\n\t" \
|
||||
/*xmm6 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm6\n\t" \
|
||||
/*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
|
||||
"punpckldq %%xmm2,%%xmm5\n\t" \
|
||||
/*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
|
||||
"punpckhdq %%xmm2,%%xmm6\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm1,%%xmm2\n\t" \
|
||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||
"punpckhqdq %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"punpcklqdq %%xmm0,%%xmm2\n\t" \
|
||||
/*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm0\n\t" \
|
||||
/*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
|
||||
/*xmm2 is free.*/ \
|
||||
"movdqa %%xmm3,%%xmm2\n\t" \
|
||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||
"punpckhqdq %%xmm4,%%xmm3\n\t" \
|
||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||
"punpcklqdq %%xmm4,%%xmm2\n\t" \
|
||||
/*xmm4 is free.*/ \
|
||||
"movdqa %%xmm5,%%xmm4\n\t" \
|
||||
/*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
|
||||
"punpckhqdq %%xmm7,%%xmm5\n\t" \
|
||||
/*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
|
||||
"punpcklqdq %%xmm7,%%xmm4\n\t" \
|
||||
/*xmm7 is free.*/ \
|
||||
"movdqa %%xmm6,%%xmm7\n\t" \
|
||||
/*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
|
||||
"punpcklqdq %%xmm0,%%xmm6\n\t" \
|
||||
/*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
|
||||
"punpckhqdq %%xmm0,%%xmm7\n\t" \
|
||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm0\n\t" \
|
||||
|
||||
# endif
|
||||
|
||||
/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
|
||||
four SSE registers.
|
||||
No need to be clever here; we have plenty of room.
|
||||
In the comments below, a through h are the contents of mm0 through mm7 on
input.
On output, xmm0 through xmm3 hold positions 0 through 3 of every input;
xmm4 through xmm7 are used as temporaries and are overwritten.*/
|
||||
# define OC_TRANSPOSE_8x4_MMX2SSE \
|
||||
"#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
|
||||
"movq2dq %%mm0,%%xmm0\n\t" \
|
||||
"movq2dq %%mm1,%%xmm1\n\t" \
|
||||
/*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
|
||||
"punpcklwd %%xmm1,%%xmm0\n\t" \
|
||||
"movq2dq %%mm2,%%xmm3\n\t" \
|
||||
"movq2dq %%mm3,%%xmm2\n\t" \
|
||||
/*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
|
||||
"punpcklwd %%xmm2,%%xmm3\n\t" \
|
||||
"movq2dq %%mm4,%%xmm4\n\t" \
|
||||
"movq2dq %%mm5,%%xmm5\n\t" \
|
||||
/*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
|
||||
"punpcklwd %%xmm5,%%xmm4\n\t" \
|
||||
"movq2dq %%mm6,%%xmm7\n\t" \
|
||||
"movq2dq %%mm7,%%xmm6\n\t" \
|
||||
/*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
|
||||
"punpcklwd %%xmm6,%%xmm7\n\t" \
|
||||
"movdqa %%xmm0,%%xmm2\n\t" \
|
||||
/*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
|
||||
"punpckldq %%xmm3,%%xmm0\n\t" \
|
||||
/*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
|
||||
"punpckhdq %%xmm3,%%xmm2\n\t" \
|
||||
"movdqa %%xmm4,%%xmm5\n\t" \
|
||||
/*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
|
||||
"punpckldq %%xmm7,%%xmm4\n\t" \
|
||||
/*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
|
||||
"punpckhdq %%xmm7,%%xmm5\n\t" \
|
||||
"movdqa %%xmm0,%%xmm1\n\t" \
|
||||
/*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
|
||||
"punpcklqdq %%xmm4,%%xmm0\n\t" \
|
||||
/*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
|
||||
"punpckhqdq %%xmm4,%%xmm1\n\t" \
|
||||
"movdqa %%xmm2,%%xmm3\n\t" \
|
||||
/*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
|
||||
"punpcklqdq %%xmm5,%%xmm2\n\t" \
|
||||
/*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
|
||||
"punpckhqdq %%xmm5,%%xmm3\n\t" \
|
||||
|
||||
#endif
|
||||
182
engine/thirdparty/libtheora/x86/x86cpu.c
vendored
Normal file
182
engine/thirdparty/libtheora/x86/x86cpu.c
vendored
Normal file
|
|
@ -0,0 +1,182 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
CPU capability detection for x86 processors.
|
||||
Originally written by Rudolf Marek.
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "x86cpu.h"
|
||||
|
||||
#if !defined(OC_X86_ASM)
|
||||
/*Fallback used when OC_X86_ASM is not defined: no cpuid query is made and no
   optional CPU features are reported.
  Return: Always 0.*/
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
# if defined(__amd64__)||defined(__x86_64__)
|
||||
/*Runs the cpuid instruction with _op loaded into eax and stores the resulting
   eax, ebx, ecx and edx values in the last four (lvalue) arguments.
  On x86-64, gcc seems to be able to figure out how to save %rbx for us when
   compiling with -fPIC, so ebx is simply named as an output.*/
|
||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||
__asm__ __volatile__( \
|
||||
"cpuid\n\t" \
|
||||
:[eax]"=a"(_eax),[ebx]"=b"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||
:"a"(_op) \
|
||||
:"cc" \
|
||||
)
|
||||
# else
|
||||
/*On x86-32, not so much.
  Runs the cpuid instruction with _op loaded into eax and stores the resulting
   eax, ebx, ecx and edx values in the last four (lvalue) arguments.
  ebx is not named as an output; instead it is exchanged with a
   compiler-chosen register before and after cpuid, so that %ebx itself holds
   its original value again once the asm statement finishes.*/
|
||||
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
|
||||
__asm__ __volatile__( \
|
||||
"xchgl %%ebx,%[ebx]\n\t" \
|
||||
"cpuid\n\t" \
|
||||
"xchgl %%ebx,%[ebx]\n\t" \
|
||||
:[eax]"=a"(_eax),[ebx]"=r"(_ebx),[ecx]"=c"(_ecx),[edx]"=d"(_edx) \
|
||||
:"a"(_op) \
|
||||
:"cc" \
|
||||
)
|
||||
# endif
|
||||
|
||||
/*Translates the feature bits returned by cpuid function 1 into OC_CPU_X86_*
   flags.
  _edx: The edx value returned by cpuid function 1.
  _ecx: The ecx value returned by cpuid function 1.
  Return: The set of OC_CPU_X86_* flags that were detected, or 0 if the MMX bit
           is not set.*/
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
  /*If there isn't even MMX (edx bit 23), give up.*/
  if(!(_edx&0x00800000))return 0;
  flags=OC_CPU_X86_MMX;
  /*edx bit 25: SSE, which is also taken to imply the MMX extensions.*/
  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
  /*edx bit 26: SSE2.*/
  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
  /*ecx bit 0: SSE3 (a.k.a. PNI).*/
  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
  /*ecx bit 9: SSSE3.
    Bit 8 (0x00000100) is Thermal Monitor 2, not a SIMD extension, so it must
     not be used for this test.*/
  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;
  /*ecx bit 19: SSE4.1.*/
  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
  /*ecx bit 20: SSE4.2.*/
  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
  return flags;
}
|
||||
|
||||
/*Translates the feature bits returned by the AMD-style extended cpuid
   function (0x80000001) into OC_CPU_X86_* flags.
  _edx: The edx value returned by the extended feature query.
  _ecx: The ecx value returned by the extended feature query.
  Return: The set of OC_CPU_X86_* flags that were detected; this is 0 whenever
           the MMX bit (edx bit 23) is clear, regardless of the other bits.*/
static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t detected;
  detected=0;
  /*Nothing else is examined unless MMX itself is present.*/
  if(_edx&0x00800000){
    detected=OC_CPU_X86_MMX;
    if(_edx&0x00400000){
      detected|=OC_CPU_X86_MMXEXT;
    }
    if(_edx&0x80000000){
      detected|=OC_CPU_X86_3DNOW;
    }
    if(_edx&0x40000000){
      detected|=OC_CPU_X86_3DNOWEXT;
    }
    if(_ecx&0x00000040){
      detected|=OC_CPU_X86_SSE4A;
    }
    if(_ecx&0x00000800){
      detected|=OC_CPU_X86_SSE5;
    }
  }
  return detected;
}
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t flags;
|
||||
ogg_uint32_t eax;
|
||||
ogg_uint32_t ebx;
|
||||
ogg_uint32_t ecx;
|
||||
ogg_uint32_t edx;
|
||||
# if !defined(__amd64__)&&!defined(__x86_64__)
|
||||
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
||||
__asm__ __volatile__(
|
||||
"pushfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %[a]\n\t"
|
||||
"movl %[a],%[b]\n\t"
|
||||
"xorl $0x200000,%[a]\n\t"
|
||||
"pushl %[a]\n\t"
|
||||
"popfl\n\t"
|
||||
"pushfl\n\t"
|
||||
"popl %[a]\n\t"
|
||||
"popfl\n\t"
|
||||
:[a]"=r"(eax),[b]"=r"(ebx)
|
||||
:
|
||||
:"cc"
|
||||
);
|
||||
/*No cpuid.*/
|
||||
if(eax==ebx)return 0;
|
||||
# endif
|
||||
cpuid(0,eax,ebx,ecx,edx);
|
||||
/* l e t n I e n i u n e G*/
|
||||
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
||||
/* 6 8 x M T e n i u n e G*/
|
||||
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
||||
int family;
|
||||
int model;
|
||||
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
family=(eax>>8)&0xF;
|
||||
model=(eax>>4)&0xF;
|
||||
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
|
||||
unit, so don't use it.*/
|
||||
if(family==6&&(model==9||model==13||model==14)){
|
||||
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
|
||||
}
|
||||
}
|
||||
/* D M A c i t n e h t u A*/
|
||||
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
||||
/* C S N y b e d o e G*/
|
||||
ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
|
||||
/*AMD, Geode:*/
|
||||
cpuid(0x80000000,eax,ebx,ecx,edx);
|
||||
if(eax<0x80000001)flags=0;
|
||||
else{
|
||||
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_amd_flags(edx,ecx);
|
||||
}
|
||||
/*Also check for SSE.*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags|=oc_parse_intel_flags(edx,ecx);
|
||||
}
|
||||
/*Technically some VIA chips can be configured in the BIOS to return any
|
||||
string here the user wants.
|
||||
There is a special detection method that can be used to identify such
|
||||
processors, but in my opinion, if the user really wants to change it, they
|
||||
deserve what they get.*/
|
||||
/* s l u a H r u a t n e C*/
|
||||
else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
|
||||
/*VIA:*/
|
||||
/*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
|
||||
chips (thanks to the engineers from Centaur Technology who provided it).
|
||||
These chips support Intel-like cpuid info.
|
||||
The C3-2 (Nehemiah) cores appear to, as well.*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
if(eax>=0x80000001){
|
||||
/*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
|
||||
We need to check this even if the Intel test succeeds to pick up 3DNow!
|
||||
support on these processors.
|
||||
Unlike actual AMD processors, we cannot _rely_ on this info, since
|
||||
some cores (e.g., the 693 stepping of the Nehemiah) claim to support
|
||||
this function, yet return edx=0, despite the Intel test indicating
|
||||
MMX support.
|
||||
Therefore the features detected here are strictly added to those
|
||||
detected by the Intel test.*/
|
||||
/*TODO: How about earlier chips?*/
|
||||
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||
/*Note: As of the C7, this function returns Intel-style extended feature
|
||||
flags, not AMD-style.
|
||||
Currently, this only defines bits 11, 20, and 29 (0x20100800), which
|
||||
do not conflict with any of the AMD flags we inspect.
|
||||
For the remaining bits, Intel tells us, "Do not count on their value",
|
||||
but VIA assures us that they will all be zero (at least on the C7 and
|
||||
Isaiah chips).
|
||||
In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
|
||||
(0xC0C00000) for something else, we will have to add code to detect
|
||||
the model to decide when it is appropriate to inspect them.*/
|
||||
flags|=oc_parse_amd_flags(edx,ecx);
|
||||
}
|
||||
}
|
||||
else{
|
||||
/*Implement me.*/
|
||||
flags=0;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
36
engine/thirdparty/libtheora/x86/x86cpu.h
vendored
Normal file
36
engine/thirdparty/libtheora/x86/x86cpu.h
vendored
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_x86cpu_H)
|
||||
# define _x86_x86cpu_H (1)
|
||||
#include "../internal.h"
|
||||
|
||||
/*CPU capability flags.
  Each flag occupies its own bit, so any combination of them can be stored in
   the single ogg_uint32_t returned by oc_cpu_flags_get().*/
#define OC_CPU_X86_MMX (1<<0)
|
||||
#define OC_CPU_X86_3DNOW (1<<1)
|
||||
#define OC_CPU_X86_3DNOWEXT (1<<2)
|
||||
#define OC_CPU_X86_MMXEXT (1<<3)
|
||||
#define OC_CPU_X86_SSE (1<<4)
|
||||
#define OC_CPU_X86_SSE2 (1<<5)
|
||||
#define OC_CPU_X86_PNI (1<<6)
|
||||
#define OC_CPU_X86_SSSE3 (1<<7)
|
||||
#define OC_CPU_X86_SSE4_1 (1<<8)
|
||||
#define OC_CPU_X86_SSE4_2 (1<<9)
|
||||
#define OC_CPU_X86_SSE4A (1<<10)
|
||||
#define OC_CPU_X86_SSE5 (1<<11)
|
||||
|
||||
/*Returns the bitwise OR of the OC_CPU_X86_* flags detected for the host
   processor (0 if none were detected).*/
ogg_uint32_t oc_cpu_flags_get(void);
|
||||
|
||||
#endif
|
||||
63
engine/thirdparty/libtheora/x86/x86enc.c
vendored
Normal file
63
engine/thirdparty/libtheora/x86/x86enc.c
vendored
Normal file
|
|
@ -0,0 +1,63 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#include "x86enc.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Selects the x86-specific encoder routines and enquant table parameters.
  The portable C defaults are installed first; they are then overridden
   according to the CPU flags already stored in the encoder state.
  _enc: The encoder context to initialize.*/
void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
  ogg_uint32_t features;
  features=_enc->state.cpu_flags;
  oc_enc_accel_init_c(_enc);
# if !defined(OC_ENC_USE_VTABLE)
  /*Without a function table there is nothing to dispatch on, so only the
     enquant table layout is recorded, unconditionally.*/
  (void)features;
  _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
  _enc->opt_data.enquant_table_alignment=16;
# else
  if(features&OC_CPU_X86_MMX){
    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
  }
  if(features&OC_CPU_X86_MMXEXT){
    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
  }
  /*Everything below requires SSE2.*/
  if(!(features&OC_CPU_X86_SSE2))return;
#  if defined(OC_X86_64_ASM)
  _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
#  endif
  _enc->opt_vtable.frag_ssd=oc_enc_frag_ssd_sse2;
  _enc->opt_vtable.frag_border_ssd=oc_enc_frag_border_ssd_sse2;
  _enc->opt_vtable.frag_satd=oc_enc_frag_satd_sse2;
  _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_sse2;
  _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_sse2;
  _enc->opt_vtable.enquant_table_init=oc_enc_enquant_table_init_x86;
  _enc->opt_vtable.enquant_table_fixup=oc_enc_enquant_table_fixup_x86;
  _enc->opt_vtable.quantize=oc_enc_quantize_sse2;
  /*The SSE2 quantizer uses its own enquant table layout: 128 16-bit entries,
     16-byte aligned.*/
  _enc->opt_data.enquant_table_size=128*sizeof(ogg_uint16_t);
  _enc->opt_data.enquant_table_alignment=16;
# endif
}
|
||||
#endif
|
||||
114
engine/thirdparty/libtheora/x86/x86enc.h
vendored
Normal file
114
engine/thirdparty/libtheora/x86/x86enc.h
vendored
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_x86enc_H)
|
||||
# define _x86_x86enc_H (1)
|
||||
# include "x86int.h"
|
||||
|
||||
# if defined(OC_X86_ASM)
|
||||
# define oc_enc_accel_init oc_enc_accel_init_x86
|
||||
# if defined(OC_X86_64_ASM)
|
||||
/*x86-64 guarantees SIMD support up through at least SSE2.
|
||||
If the best routine we have available only needs SSE2 (which at the moment
|
||||
covers all of them), then we can avoid runtime detection and the indirect
|
||||
call.*/
|
||||
# define oc_enc_frag_sub(_enc,_diff,_x,_y,_stride) \
|
||||
oc_enc_frag_sub_mmx(_diff,_x,_y,_stride)
|
||||
# define oc_enc_frag_sub_128(_enc,_diff,_x,_stride) \
|
||||
oc_enc_frag_sub_128_mmx(_diff,_x,_stride)
|
||||
# define oc_enc_frag_sad(_enc,_src,_ref,_ystride) \
|
||||
oc_enc_frag_sad_mmxext(_src,_ref,_ystride)
|
||||
# define oc_enc_frag_sad_thresh(_enc,_src,_ref,_ystride,_thresh) \
|
||||
oc_enc_frag_sad_thresh_mmxext(_src,_ref,_ystride,_thresh)
|
||||
# define oc_enc_frag_sad2_thresh(_enc,_src,_ref1,_ref2,_ystride,_thresh) \
|
||||
oc_enc_frag_sad2_thresh_mmxext(_src,_ref1,_ref2,_ystride,_thresh)
|
||||
# define oc_enc_frag_satd(_enc,_dc,_src,_ref,_ystride) \
|
||||
oc_enc_frag_satd_sse2(_dc,_src,_ref,_ystride)
|
||||
# define oc_enc_frag_satd2(_enc,_dc,_src,_ref1,_ref2,_ystride) \
|
||||
oc_enc_frag_satd2_sse2(_dc,_src,_ref1,_ref2,_ystride)
|
||||
# define oc_enc_frag_intra_satd(_enc,_dc,_src,_ystride) \
|
||||
oc_enc_frag_intra_satd_sse2(_dc,_src,_ystride)
|
||||
# define oc_enc_frag_ssd(_enc,_src,_ref,_ystride) \
|
||||
oc_enc_frag_ssd_sse2(_src,_ref,_ystride)
|
||||
# define oc_enc_frag_border_ssd(_enc,_src,_ref,_ystride,_mask) \
|
||||
oc_enc_frag_border_ssd_sse2(_src,_ref,_ystride,_mask)
|
||||
# define oc_enc_frag_copy2(_enc,_dst,_src1,_src2,_ystride) \
|
||||
oc_int_frag_copy2_mmxext(_dst,_ystride,_src1,_src2,_ystride)
|
||||
# define oc_enc_enquant_table_init(_enc,_enquant,_dequant) \
|
||||
oc_enc_enquant_table_init_x86(_enquant,_dequant)
|
||||
# define oc_enc_enquant_table_fixup(_enc,_enquant,_nqis) \
|
||||
oc_enc_enquant_table_fixup_x86(_enquant,_nqis)
|
||||
# define oc_enc_quantize(_enc,_qdct,_dct,_dequant,_enquant) \
|
||||
oc_enc_quantize_sse2(_qdct,_dct,_dequant,_enquant)
|
||||
# define oc_enc_frag_recon_intra(_enc,_dst,_ystride,_residue) \
|
||||
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
|
||||
# define oc_enc_frag_recon_inter(_enc,_dst,_src,_ystride,_residue) \
|
||||
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
|
||||
# define oc_enc_fdct8x8(_enc,_y,_x) \
|
||||
oc_enc_fdct8x8_x86_64sse2(_y,_x)
|
||||
# else
|
||||
# define OC_ENC_USE_VTABLE (1)
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# include "../encint.h"
|
||||
|
||||
void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
|
||||
|
||||
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
|
||||
const unsigned char *_x,const unsigned char *_y,int _stride);
|
||||
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
|
||||
const unsigned char *_x,int _stride);
|
||||
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_satd_sse2(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||
unsigned oc_enc_frag_satd2_sse2(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,
|
||||
const unsigned char *_src,int _ystride);
|
||||
unsigned oc_enc_frag_intra_satd_sse2(int *_dc,
|
||||
const unsigned char *_src,int _ystride);
|
||||
unsigned oc_enc_frag_ssd_sse2(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_border_ssd_sse2(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,ogg_int64_t _mask);
|
||||
void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride);
|
||||
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
void oc_enc_enquant_table_init_x86(void *_enquant,
|
||||
const ogg_uint16_t _dequant[64]);
|
||||
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis);
|
||||
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
const ogg_uint16_t _dequant[64],const void *_enquant);
|
||||
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
|
||||
# if defined(OC_X86_64_ASM)
|
||||
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
# endif
|
||||
|
||||
#endif
|
||||
149
engine/thirdparty/libtheora/x86/x86enquant.c
vendored
Normal file
149
engine/thirdparty/libtheora/x86/x86enquant.c
vendored
Normal file
|
|
@ -0,0 +1,149 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: mmxstate.c 17247 2010-05-28 05:35:32Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "x86enc.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
|
||||
|
||||
/*The default enquant table is not quite suitable for SIMD purposes.
|
||||
First, the m and l parameters need to be separated so that an entire row full
|
||||
of m's or l's can be loaded at a time.
|
||||
Second, x86 SIMD has no element-wise arithmetic right-shift, so we have to
|
||||
emulate one with a multiply.
|
||||
Therefore we translate the shift count into a scale factor.*/
|
||||
void oc_enc_enquant_table_init_x86(void *_enquant,
|
||||
const ogg_uint16_t _dequant[64]){
|
||||
ogg_int16_t *m;
|
||||
ogg_int16_t *l;
|
||||
int zzi;
|
||||
m=(ogg_int16_t *)_enquant;
|
||||
l=m+64;
|
||||
for(zzi=0;zzi<64;zzi++){
|
||||
oc_iquant q;
|
||||
oc_iquant_init(&q,_dequant[zzi]);
|
||||
m[zzi]=q.m;
|
||||
/*q.l must be at least 2 for this to work; fortunately, once all the scale
|
||||
factors are baked in, the minimum quantizer is much larger than that.*/
|
||||
l[zzi]=1<<16-q.l;
|
||||
}
|
||||
}
|
||||
|
||||
void oc_enc_enquant_table_fixup_x86(void *_enquant[3][3][2],int _nqis){
|
||||
int pli;
|
||||
int qii;
|
||||
int qti;
|
||||
for(pli=0;pli<3;pli++)for(qii=1;qii<_nqis;qii++)for(qti=0;qti<2;qti++){
|
||||
((ogg_int16_t *)_enquant[pli][qii][qti])[0]=
|
||||
((ogg_int16_t *)_enquant[pli][0][qti])[0];
|
||||
((ogg_int16_t *)_enquant[pli][qii][qti])[64]=
|
||||
((ogg_int16_t *)_enquant[pli][0][qti])[64];
|
||||
}
|
||||
}
|
||||
|
||||
/*Quantizes a block of 64 coefficients using SSE2.
  All four buffers are accessed with movdqa, which requires 16-byte aligned
   addresses.
  _qdct:    Receives the 64 quantized coefficients.
  _dct:     The 64 input coefficients.
  _dequant: 64 values that are added to twice the input (with the input's
             sign) as the rounding factor.
  _enquant: The multipliers: 64 16-bit values for the first multiply, followed
             at byte offset 0x80 by 64 16-bit values for the second multiply
             that emulates the right shift.
  Return: The index of the last non-zero value written to _qdct, or 0 if no
           value beyond index 0 is non-zero.*/
int oc_enc_quantize_sse2(ogg_int16_t _qdct[64],const ogg_int16_t _dct[64],
|
||||
const ogg_uint16_t _dequant[64],const void *_enquant){
|
||||
ptrdiff_t r;
|
||||
__asm__ __volatile__(
|
||||
"xor %[r],%[r]\n\t"
|
||||
/*Loop through two rows at a time.*/
|
||||
".p2align 4\n\t"
|
||||
"0:\n\t"
|
||||
/*Load the first two rows of the data and the quant matrices.*/
|
||||
"movdqa 0x00(%[dct],%[r]),%%xmm0\n\t"
|
||||
"movdqa 0x10(%[dct],%[r]),%%xmm1\n\t"
|
||||
"movdqa 0x00(%[dq],%[r]),%%xmm2\n\t"
|
||||
"movdqa 0x10(%[dq],%[r]),%%xmm3\n\t"
|
||||
"movdqa 0x00(%[q],%[r]),%%xmm4\n\t"
|
||||
"movdqa 0x10(%[q],%[r]),%%xmm5\n\t"
|
||||
/*Double the input and propagate its sign to the rounding factor.
|
||||
Using SSSE3's psignw would help here, but we need the mask later anyway.*/
|
||||
"movdqa %%xmm0,%%xmm6\n\t"
|
||||
"psraw $15,%%xmm0\n\t"
|
||||
"movdqa %%xmm1,%%xmm7\n\t"
|
||||
"paddw %%xmm6,%%xmm6\n\t"
|
||||
"psraw $15,%%xmm1\n\t"
|
||||
"paddw %%xmm7,%%xmm7\n\t"
|
||||
"paddw %%xmm0,%%xmm2\n\t"
|
||||
"paddw %%xmm1,%%xmm3\n\t"
|
||||
"pxor %%xmm0,%%xmm2\n\t"
|
||||
"pxor %%xmm1,%%xmm3\n\t"
|
||||
/*Add the rounding factor and perform the first multiply.*/
|
||||
"paddw %%xmm2,%%xmm6\n\t"
|
||||
"paddw %%xmm3,%%xmm7\n\t"
|
||||
"pmulhw %%xmm6,%%xmm4\n\t"
|
||||
"pmulhw %%xmm7,%%xmm5\n\t"
|
||||
"movdqa 0x80(%[q],%[r]),%%xmm2\n\t"
|
||||
"movdqa 0x90(%[q],%[r]),%%xmm3\n\t"
|
||||
"paddw %%xmm4,%%xmm6\n\t"
|
||||
"paddw %%xmm5,%%xmm7\n\t"
|
||||
/*Emulate an element-wise right-shift via a second multiply.*/
|
||||
"pmulhw %%xmm2,%%xmm6\n\t"
|
||||
"pmulhw %%xmm3,%%xmm7\n\t"
|
||||
/*Advance the byte offset by two rows; the flags set by this cmp are the ones
   tested by the jle at the bottom of the loop.*/
"add $32,%[r]\n\t"
|
||||
"cmp $96,%[r]\n\t"
|
||||
/*Correct for the sign.*/
|
||||
"psubw %%xmm0,%%xmm6\n\t"
|
||||
"psubw %%xmm1,%%xmm7\n\t"
|
||||
/*Save the result.*/
|
||||
"movdqa %%xmm6,-0x20(%[qdct],%[r])\n\t"
|
||||
"movdqa %%xmm7,-0x10(%[qdct],%[r])\n\t"
|
||||
"jle 0b\n\t"
|
||||
/*Now find the location of the last non-zero value.*/
|
||||
/*Start with the second half (values 32...63): xmm6 and xmm7 still hold the
   last two rows that were stored, so only the two rows before them are
   reloaded.*/
"movdqa 0x50(%[qdct]),%%xmm5\n\t"
|
||||
"movdqa 0x40(%[qdct]),%%xmm4\n\t"
|
||||
"packsswb %%xmm7,%%xmm6\n\t"
|
||||
"packsswb %%xmm5,%%xmm4\n\t"
|
||||
"pxor %%xmm0,%%xmm0\n\t"
|
||||
"mov $-1,%k[dq]\n\t"
|
||||
"pcmpeqb %%xmm0,%%xmm6\n\t"
|
||||
"pcmpeqb %%xmm0,%%xmm4\n\t"
|
||||
"pmovmskb %%xmm6,%k[q]\n\t"
|
||||
"pmovmskb %%xmm4,%k[r]\n\t"
|
||||
"shl $16,%k[q]\n\t"
|
||||
"or %k[r],%k[q]\n\t"
|
||||
"mov $32,%[r]\n\t"
|
||||
/*We have to use xor here instead of not in order to set the flags.*/
|
||||
"xor %k[dq],%k[q]\n\t"
|
||||
"jnz 1f\n\t"
|
||||
/*Nothing non-zero in the second half: repeat the search on values 0...31.*/
"movdqa 0x30(%[qdct]),%%xmm7\n\t"
|
||||
"movdqa 0x20(%[qdct]),%%xmm6\n\t"
|
||||
"movdqa 0x10(%[qdct]),%%xmm5\n\t"
|
||||
"movdqa 0x00(%[qdct]),%%xmm4\n\t"
|
||||
"packsswb %%xmm7,%%xmm6\n\t"
|
||||
"packsswb %%xmm5,%%xmm4\n\t"
|
||||
"pcmpeqb %%xmm0,%%xmm6\n\t"
|
||||
"pcmpeqb %%xmm0,%%xmm4\n\t"
|
||||
"pmovmskb %%xmm6,%k[q]\n\t"
|
||||
"pmovmskb %%xmm4,%k[r]\n\t"
|
||||
"shl $16,%k[q]\n\t"
|
||||
"or %k[r],%k[q]\n\t"
|
||||
"xor %[r],%[r]\n\t"
|
||||
"not %k[q]\n\t"
|
||||
/*Force bit 0 on so that the bsr below always has a set bit to find.*/
"or $1,%k[q]\n\t"
|
||||
"1:\n\t"
|
||||
/*The highest set bit of the mask, plus the base offset in r (32 or 0), is the
   index that gets returned.*/
"bsr %k[q],%k[q]\n\t"
|
||||
"add %k[q],%k[r]\n\t"
|
||||
/*_enquant and _dequant are read-write operands because their registers are
   reused as scratch space once the loop is finished.*/
:[r]"=&a"(r),[q]"+r"(_enquant),[dq]"+r"(_dequant)
|
||||
:[dct]"r"(_dct),[qdct]"r"(_qdct)
|
||||
:"cc","memory"
|
||||
);
|
||||
return (int)r;
|
||||
}
|
||||
|
||||
#endif
|
||||
122
engine/thirdparty/libtheora/x86/x86int.h
vendored
Normal file
122
engine/thirdparty/libtheora/x86/x86int.h
vendored
Normal file
|
|
@ -0,0 +1,122 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_x86int_H)
|
||||
# define _x86_x86int_H (1)
|
||||
# include "../internal.h"
|
||||
|
||||
# if defined(OC_X86_ASM)
|
||||
# define oc_state_accel_init oc_state_accel_init_x86
|
||||
# if defined(OC_X86_64_ASM)
|
||||
/*x86-64 guarantees SIMD support up through at least SSE2.
|
||||
If the best routine we have available only needs SSE2 (which at the moment
|
||||
covers all of them), then we can avoid runtime detection and the indirect
|
||||
call.*/
|
||||
# define oc_frag_copy(_state,_dst,_src,_ystride) \
|
||||
oc_frag_copy_mmx(_dst,_src,_ystride)
|
||||
# define oc_frag_copy_list(_state,_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs) \
|
||||
oc_frag_copy_list_mmx(_dst_frame,_src_frame,_ystride, \
|
||||
_fragis,_nfragis,_frag_buf_offs)
|
||||
# define oc_frag_recon_intra(_state,_dst,_ystride,_residue) \
|
||||
oc_frag_recon_intra_mmx(_dst,_ystride,_residue)
|
||||
# define oc_frag_recon_inter(_state,_dst,_src,_ystride,_residue) \
|
||||
oc_frag_recon_inter_mmx(_dst,_src,_ystride,_residue)
|
||||
# define oc_frag_recon_inter2(_state,_dst,_src1,_src2,_ystride,_residue) \
|
||||
oc_frag_recon_inter2_mmx(_dst,_src1,_src2,_ystride,_residue)
|
||||
# define oc_idct8x8(_state,_y,_x,_last_zzi) \
|
||||
oc_idct8x8_sse2(_y,_x,_last_zzi)
|
||||
# define oc_state_frag_recon oc_state_frag_recon_mmx
|
||||
# define oc_loop_filter_init(_state,_bv,_flimit) \
|
||||
oc_loop_filter_init_mmxext(_bv,_flimit)
|
||||
# define oc_state_loop_filter_frag_rows oc_state_loop_filter_frag_rows_mmxext
|
||||
# define oc_restore_fpu(_state) \
|
||||
oc_restore_fpu_mmx()
|
||||
# else
|
||||
# define OC_STATE_USE_VTABLE (1)
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# include "../state.h"
|
||||
# include "x86cpu.h"
|
||||
|
||||
/*Converts the expression in the argument to a string.*/
|
||||
#define OC_M2STR(_s) #_s
|
||||
|
||||
/*Memory operands do not always include an offset.
|
||||
To avoid warnings, we force an offset with %H (which adds 8).*/
|
||||
# if __GNUC_PREREQ(4,0)
|
||||
# define OC_MEM_OFFS(_offs,_name) \
|
||||
OC_M2STR(_offs-8+%H[_name])
|
||||
# endif
|
||||
/*If your gcc version doesn't support %H, then you get to suffer the warnings.
|
||||
Note that Apple's gas breaks on things like _offs+(%esp): it throws away the
|
||||
whole offset, instead of substituting in 0 for the missing operand to +.*/
|
||||
# if !defined(OC_MEM_OFFS)
|
||||
# define OC_MEM_OFFS(_offs,_name) \
|
||||
OC_M2STR(_offs+%[_name])
|
||||
# endif
|
||||
|
||||
/*Declare an array operand with an exact size.
|
||||
This tells gcc we're going to clobber this memory region, without having to
|
||||
clobber all of "memory" and lets us access local buffers directly using the
|
||||
stack pointer, without allocating a separate register to point to them.*/
|
||||
#define OC_ARRAY_OPERAND(_type,_ptr,_size) \
|
||||
(*({ \
|
||||
struct{_type array_value__[(_size)];} *array_addr__=(void *)(_ptr); \
|
||||
array_addr__; \
|
||||
}))
|
||||
|
||||
/*Declare a read-only (const) array operand with an exact size.
|
||||
This tells gcc exactly which memory region we're going to read, without having to
|
||||
clobber all of "memory" and lets us access local buffers directly using the
|
||||
stack pointer, without allocating a separate register to point to them.*/
|
||||
#define OC_CONST_ARRAY_OPERAND(_type,_ptr,_size) \
|
||||
(*({ \
|
||||
const struct{_type array_value__[(_size)];} *array_addr__= \
|
||||
(const void *)(_ptr); \
|
||||
array_addr__; \
|
||||
}))
|
||||
|
||||
extern const unsigned short __attribute__((aligned(16))) OC_IDCT_CONSTS[64];
|
||||
|
||||
void oc_state_accel_init_x86(oc_theora_state *_state);
|
||||
|
||||
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
|
||||
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit);
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_mmx(void);
|
||||
|
||||
#endif
|
||||
97
engine/thirdparty/libtheora/x86/x86state.c
vendored
Normal file
97
engine/thirdparty/libtheora/x86/x86state.c
vendored
Normal file
|
|
@ -0,0 +1,97 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
#if defined(OC_STATE_USE_VTABLE)
|
||||
/*The variant of OC_FZIG_ZAG installed together with the MMX routines.
  It differs from OC_FZIG_ZAG by having a 4x4 transpose baked into each
   quadrant of the destination.
  The last 64 entries all map to index 64.*/
static const unsigned char OC_FZIG_ZAG_MMX[128]={
  /*Entries   0...63.*/
   0,  8,  1,  2,  9, 16, 24, 17,
  10,  3, 32, 11, 18, 25,  4, 12,
   5, 26, 19, 40, 33, 34, 41, 48,
  27,  6, 13, 20, 28, 21, 14,  7,
  56, 49, 42, 35, 43, 50, 57, 36,
  15, 22, 29, 30, 23, 44, 37, 58,
  51, 59, 38, 45, 52, 31, 60, 53,
  46, 39, 47, 54, 61, 62, 55, 63,
  /*Entries  64...127.*/
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
};
|
||||
#endif
|
||||
|
||||
/*The variant of OC_FZIG_ZAG installed together with the SSE2 routines.
  It differs from OC_FZIG_ZAG by having an 8x8 transpose baked into the
   destination.
  The last 64 entries all map to index 64.*/
static const unsigned char OC_FZIG_ZAG_SSE2[128]={
  /*Entries   0...63.*/
   0,  8,  1,  2,  9, 16, 24, 17,
  10,  3,  4, 11, 18, 25, 32, 40,
  33, 26, 19, 12,  5,  6, 13, 20,
  27, 34, 41, 48, 56, 49, 42, 35,
  28, 21, 14,  7, 15, 22, 29, 36,
  43, 50, 57, 58, 51, 44, 37, 30,
  23, 31, 38, 45, 52, 59, 60, 53,
  46, 39, 47, 54, 61, 62, 55, 63,
  /*Entries  64...127.*/
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
};
|
||||
|
||||
/*Selects the x86-specific routines and zig-zag table for _state.
  The portable C defaults are installed first and the detected CPU flags are
   recorded in _state->cpu_flags.
  With OC_STATE_USE_VTABLE defined, the MMX, MMXEXT and SSE2 overrides are then
   applied in that order, so the most capable supported version of each entry
   wins.
  Without it, no function pointers are changed and the SSE2 zig-zag table is
   selected regardless of the CPU flags.*/
void oc_state_accel_init_x86(oc_theora_state *_state){
  oc_state_accel_init_c(_state);
  _state->cpu_flags=oc_cpu_flags_get();
#if defined(OC_STATE_USE_VTABLE)
  /*Baseline MMX versions of everything we have an accelerated routine for.*/
  if((_state->cpu_flags&OC_CPU_X86_MMX)!=0){
    _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
    _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
    _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
    _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
    _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
    _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
    _state->opt_vtable.state_loop_filter_frag_rows=
     oc_state_loop_filter_frag_rows_mmx;
    _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
  }
  /*MMXEXT only replaces the loop filter.*/
  if((_state->cpu_flags&OC_CPU_X86_MMXEXT)!=0){
    _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmxext;
    _state->opt_vtable.state_loop_filter_frag_rows=
     oc_state_loop_filter_frag_rows_mmxext;
  }
  /*SSE2 replaces the iDCT, and with it the zig-zag table it expects.*/
  if((_state->cpu_flags&OC_CPU_X86_SSE2)!=0){
    _state->opt_vtable.idct8x8=oc_idct8x8_sse2;
    _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
  }
#else
  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_SSE2;
#endif
}
|
||||
#endif
|
||||
244
engine/thirdparty/libtheora/x86/x86zigzag.h
vendored
Normal file
244
engine/thirdparty/libtheora/x86/x86zigzag.h
vendored
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_x86zigzag_H)
|
||||
# define _x86_x86zigzag_H (1)
|
||||
# include "x86enc.h"
|
||||
|
||||
|
||||
/*Converts DCT coefficients from transposed order into zig-zag scan order and
|
||||
stores them in %[y].
|
||||
This relies on two macros to load the contents of each row:
|
||||
OC_ZZ_LOAD_ROW_LO(row,"reg") and OC_ZZ_LOAD_ROW_HI(row,"reg"), which load
|
||||
the first four and second four entries of each row into the specified
|
||||
register, respectively.
|
||||
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
|
||||
(because when the rows are already in SSE2 registers, loading the high half
|
||||
destructively modifies the register).
|
||||
The index of each output element in the original 64-element array should wind
|
||||
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||
each 4-tuple below):
|
||||
A 0 8 1 2 9 16 24 17 B
|
||||
C 10 3 4 11 18 25 32 40 E
|
||||
F 33 26 19 12 5 6 13 20 D
|
||||
G 27 34 41 48 56 49 42 35 I
|
||||
L 28 21 14 7 15 22 29 36 M
|
||||
H 43 50 57 58 51 44 37 30 O
|
||||
N 23 31 38 45 52 59 60 53 J
|
||||
P 46 39 47 54 61 62 55 63 K
|
||||
The order of the coefficients within each tuple is reversed in the comments
|
||||
below to reflect the usual MSB to LSB notation.
Each 4-tuple is written with one 8-byte movq, at byte offsets 0x00...0x78 from
%[y], and all of mm0...mm7 are overwritten.
The last line of the definition ends in a line continuation, so the line that
follows the definition must remain blank.*/
|
||||
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
|
||||
OC_ZZ_LOAD_ROW_LO(0,"%%mm0") /*mm0=03 02 01 00*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(1,"%%mm1") /*mm1=11 10 09 08*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(2,"%%mm2") /*mm2=19 18 17 16*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(3,"%%mm3") /*mm3=27 26 25 24*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(0,"%%mm4") /*mm4=07 06 05 04*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(1,"%%mm5") /*mm5=15 14 13 12*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(2,"%%mm6") /*mm6=23 22 21 20*/ \
|
||||
"movq %%mm0,%%mm7\n\t" /*mm7=03 02 01 00*/ \
|
||||
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=11 10 03 02*/ \
|
||||
"pshufw $0x39,%%mm4,%%mm4\n\t" /*mm4=04 07 06 05*/ \
|
||||
"punpcklwd %%mm0,%%mm1\n\t" /*mm1=03 09 02 08*/ \
|
||||
"pshufw $0x39,%%mm5,%%mm5\n\t" /*mm5=12 15 14 13*/ \
|
||||
"punpcklwd %%mm1,%%mm7\n\t" /*mm7=02 01 08 00 *A*/ \
|
||||
"movq %%mm7,0x00(%[y])\n\t" \
|
||||
"punpckhwd %%mm4,%%mm1\n\t" /*mm1=04 03 07 09*/ \
|
||||
"movq %%mm2,%%mm7\n\t" /*mm7=19 18 17 16*/ \
|
||||
"punpckhdq %%mm1,%%mm0\n\t" /*mm0=04 03 11 10*/ \
|
||||
"punpckhwd %%mm5,%%mm7\n\t" /*mm7=12 19 15 18*/ \
|
||||
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=25 07 24 09*/ \
|
||||
"punpcklwd %%mm6,%%mm5\n\t" /*mm5=21 14 20 13*/ \
|
||||
"punpcklwd %%mm2,%%mm1\n\t" /*mm1=17 24 16 09 *B*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(4,"%%mm2") /*mm2=35 34 33 32*/ \
|
||||
"movq %%mm1,0x08(%[y])\n\t" \
|
||||
OC_ZZ_LOAD_ROW_LO(5,"%%mm1") /*mm1=43 42 41 40*/ \
|
||||
"pshufw $0x78,%%mm0,%%mm0\n\t" /*mm0=11 04 03 10 *C*/ \
|
||||
"movq %%mm0,0x10(%[y])\n\t" \
|
||||
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=?? 07 23 22*/ \
|
||||
"punpckldq %%mm5,%%mm4\n\t" /*mm4=20 13 06 05 *D*/ \
|
||||
"movq %%mm4,0x28(%[y])\n\t" \
|
||||
"psrlq $16,%%mm3\n\t" /*mm3=.. 27 26 25*/ \
|
||||
"pshufw $0x0E,%%mm2,%%mm0\n\t" /*mm0=?? ?? 35 34*/ \
|
||||
"movq %%mm7,%%mm4\n\t" /*mm4=12 19 15 18*/ \
|
||||
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=26 33 25 32*/ \
|
||||
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=41 15 40 18*/ \
|
||||
"punpckhwd %%mm1,%%mm3\n\t" /*mm3=43 .. 42 27*/ \
|
||||
"punpckldq %%mm2,%%mm4\n\t" /*mm4=25 32 40 18*/ \
|
||||
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=35 42 34 27*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(6,"%%mm0") /*mm0=51 50 49 48*/ \
|
||||
"pshufw $0x6C,%%mm4,%%mm4\n\t" /*mm4=40 32 25 18 *E*/ \
|
||||
"movq %%mm4,0x18(%[y])\n\t" \
|
||||
OC_ZZ_LOAD_ROW_LO(7,"%%mm4") /*mm4=59 58 57 56*/ \
|
||||
"punpckhdq %%mm7,%%mm2\n\t" /*mm2=12 19 26 33 *F*/ \
|
||||
"movq %%mm2,0x20(%[y])\n\t" \
|
||||
"pshufw $0xD0,%%mm1,%%mm1\n\t" /*mm1=43 41 ?? ??*/ \
|
||||
"pshufw $0x87,%%mm0,%%mm0\n\t" /*mm0=50 48 49 51*/ \
|
||||
"movq %%mm3,%%mm2\n\t" /*mm2=35 42 34 27*/ \
|
||||
"punpckhwd %%mm0,%%mm1\n\t" /*mm1=50 43 48 41*/ \
|
||||
"pshufw $0x93,%%mm4,%%mm4\n\t" /*mm4=58 57 56 59*/ \
|
||||
"punpckldq %%mm1,%%mm3\n\t" /*mm3=48 41 34 27 *G*/ \
|
||||
"movq %%mm3,0x30(%[y])\n\t" \
|
||||
"punpckhdq %%mm4,%%mm1\n\t" /*mm1=58 57 50 43 *H*/ \
|
||||
"movq %%mm1,0x50(%[y])\n\t" \
|
||||
OC_ZZ_LOAD_ROW_HI(7,"%%mm1") /*mm1=63 62 61 60*/ \
|
||||
"punpcklwd %%mm0,%%mm4\n\t" /*mm4=49 56 51 59*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(6,"%%mm0") /*mm0=55 54 53 52*/ \
|
||||
"psllq $16,%%mm6\n\t" /*mm6=07 23 22 ..*/ \
|
||||
"movq %%mm4,%%mm3\n\t" /*mm3=49 56 51 59*/ \
|
||||
"punpckhdq %%mm2,%%mm4\n\t" /*mm4=35 42 49 56 *I*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(3,"%%mm2") /*mm2=31 30 29 28*/ \
|
||||
"movq %%mm4,0x38(%[y])\n\t" \
|
||||
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=61 51 60 59*/ \
|
||||
"punpcklwd %%mm6,%%mm7\n\t" /*mm7=22 15 .. ??*/ \
|
||||
"movq %%mm3,%%mm4\n\t" /*mm4=61 51 60 59*/ \
|
||||
"punpcklwd %%mm0,%%mm3\n\t" /*mm3=53 60 52 59*/ \
|
||||
"punpckhwd %%mm0,%%mm4\n\t" /*mm4=55 61 54 51*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(4,"%%mm0") /*mm0=39 38 37 36*/ \
|
||||
"pshufw $0xE1,%%mm3,%%mm3\n\t" /*mm3=53 60 59 52 *J*/ \
|
||||
"movq %%mm3,0x68(%[y])\n\t" \
|
||||
"movq %%mm4,%%mm3\n\t" /*mm3=?? ?? 54 51*/ \
|
||||
"pshufw $0x39,%%mm2,%%mm2\n\t" /*mm2=28 31 30 29*/ \
|
||||
"punpckhwd %%mm1,%%mm4\n\t" /*mm4=63 55 62 61 *K*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(5,"%%mm1") /*mm1=47 46 45 44*/ \
|
||||
"movq %%mm4,0x78(%[y])\n\t" \
|
||||
"punpckhwd %%mm2,%%mm6\n\t" /*mm6=28 07 31 23*/ \
|
||||
"punpcklwd %%mm0,%%mm2\n\t" /*mm2=37 30 36 29*/ \
|
||||
"punpckhdq %%mm6,%%mm5\n\t" /*mm5=28 07 21 14*/ \
|
||||
"pshufw $0x4B,%%mm2,%%mm2\n\t" /*mm2=36 29 30 37*/ \
|
||||
"pshufw $0x87,%%mm5,%%mm5\n\t" /*mm5=07 14 21 28 *L*/ \
|
||||
"movq %%mm5,0x40(%[y])\n\t" \
|
||||
"punpckhdq %%mm2,%%mm7\n\t" /*mm7=36 29 22 15 *M*/ \
|
||||
"movq %%mm7,0x48(%[y])\n\t" \
|
||||
"pshufw $0x9C,%%mm1,%%mm1\n\t" /*mm1=46 45 47 44*/ \
|
||||
"punpckhwd %%mm1,%%mm0\n\t" /*mm0=46 39 45 38*/ \
|
||||
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=47 54 44 51*/ \
|
||||
"punpckldq %%mm0,%%mm6\n\t" /*mm6=45 38 31 23 *N*/ \
|
||||
"movq %%mm6,0x60(%[y])\n\t" \
|
||||
"punpckhdq %%mm3,%%mm0\n\t" /*mm0=47 54 46 39*/ \
|
||||
"punpckldq %%mm2,%%mm3\n\t" /*mm3=30 37 44 51 *O*/ \
|
||||
"movq %%mm3,0x58(%[y])\n\t" \
|
||||
"pshufw $0xB1,%%mm0,%%mm0\n\t" /*mm0=54 47 39 46 *P*/ \
|
||||
"movq %%mm0,0x70(%[y])\n\t" \
|
||||
|
||||
/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
|
||||
order and stores them in %[qdct].
|
||||
The index of each output element in the original 64-element array should wind
|
||||
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||
each 4-tuple below):
|
||||
A 0 1 8 16 9 2 3 10 B
|
||||
D 17 24 32 25 18 11 4 5 C
|
||||
E 12 19 26 33 40 48 41 34 I
|
||||
H 27 20 13 6 7 14 21 28 G
|
||||
K 35 42 49 56 57 50 43 36 J
|
||||
F 29 22 15 23 30 37 44 51 M
|
||||
P 58 59 52 45 38 31 39 46 L
|
||||
N 53 60 61 54 47 55 62 63 O
|
||||
The order of the coefficients within each tuple is reversed in the comments
|
||||
below to reflect the usual MSB to LSB notation.
Each 4-tuple is read and written with one 8-byte movq, at byte offsets
0x00...0x78 from %[dct] and %[qdct], and all of mm0...mm7 are overwritten.
The last line of the definition ends in a line continuation, so the line that
follows the definition must remain blank.*/
|
||||
#define OC_ZIG_ZAG_MMXEXT \
|
||||
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
|
||||
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
|
||||
"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
|
||||
"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
|
||||
"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
|
||||
"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
|
||||
"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
|
||||
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
|
||||
"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
|
||||
"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
|
||||
"movq %%mm0,0x00(%[qdct])\n\t" \
|
||||
"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
|
||||
"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
|
||||
"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
|
||||
"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
|
||||
"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
|
||||
"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
|
||||
"movq %%mm6,0x08(%[qdct])\n\t" \
|
||||
"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
|
||||
"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
|
||||
"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
|
||||
"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
|
||||
"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
|
||||
"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
|
||||
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D*/ \
|
||||
"movq %%mm2,0x10(%[qdct])\n\t" \
|
||||
"movq %%mm3,0x18(%[qdct])\n\t" \
|
||||
"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
|
||||
"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
|
||||
"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
|
||||
"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
|
||||
"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
|
||||
"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
|
||||
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
|
||||
"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
|
||||
"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
|
||||
"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
|
||||
"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
|
||||
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
|
||||
"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
|
||||
"movq %%mm0,0x20(%[qdct])\n\t" \
|
||||
"movq %%mm3,0x50(%[qdct])\n\t" \
|
||||
"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
|
||||
"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
|
||||
"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
|
||||
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
|
||||
"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
|
||||
"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
|
||||
"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
|
||||
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
|
||||
"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
|
||||
"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
|
||||
"movq %%mm2,0x30(%[qdct])\n\t" \
|
||||
"movq %%mm6,0x38(%[qdct])\n\t" \
|
||||
"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
|
||||
"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
|
||||
"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
|
||||
"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
|
||||
"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
|
||||
"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
|
||||
"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
|
||||
"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
|
||||
"movq %%mm0,0x28(%[qdct])\n\t" \
|
||||
"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
|
||||
"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
|
||||
"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
|
||||
"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
|
||||
"movq %%mm4,0x40(%[qdct])\n\t" \
|
||||
"movq %%mm6,0x48(%[qdct])\n\t" \
|
||||
"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
|
||||
"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
|
||||
"psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
|
||||
"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
|
||||
"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
|
||||
"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
|
||||
"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
|
||||
"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
|
||||
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
|
||||
"movq %%mm2,0x68(%[qdct])\n\t" \
|
||||
"movq %%mm1,0x58(%[qdct])\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
|
||||
"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
|
||||
"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
|
||||
"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
|
||||
"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
|
||||
"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
|
||||
"movq %%mm6,0x70(%[qdct])\n\t" \
|
||||
"movq %%mm5,0x78(%[qdct])\n\t" \
|
||||
"movq %%mm7,0x60(%[qdct])\n\t" \
|
||||
|
||||
#endif
|
||||
983
engine/thirdparty/libtheora/x86_vc/mmxencfrag.c
vendored
Normal file
983
engine/thirdparty/libtheora/x86_vc/mmxencfrag.c
vendored
Normal file
|
|
@ -0,0 +1,983 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: dsp_mmx.c 14579 2008-03-12 06:42:40Z xiphmont $
|
||||
|
||||
********************************************************************/
|
||||
#include <stddef.h>
|
||||
#include "x86enc.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Computes the sum of absolute differences (SAD) between two blocks of 8 rows
of 8 bytes each, using psadbw.
_src: The first block.
_ref: The second block.
_ystride: The distance in bytes between consecutive rows; the same stride is
used for both blocks.
Return: The sum of the 8 per-row psadbw results.
All of mm0...mm7 are overwritten, and no emms is executed here.*/
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride){
|
||||
ptrdiff_t ret;
|
||||
__asm{
|
||||
#define SRC esi
|
||||
#define REF edx
|
||||
#define YSTRIDE ecx
|
||||
#define YSTRIDE3 edi
|
||||
mov YSTRIDE,_ystride
|
||||
mov SRC,_src
|
||||
mov REF,_ref
|
||||
/*Load the first 4 rows of each block.*/
|
||||
movq mm0,[SRC]
|
||||
movq mm1,[REF]
|
||||
movq mm2,[SRC][YSTRIDE]
|
||||
movq mm3,[REF][YSTRIDE]
|
||||
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
|
||||
movq mm4,[SRC+YSTRIDE*2]
|
||||
movq mm5,[REF+YSTRIDE*2]
|
||||
movq mm6,[SRC+YSTRIDE3]
|
||||
movq mm7,[REF+YSTRIDE3]
|
||||
/*Compute their SADs and add them in mm0*/
|
||||
psadbw mm0,mm1
|
||||
psadbw mm2,mm3
|
||||
lea SRC,[SRC+YSTRIDE*4]
|
||||
paddw mm0,mm2
|
||||
lea REF,[REF+YSTRIDE*4]
|
||||
/*Load the next 3 rows as registers become available.*/
|
||||
movq mm2,[SRC]
|
||||
movq mm3,[REF]
|
||||
psadbw mm4,mm5
|
||||
psadbw mm6,mm7
|
||||
paddw mm0,mm4
|
||||
movq mm5,[REF+YSTRIDE]
|
||||
movq mm4,[SRC+YSTRIDE]
|
||||
paddw mm0,mm6
|
||||
movq mm7,[REF+YSTRIDE*2]
|
||||
movq mm6,[SRC+YSTRIDE*2]
|
||||
/*Start adding their SADs to mm0*/
|
||||
psadbw mm2,mm3
|
||||
psadbw mm4,mm5
|
||||
paddw mm0,mm2
|
||||
psadbw mm6,mm7
|
||||
/*Load last row as registers become available.*/
|
||||
movq mm2,[SRC+YSTRIDE3]
|
||||
movq mm3,[REF+YSTRIDE3]
|
||||
/*And finish adding up their SADs.*/
|
||||
paddw mm0,mm4
|
||||
psadbw mm2,mm3
|
||||
paddw mm0,mm6
|
||||
paddw mm0,mm2
|
||||
/*Store the low 32 bits of the accumulated total.*/
movd [ret],mm0
|
||||
#undef SRC
|
||||
#undef REF
|
||||
#undef YSTRIDE
|
||||
#undef YSTRIDE3
|
||||
}
|
||||
return (unsigned)ret;
|
||||
}
|
||||
|
||||
/*Threshold variant of the block SAD.
  This version never terminates early: _thresh is ignored and the complete SAD
   computed by oc_enc_frag_sad_mmxext() is always returned.*/
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
 const unsigned char *_ref,int _ystride,unsigned _thresh){
  unsigned sad;
  (void)_thresh;
  sad=oc_enc_frag_sad_mmxext(_src,_ref,_ystride);
  return sad;
}
|
||||
|
||||
/*One step of the two-reference SAD: processes two rows and pre-loads the next
two.
On entry mm0/mm1 hold a row from REF1/REF2, mm2/mm3 hold the following row
from REF1/REF2, mm4/mm5 hold the two matching rows from SRC, and mm7 holds the
low-bit mask (the caller in this file sets every byte of it to 1).
The SADs of both rows against the corrected average of the references are
added to RET; REF1, REF2 and SRC are advanced by two rows, and mm0...mm5 are
reloaded from the new positions.
REF1, REF2, SRC, YSTRIDE and RET must be #defined as registers where this is
expanded.*/
#define OC_SAD2_LOOP __asm{ \
|
||||
/*We want to compute (mm0+mm1>>1) on unsigned bytes without overflow, but \
|
||||
pavgb computes (mm0+mm1+1>>1). \
|
||||
The latter is exactly 1 too large when the low bit of two corresponding \
|
||||
bytes is only set in one of them. \
|
||||
Therefore we pxor the operands, pand to mask out the low bits, and psubb to \
|
||||
correct the output of pavgb.*/ \
|
||||
__asm movq mm6,mm0 \
|
||||
__asm lea REF1,[REF1+YSTRIDE*2] \
|
||||
__asm pxor mm0,mm1 \
|
||||
__asm pavgb mm6,mm1 \
|
||||
__asm lea REF2,[REF2+YSTRIDE*2] \
|
||||
__asm movq mm1,mm2 \
|
||||
__asm pand mm0,mm7 \
|
||||
__asm pavgb mm2,mm3 \
|
||||
__asm pxor mm1,mm3 \
|
||||
__asm movq mm3,[REF2+YSTRIDE] \
|
||||
__asm psubb mm6,mm0 \
|
||||
__asm movq mm0,[REF1] \
|
||||
__asm pand mm1,mm7 \
|
||||
__asm psadbw mm4,mm6 \
|
||||
__asm movd mm6,RET \
|
||||
__asm psubb mm2,mm1 \
|
||||
__asm movq mm1,[REF2] \
|
||||
__asm lea SRC,[SRC+YSTRIDE*2] \
|
||||
__asm psadbw mm5,mm2 \
|
||||
__asm movq mm2,[REF1+YSTRIDE] \
|
||||
__asm paddw mm5,mm4 \
|
||||
__asm movq mm4,[SRC] \
|
||||
__asm paddw mm6,mm5 \
|
||||
__asm movq mm5,[SRC+YSTRIDE] \
|
||||
__asm movd RET,mm6 \
|
||||
}
|
||||
|
||||
/*Same as OC_SAD2_LOOP, but does not pre-load the next two rows.
The pointers are not advanced either: the two rows already held in mm0...mm5
are processed, and their SADs are added to RET.*/
|
||||
#define OC_SAD2_TAIL __asm{ \
|
||||
__asm movq mm6,mm0 \
|
||||
__asm pavgb mm0,mm1 \
|
||||
__asm pxor mm6,mm1 \
|
||||
__asm movq mm1,mm2 \
|
||||
__asm pand mm6,mm7 \
|
||||
__asm pavgb mm2,mm3 \
|
||||
__asm pxor mm1,mm3 \
|
||||
__asm psubb mm0,mm6 \
|
||||
__asm pand mm1,mm7 \
|
||||
__asm psadbw mm4,mm0 \
|
||||
__asm psubb mm2,mm1 \
|
||||
__asm movd mm6,RET \
|
||||
__asm psadbw mm5,mm2 \
|
||||
__asm paddw mm5,mm4 \
|
||||
__asm paddw mm6,mm5 \
|
||||
__asm movd RET,mm6 \
|
||||
}
|
||||
|
||||
/*Computes the SAD between a source block and the average of two reference
blocks, over 8 rows of 8 bytes.
The average is (a+b>>1), built from pavgb plus a low-bit correction (see
OC_SAD2_LOOP).
_src: The source block.
_ref1: The first reference block.
_ref2: The second reference block.
_ystride: The distance in bytes between consecutive rows; the same stride is
used for all three blocks.
_thresh: Not referenced; the full SAD is always computed.
Return: The accumulated SAD.
Three OC_SAD2_LOOP steps and one OC_SAD2_TAIL step handle two rows each.*/
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh){
|
||||
ptrdiff_t ret;
|
||||
__asm{
|
||||
#define REF1 ecx
|
||||
#define REF2 edi
|
||||
#define YSTRIDE esi
|
||||
#define SRC edx
|
||||
#define RET eax
|
||||
mov YSTRIDE,_ystride
|
||||
mov SRC,_src
|
||||
mov REF1,_ref1
|
||||
mov REF2,_ref2
|
||||
movq mm0,[REF1]
|
||||
movq mm1,[REF2]
|
||||
movq mm2,[REF1+YSTRIDE]
|
||||
movq mm3,[REF2+YSTRIDE]
|
||||
xor RET,RET
|
||||
movq mm4,[SRC]
|
||||
pxor mm7,mm7
|
||||
pcmpeqb mm6,mm6
|
||||
movq mm5,[SRC+YSTRIDE]
|
||||
/*mm7=0-0xFF=0x01 in every byte: the low-bit mask used by OC_SAD2_LOOP and
OC_SAD2_TAIL.*/
psubb mm7,mm6
|
||||
OC_SAD2_LOOP
|
||||
OC_SAD2_LOOP
|
||||
OC_SAD2_LOOP
|
||||
OC_SAD2_TAIL
|
||||
mov [ret],RET
|
||||
#undef REF1
|
||||
#undef REF2
|
||||
#undef YSTRIDE
|
||||
#undef SRC
|
||||
#undef RET
|
||||
}
|
||||
return (unsigned)ret;
|
||||
}
|
||||
|
||||
/*Load an 8x4 array of pixel values from %[src] and %[ref] and compute their
|
||||
16-bit difference in mm0...mm7.
Here that means 8 rows of 4 bytes each, read with movd from SRC and REF at
byte offset _off within each row; mm0...mm7 receive the differences (source
minus reference) for rows 0...7.
mm0 is spilled to [_off*2+BUF] while it is reused as scratch and is reloaded
at the end.
SRC and REF are stepped down the block and then restored, and SRC_YSTRIDE and
REF_YSTRIDE are each negated twice, so all four hold their entry values on
exit.
SRC, REF, SRC_YSTRIDE, REF_YSTRIDE and BUF must be #defined as registers where
this is expanded.*/
|
||||
#define OC_LOAD_SUB_8x4(_off) __asm{ \
|
||||
__asm movd mm0,[_off+SRC] \
|
||||
__asm movd mm4,[_off+REF] \
|
||||
__asm movd mm1,[_off+SRC+SRC_YSTRIDE] \
|
||||
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
|
||||
__asm movd mm5,[_off+REF+REF_YSTRIDE] \
|
||||
__asm lea REF,[REF+REF_YSTRIDE*2] \
|
||||
__asm movd mm2,[_off+SRC] \
|
||||
__asm movd mm7,[_off+REF] \
|
||||
__asm movd mm3,[_off+SRC+SRC_YSTRIDE] \
|
||||
__asm movd mm6,[_off+REF+REF_YSTRIDE] \
|
||||
__asm punpcklbw mm0,mm4 \
|
||||
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
|
||||
__asm punpcklbw mm4,mm4 \
|
||||
__asm lea REF,[REF+REF_YSTRIDE*2] \
|
||||
__asm psubw mm0,mm4 \
|
||||
__asm movd mm4,[_off+SRC] \
|
||||
__asm movq [_off*2+BUF],mm0 \
|
||||
__asm movd mm0,[_off+REF] \
|
||||
__asm punpcklbw mm1,mm5 \
|
||||
__asm punpcklbw mm5,mm5 \
|
||||
__asm psubw mm1,mm5 \
|
||||
__asm movd mm5,[_off+SRC+SRC_YSTRIDE] \
|
||||
__asm punpcklbw mm2,mm7 \
|
||||
__asm punpcklbw mm7,mm7 \
|
||||
__asm psubw mm2,mm7 \
|
||||
__asm movd mm7,[_off+REF+REF_YSTRIDE] \
|
||||
__asm punpcklbw mm3,mm6 \
|
||||
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
|
||||
__asm punpcklbw mm6,mm6 \
|
||||
__asm psubw mm3,mm6 \
|
||||
__asm movd mm6,[_off+SRC] \
|
||||
__asm punpcklbw mm4,mm0 \
|
||||
__asm lea REF,[REF+REF_YSTRIDE*2] \
|
||||
__asm punpcklbw mm0,mm0 \
|
||||
__asm lea SRC,[SRC+SRC_YSTRIDE*2] \
|
||||
__asm psubw mm4,mm0 \
|
||||
__asm movd mm0,[_off+REF] \
|
||||
__asm punpcklbw mm5,mm7 \
|
||||
__asm neg SRC_YSTRIDE \
|
||||
__asm punpcklbw mm7,mm7 \
|
||||
__asm psubw mm5,mm7 \
|
||||
__asm movd mm7,[_off+SRC+SRC_YSTRIDE] \
|
||||
__asm punpcklbw mm6,mm0 \
|
||||
__asm lea REF,[REF+REF_YSTRIDE*2] \
|
||||
__asm punpcklbw mm0,mm0 \
|
||||
__asm neg REF_YSTRIDE \
|
||||
__asm psubw mm6,mm0 \
|
||||
__asm movd mm0,[_off+REF+REF_YSTRIDE] \
|
||||
__asm lea SRC,[SRC+SRC_YSTRIDE*8] \
|
||||
__asm punpcklbw mm7,mm0 \
|
||||
__asm neg SRC_YSTRIDE \
|
||||
__asm punpcklbw mm0,mm0 \
|
||||
__asm lea REF,[REF+REF_YSTRIDE*8] \
|
||||
__asm psubw mm7,mm0 \
|
||||
__asm neg REF_YSTRIDE \
|
||||
__asm movq mm0,[_off*2+BUF] \
|
||||
}
|
||||
|
||||
/*Load an 8x4 array of pixel values from %[src] into %%mm0...%%mm7.
Here that means 8 rows of 4 bytes each, read with movd at byte offset _off
within each row and widened to 16-bit words with a zero high byte.
Rows 0...3 are addressed from SRC and rows 4...7 from SRC4.
mm0...mm3 are widened by unpacking against a zeroed mm7; mm4...mm7 are widened
by unpacking each register with itself and shifting each word right by 8.
SRC, SRC4, YSTRIDE and YSTRIDE3 must be #defined as registers where this is
expanded.*/
|
||||
#define OC_LOAD_8x4(_off) __asm{ \
|
||||
__asm movd mm0,[_off+SRC] \
|
||||
__asm movd mm1,[_off+SRC+YSTRIDE] \
|
||||
__asm movd mm2,[_off+SRC+YSTRIDE*2] \
|
||||
__asm pxor mm7,mm7 \
|
||||
__asm movd mm3,[_off+SRC+YSTRIDE3] \
|
||||
__asm punpcklbw mm0,mm7 \
|
||||
__asm movd mm4,[_off+SRC4] \
|
||||
__asm punpcklbw mm1,mm7 \
|
||||
__asm movd mm5,[_off+SRC4+YSTRIDE] \
|
||||
__asm punpcklbw mm2,mm7 \
|
||||
__asm movd mm6,[_off+SRC4+YSTRIDE*2] \
|
||||
__asm punpcklbw mm3,mm7 \
|
||||
__asm movd mm7,[_off+SRC4+YSTRIDE3] \
|
||||
__asm punpcklbw mm4,mm4 \
|
||||
__asm punpcklbw mm5,mm5 \
|
||||
__asm psrlw mm4,8 \
|
||||
__asm psrlw mm5,8 \
|
||||
__asm punpcklbw mm6,mm6 \
|
||||
__asm punpcklbw mm7,mm7 \
|
||||
__asm psrlw mm6,8 \
|
||||
__asm psrlw mm7,8 \
|
||||
}
|
||||
|
||||
/*Performs the first two stages of an 8-point 1-D Hadamard transform.
|
||||
The transform is performed in place, except that outputs 0-3 are swapped with
|
||||
outputs 4-7.
|
||||
Outputs 2, 3, 6, and 7 from the second stage are negated (which allows us to
|
||||
perform this stage in place with no temporary registers).
The inputs and outputs are mm0...mm7; only paddw and psubw are used, so each
16-bit word position is transformed independently of the others.*/
|
||||
#define OC_HADAMARD_AB_8x4 __asm{ \
|
||||
/*Stage A: \
|
||||
Outputs 0-3 are swapped with 4-7 here.*/ \
|
||||
__asm paddw mm5,mm1 \
|
||||
__asm paddw mm6,mm2 \
|
||||
__asm paddw mm1,mm1 \
|
||||
__asm paddw mm2,mm2 \
|
||||
__asm psubw mm1,mm5 \
|
||||
__asm psubw mm2,mm6 \
|
||||
__asm paddw mm7,mm3 \
|
||||
__asm paddw mm4,mm0 \
|
||||
__asm paddw mm3,mm3 \
|
||||
__asm paddw mm0,mm0 \
|
||||
__asm psubw mm3,mm7 \
|
||||
__asm psubw mm0,mm4 \
|
||||
/*Stage B:*/ \
|
||||
__asm paddw mm0,mm2 \
|
||||
__asm paddw mm1,mm3 \
|
||||
__asm paddw mm4,mm6 \
|
||||
__asm paddw mm5,mm7 \
|
||||
__asm paddw mm2,mm2 \
|
||||
__asm paddw mm3,mm3 \
|
||||
__asm paddw mm6,mm6 \
|
||||
__asm paddw mm7,mm7 \
|
||||
__asm psubw mm2,mm0 \
|
||||
__asm psubw mm3,mm1 \
|
||||
__asm psubw mm6,mm4 \
|
||||
__asm psubw mm7,mm5 \
|
||||
}
|
||||
|
||||
/*Performs the last stage of an 8-point 1-D Hadamard transform in place.
|
||||
Outputs 1, 3, 5, and 7 are negated (which allows us to perform this stage in
|
||||
place with no temporary registers).
Each register pair (mm0,mm1), (mm2,mm3), (mm4,mm5), (mm6,mm7) is replaced by
its sum in the even register and the negated difference in the odd one.*/
|
||||
#define OC_HADAMARD_C_8x4 __asm{ \
|
||||
/*Stage C:*/ \
|
||||
__asm paddw mm0,mm1 \
|
||||
__asm paddw mm2,mm3 \
|
||||
__asm paddw mm4,mm5 \
|
||||
__asm paddw mm6,mm7 \
|
||||
__asm paddw mm1,mm1 \
|
||||
__asm paddw mm3,mm3 \
|
||||
__asm paddw mm5,mm5 \
|
||||
__asm paddw mm7,mm7 \
|
||||
__asm psubw mm1,mm0 \
|
||||
__asm psubw mm3,mm2 \
|
||||
__asm psubw mm5,mm4 \
|
||||
__asm psubw mm7,mm6 \
|
||||
}
|
||||
|
||||
/*Performs an 8-point 1-D Hadamard transform.
|
||||
The transform is performed in place, except that outputs 0-3 are swapped with
|
||||
outputs 4-7.
|
||||
Outputs 1, 2, 5 and 6 are negated (which allows us to perform the transform
|
||||
in place with no temporary registers).
This expands to OC_HADAMARD_AB_8x4 followed by OC_HADAMARD_C_8x4.*/
|
||||
#define OC_HADAMARD_8x4 __asm{ \
|
||||
OC_HADAMARD_AB_8x4 \
|
||||
OC_HADAMARD_C_8x4 \
|
||||
}
|
||||
|
||||
/*Performs the first part of the final stage of the Hadamard transform and
|
||||
summing of absolute values.
|
||||
At the end of this part, mm1 will contain the DC coefficient of the
|
||||
transform.
_r6: The byte offset from BUF of the 8-byte slot where mm6 is saved.
_r7: The byte offset from BUF of the 8-byte slot where mm7 is saved.
On exit mm7 holds 0x7FFF in every word and mm3 holds the value reloaded from
the _r7 slot; the _r6 slot is left in memory for
OC_HADAMARD_C_ABS_ACCUM_B_8x4 to reload.
BUF must be #defined as a register where this is expanded.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) __asm{ \
|
||||
/*We use the fact that \
|
||||
(abs(a+b)+abs(a-b))/2=max(abs(a),abs(b)) \
|
||||
to merge the final butterfly with the abs and the first stage of \
|
||||
accumulation. \
|
||||
Thus we can avoid using pabsw, which is not available until SSSE3. \
|
||||
Emulating pabsw takes 3 instructions, so the straightforward MMXEXT \
|
||||
implementation would be (3+3)*8+7=55 instructions (+4 for spilling \
|
||||
registers). \
|
||||
Even with pabsw, it would be (3+1)*8+7=39 instructions (with no spills). \
|
||||
This implementation is only 26 (+4 for spilling registers).*/ \
|
||||
__asm movq [_r7+BUF],mm7 \
|
||||
__asm movq [_r6+BUF],mm6 \
|
||||
/*mm7={0x7FFF}x4 \
|
||||
mm0=max(abs(mm0),abs(mm1))-0x7FFF*/ \
|
||||
__asm pcmpeqb mm7,mm7 \
|
||||
__asm movq mm6,mm0 \
|
||||
__asm psrlw mm7,1 \
|
||||
__asm paddw mm6,mm1 \
|
||||
__asm pmaxsw mm0,mm1 \
|
||||
__asm paddsw mm6,mm7 \
|
||||
__asm psubw mm0,mm6 \
|
||||
/*mm2=max(abs(mm2),abs(mm3))-0x7FFF \
|
||||
mm4=max(abs(mm4),abs(mm5))-0x7FFF*/ \
|
||||
__asm movq mm6,mm2 \
|
||||
__asm movq mm1,mm4 \
|
||||
__asm pmaxsw mm2,mm3 \
|
||||
__asm pmaxsw mm4,mm5 \
|
||||
__asm paddw mm6,mm3 \
|
||||
__asm paddw mm1,mm5 \
|
||||
__asm movq mm3,[_r7+BUF] \
|
||||
}
|
||||
|
||||
/*Performs the second part of the final stage of the Hadamard transform and
|
||||
summing of absolute values.
_r6: The byte offset from BUF of the 8-byte slot written by
OC_HADAMARD_C_ABS_ACCUM_A_8x4; it is reloaded into mm5.
_r7: Not referenced by this part.
This continues from the register state left by part A (mm7 holding 0x7FFF in
every word).
On exit mm0 holds the accumulated sums and mm7 holds 1 in every word.
BUF must be #defined as a register where this is expanded.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) __asm{ \
|
||||
__asm paddsw mm6,mm7 \
|
||||
__asm movq mm5,[_r6+BUF] \
|
||||
__asm paddsw mm1,mm7 \
|
||||
__asm psubw mm2,mm6 \
|
||||
__asm psubw mm4,mm1 \
|
||||
/*mm7={1}x4 (needed for the horizontal add that follows) \
|
||||
mm0+=mm2+mm4+max(abs(mm3),abs(mm5))-0x7FFF*/ \
|
||||
__asm movq mm6,mm3 \
|
||||
__asm pmaxsw mm3,mm5 \
|
||||
__asm paddw mm0,mm2 \
|
||||
__asm paddw mm6,mm5 \
|
||||
__asm paddw mm0,mm4 \
|
||||
__asm paddsw mm6,mm7 \
|
||||
__asm paddw mm0,mm3 \
|
||||
__asm psrlw mm7,14 \
|
||||
__asm psubw mm0,mm6 \
|
||||
}
|
||||
|
||||
/*Performs the last stage of an 8-point 1-D Hadamard transform, takes the
|
||||
absolute value of each component, and accumulates everything into mm0.
|
||||
This is the only portion of SATD which requires MMXEXT (we could use plain
|
||||
MMX, but it takes 4 instructions and an extra register to work around the
|
||||
lack of a pmaxsw, which is a pretty serious penalty).
This expands to OC_HADAMARD_C_ABS_ACCUM_A_8x4 followed by
OC_HADAMARD_C_ABS_ACCUM_B_8x4, both with the same _r6 and _r7 spill offsets.*/
|
||||
#define OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(_r6,_r7) \
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(_r6,_r7) \
|
||||
}
|
||||
|
||||
/*Performs an 8-point 1-D Hadamard transform, takes the absolute value of each
|
||||
component, and accumulates everything into mm0.
|
||||
Note that mm0 will have an extra 4 added to each column, and that after
|
||||
removing this value, the remainder will be half the conventional value.
This expands to OC_HADAMARD_AB_8x4 followed by
OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7); _r6 and _r7 are the byte offsets from
BUF of the two 8-byte spill slots it uses.*/
|
||||
#define OC_HADAMARD_ABS_ACCUM_8x4(_r6,_r7) __asm{ \
|
||||
OC_HADAMARD_AB_8x4 \
|
||||
OC_HADAMARD_C_ABS_ACCUM_8x4(_r6,_r7) \
|
||||
}
|
||||
|
||||
/*Performs two 4x4 transposes (mostly) in place.
|
||||
On input, {mm0,mm1,mm2,mm3} contains rows {e,f,g,h}, and {mm4,mm5,mm6,mm7}
|
||||
contains rows {a,b,c,d}.
|
||||
On output, {0x40,0x50,0x60,0x70}+_off+BUF contains {e,f,g,h}^T, and
|
||||
{mm4,mm5,mm6,mm7} contains the transposed rows {a,b,c,d}^T.
The 8-byte slot at 0x10+_off+BUF is also written: it holds mm5 while that
register is used as scratch during the first transpose.
mm0...mm3 do not hold meaningful results on exit.
BUF must be #defined as a register where this is expanded.*/
|
||||
#define OC_TRANSPOSE_4x4x2(_off) __asm{ \
|
||||
/*First 4x4 transpose:*/ \
|
||||
__asm movq [0x10+_off+BUF],mm5 \
|
||||
/*mm0 = e3 e2 e1 e0 \
|
||||
mm1 = f3 f2 f1 f0 \
|
||||
mm2 = g3 g2 g1 g0 \
|
||||
mm3 = h3 h2 h1 h0*/ \
|
||||
__asm movq mm5,mm2 \
|
||||
__asm punpcklwd mm2,mm3 \
|
||||
__asm punpckhwd mm5,mm3 \
|
||||
__asm movq mm3,mm0 \
|
||||
__asm punpcklwd mm0,mm1 \
|
||||
__asm punpckhwd mm3,mm1 \
|
||||
/*mm0 = f1 e1 f0 e0 \
|
||||
mm3 = f3 e3 f2 e2 \
|
||||
mm2 = h1 g1 h0 g0 \
|
||||
mm5 = h3 g3 h2 g2*/ \
|
||||
__asm movq mm1,mm0 \
|
||||
__asm punpckldq mm0,mm2 \
|
||||
__asm punpckhdq mm1,mm2 \
|
||||
__asm movq mm2,mm3 \
|
||||
__asm punpckhdq mm3,mm5 \
|
||||
__asm movq [0x40+_off+BUF],mm0 \
|
||||
__asm punpckldq mm2,mm5 \
|
||||
/*mm0 = h0 g0 f0 e0 \
|
||||
mm1 = h1 g1 f1 e1 \
|
||||
mm2 = h2 g2 f2 e2 \
|
||||
mm3 = h3 g3 f3 e3*/ \
|
||||
__asm movq mm5,[0x10+_off+BUF] \
|
||||
/*Second 4x4 transpose:*/ \
|
||||
/*mm4 = a3 a2 a1 a0 \
|
||||
mm5 = b3 b2 b1 b0 \
|
||||
mm6 = c3 c2 c1 c0 \
|
||||
mm7 = d3 d2 d1 d0*/ \
|
||||
__asm movq mm0,mm6 \
|
||||
__asm punpcklwd mm6,mm7 \
|
||||
__asm movq [0x50+_off+BUF],mm1 \
|
||||
__asm punpckhwd mm0,mm7 \
|
||||
__asm movq mm7,mm4 \
|
||||
__asm punpcklwd mm4,mm5 \
|
||||
__asm movq [0x60+_off+BUF],mm2 \
|
||||
__asm punpckhwd mm7,mm5 \
|
||||
/*mm4 = b1 a1 b0 a0 \
|
||||
mm7 = b3 a3 b2 a2 \
|
||||
mm6 = d1 c1 d0 c0 \
|
||||
mm0 = d3 c3 d2 c2*/ \
|
||||
__asm movq mm5,mm4 \
|
||||
__asm punpckldq mm4,mm6 \
|
||||
__asm movq [0x70+_off+BUF],mm3 \
|
||||
__asm punpckhdq mm5,mm6 \
|
||||
__asm movq mm6,mm7 \
|
||||
__asm punpckhdq mm7,mm0 \
|
||||
__asm punpckldq mm6,mm0 \
|
||||
/*mm4 = d0 c0 b0 a0 \
|
||||
mm5 = d1 c1 b1 a1 \
|
||||
mm6 = d2 c2 b2 a2 \
|
||||
mm7 = d3 c3 b3 a3*/ \
|
||||
}
|
||||
|
||||
static unsigned oc_int_frag_satd_mmxext(int *_dc,
|
||||
const unsigned char *_src,int _src_ystride,
|
||||
const unsigned char *_ref,int _ref_ystride){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
ogg_int16_t *bufp;
|
||||
unsigned ret;
|
||||
unsigned ret2;
|
||||
int dc;
|
||||
bufp=buf;
|
||||
__asm{
|
||||
#define SRC esi
|
||||
#define REF eax
|
||||
#define SRC_YSTRIDE ecx
|
||||
#define REF_YSTRIDE edx
|
||||
#define BUF edi
|
||||
#define RET edx
|
||||
#define RET2 ecx
|
||||
#define DC eax
|
||||
#define DC_WORD ax
|
||||
mov SRC,_src
|
||||
mov SRC_YSTRIDE,_src_ystride
|
||||
mov REF,_ref
|
||||
mov REF_YSTRIDE,_ref_ystride
|
||||
mov BUF,bufp
|
||||
OC_LOAD_SUB_8x4(0x00)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x00)
|
||||
/*Finish swapping out this 8x4 block to make room for the next one.
|
||||
mm0...mm3 have been swapped out already.*/
|
||||
movq [0x00+BUF],mm4
|
||||
movq [0x10+BUF],mm5
|
||||
movq [0x20+BUF],mm6
|
||||
movq [0x30+BUF],mm7
|
||||
OC_LOAD_SUB_8x4(0x04)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x08)
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place, so
|
||||
we only have to do half the loads.*/
|
||||
movq mm1,[0x10+BUF]
|
||||
movq mm2,[0x20+BUF]
|
||||
movq mm3,[0x30+BUF]
|
||||
movq mm0,[0x00+BUF]
|
||||
/*We split out the stages here so we can save the DC coefficient in the
|
||||
middle.*/
|
||||
OC_HADAMARD_AB_8x4
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||
movd DC,mm1
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||
Now we finally have to promote things to dwords.
|
||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||
latency of pmaddwd by starting the next series of loads now.*/
|
||||
pmaddwd mm0,mm7
|
||||
movq mm1,[0x50+BUF]
|
||||
movq mm5,[0x58+BUF]
|
||||
movq mm4,mm0
|
||||
movq mm2,[0x60+BUF]
|
||||
punpckhdq mm0,mm0
|
||||
movq mm6,[0x68+BUF]
|
||||
paddd mm4,mm0
|
||||
movq mm3,[0x70+BUF]
|
||||
movd RET2,mm4
|
||||
movq mm7,[0x78+BUF]
|
||||
movq mm0,[0x40+BUF]
|
||||
movq mm4,[0x48+BUF]
|
||||
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||
pmaddwd mm0,mm7
|
||||
/*Subtract abs(dc) from 2*ret2.*/
|
||||
movsx DC,DC_WORD
|
||||
cdq
|
||||
lea RET2,[RET+RET2*2]
|
||||
movq mm4,mm0
|
||||
punpckhdq mm0,mm0
|
||||
xor RET,DC
|
||||
paddd mm4,mm0
|
||||
/*The sums produced by OC_HADAMARD_ABS_ACCUM_8x4 each have an extra 4
|
||||
added to them, a factor of two removed, and the DC value included;
|
||||
correct the final sum here.*/
|
||||
sub RET2,RET
|
||||
movd RET,mm4
|
||||
lea RET,[RET2+RET*2-64]
|
||||
mov ret,RET
|
||||
mov dc,DC
|
||||
#undef SRC
|
||||
#undef REF
|
||||
#undef SRC_YSTRIDE
|
||||
#undef REF_YSTRIDE
|
||||
#undef BUF
|
||||
#undef RET
|
||||
#undef RET2
|
||||
#undef DC
|
||||
#undef DC_WORD
|
||||
}
|
||||
*_dc=dc;
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*Public SATD entry point for the case where the source and the reference
   share a single stride.
  Forwards to oc_int_frag_satd_mmxext() with _ystride used for both blocks;
   the DC coefficient is handed back through _dc by that routine.*/
unsigned oc_enc_frag_satd_mmxext(int *_dc,const unsigned char *_src,
 const unsigned char *_ref,int _ystride){
  unsigned satd;
  satd=oc_int_frag_satd_mmxext(_dc,_src,_ystride,_ref,_ystride);
  return satd;
}
|
||||
|
||||
|
||||
/*Our internal implementation of frag_copy2 takes an extra stride parameter so
   we can share code with oc_enc_frag_satd2_mmxext().
  Writes 8 rows of 8 bytes to _dst, each the bytewise average of the matching
   rows of _src1 and _src2.
  pavgb rounds its result up, so for every row (a^b)&1 is subtracted from the
   pavgb result, which turns it into the rounded-down average.
  _dst:         The destination; rows are _dst_ystride bytes apart.
  _dst_ystride: The distance in bytes between rows of _dst.
  _src1:        The first source; rows are _src_ystride bytes apart.
  _src2:        The second source; it uses the same _src_ystride.
  _src_ystride: The distance in bytes between rows of both sources.
  The loads, averages and stores of neighboring rows are interleaved, so the
   order of the instructions below matters.*/
|
||||
static void oc_int_frag_copy2_mmxext(unsigned char *_dst,int _dst_ystride,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _src_ystride){
|
||||
__asm{
|
||||
/*Bind the registers, then load the first 3 rows of each source.*/
|
||||
#define DST_YSTRIDE edi
|
||||
#define SRC_YSTRIDE esi
|
||||
#define DST eax
|
||||
#define SRC1 edx
|
||||
#define SRC2 ecx
|
||||
mov DST_YSTRIDE,_dst_ystride
|
||||
mov SRC_YSTRIDE,_src_ystride
|
||||
mov DST,_dst
|
||||
mov SRC1,_src1
|
||||
mov SRC2,_src2
|
||||
movq mm0,[SRC1]
|
||||
movq mm1,[SRC2]
|
||||
movq mm2,[SRC1+SRC_YSTRIDE]
|
||||
lea SRC1,[SRC1+SRC_YSTRIDE*2]
|
||||
movq mm3,[SRC2+SRC_YSTRIDE]
|
||||
lea SRC2,[SRC2+SRC_YSTRIDE*2]
|
||||
pxor mm7,mm7
|
||||
movq mm4,[SRC1]
|
||||
pcmpeqb mm6,mm6
|
||||
movq mm5,[SRC2]
|
||||
/*mm7={1}x8 (0-(-1) in every byte); it masks the low bit of a^b below.*/
|
||||
psubb mm7,mm6
|
||||
/*Start averaging mm0 and mm1 into mm6; mm0 becomes the rounding correction.*/
|
||||
movq mm6,mm0
|
||||
pxor mm0,mm1
|
||||
pavgb mm6,mm1
|
||||
/*mm1 is free, start averaging mm3 into mm2 using mm1.*/
|
||||
movq mm1,mm2
|
||||
pand mm0,mm7
|
||||
pavgb mm2,mm3
|
||||
pxor mm1,mm3
|
||||
/*mm3 is free.*/
|
||||
psubb mm6,mm0
|
||||
/*mm0 is free, start loading the next row.*/
|
||||
movq mm0,[SRC1+SRC_YSTRIDE]
|
||||
/*Start averaging mm5 and mm4 using mm3.*/
|
||||
movq mm3,mm4
|
||||
/*mm6 [row 0] is done; write it out.*/
|
||||
movq [DST],mm6
|
||||
pand mm1,mm7
|
||||
pavgb mm4,mm5
|
||||
psubb mm2,mm1
|
||||
/*mm1 is free, continue loading the next row.*/
|
||||
movq mm1,[SRC2+SRC_YSTRIDE]
|
||||
pxor mm3,mm5
|
||||
lea SRC1,[SRC1+SRC_YSTRIDE*2]
|
||||
/*mm2 [row 1] is done; write it out.*/
|
||||
movq [DST+DST_YSTRIDE],mm2
|
||||
pand mm3,mm7
|
||||
/*Start loading the next row.*/
|
||||
movq mm2,[SRC1]
|
||||
lea DST,[DST+DST_YSTRIDE*2]
|
||||
psubb mm4,mm3
|
||||
lea SRC2,[SRC2+SRC_YSTRIDE*2]
|
||||
/*mm4 [row 2] is done; write it out.*/
|
||||
movq [DST],mm4
|
||||
/*Continue loading the next row.*/
|
||||
movq mm3,[SRC2]
|
||||
/*Start averaging mm0 and mm1 into mm6.*/
|
||||
movq mm6,mm0
|
||||
pxor mm0,mm1
|
||||
/*Start loading the next row.*/
|
||||
movq mm4,[SRC1+SRC_YSTRIDE]
|
||||
pavgb mm6,mm1
|
||||
/*mm1 is free; start averaging mm3 into mm2 using mm1.*/
|
||||
movq mm1,mm2
|
||||
pand mm0,mm7
|
||||
/*Continue loading the next row.*/
|
||||
movq mm5,[SRC2+SRC_YSTRIDE]
|
||||
pavgb mm2,mm3
|
||||
lea SRC1,[SRC1+SRC_YSTRIDE*2]
|
||||
pxor mm1,mm3
|
||||
/*mm3 is free.*/
|
||||
psubb mm6,mm0
|
||||
/*mm0 is free, start loading the next row.*/
|
||||
movq mm0,[SRC1]
|
||||
/*Start averaging mm5 into mm4 using mm3.*/
|
||||
movq mm3,mm4
|
||||
/*mm6 [row 3] is done; write it out.*/
|
||||
movq [DST+DST_YSTRIDE],mm6
|
||||
pand mm1,mm7
|
||||
lea SRC2,[SRC2+SRC_YSTRIDE*2]
|
||||
pavgb mm4,mm5
|
||||
lea DST,[DST+DST_YSTRIDE*2]
|
||||
psubb mm2,mm1
|
||||
/*mm1 is free; continue loading the next row.*/
|
||||
movq mm1,[SRC2]
|
||||
pxor mm3,mm5
|
||||
/*mm2 [row 4] is done; write it out.*/
|
||||
movq [DST],mm2
|
||||
pand mm3,mm7
|
||||
/*Start loading the next row.*/
|
||||
movq mm2,[SRC1+SRC_YSTRIDE]
|
||||
psubb mm4,mm3
|
||||
/*Start averaging mm0 and mm1 into mm6.*/
|
||||
movq mm6,mm0
|
||||
/*Continue loading the next row.*/
|
||||
movq mm3,[SRC2+SRC_YSTRIDE]
|
||||
/*mm4 [row 5] is done; write it out.*/
|
||||
movq [DST+DST_YSTRIDE],mm4
|
||||
pxor mm0,mm1
|
||||
pavgb mm6,mm1
|
||||
/*mm4 is free; start averaging mm3 into mm2 using mm4.*/
|
||||
movq mm4,mm2
|
||||
pand mm0,mm7
|
||||
pavgb mm2,mm3
|
||||
pxor mm4,mm3
|
||||
lea DST,[DST+DST_YSTRIDE*2]
|
||||
psubb mm6,mm0
|
||||
pand mm4,mm7
|
||||
/*mm6 [row 6] is done, write it out.*/
|
||||
movq [DST],mm6
|
||||
psubb mm2,mm4
|
||||
/*mm2 [row 7] is done, write it out.*/
|
||||
movq [DST+DST_YSTRIDE],mm2
|
||||
#undef SRC1
|
||||
#undef SRC2
|
||||
#undef SRC_YSTRIDE
|
||||
#undef DST_YSTRIDE
|
||||
#undef DST
|
||||
}
|
||||
}
|
||||
|
||||
unsigned oc_enc_frag_satd2_mmxext(int *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride){
|
||||
OC_ALIGN8(unsigned char ref[64]);
|
||||
oc_int_frag_copy2_mmxext(ref,8,_ref1,_ref2,_ystride);
|
||||
return oc_int_frag_satd_mmxext(_dc,_src,_ystride,ref,8);
|
||||
}
|
||||
|
||||
unsigned oc_enc_frag_intra_satd_mmxext(int *_dc,const unsigned char *_src,
|
||||
int _ystride){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
ogg_int16_t *bufp;
|
||||
unsigned ret1;
|
||||
unsigned ret2;
|
||||
int dc;
|
||||
bufp=buf;
|
||||
__asm{
|
||||
#define SRC eax
|
||||
#define SRC4 esi
|
||||
#define BUF edi
|
||||
#define YSTRIDE edx
|
||||
#define YSTRIDE3 ecx
|
||||
#define RET eax
|
||||
#define RET2 ecx
|
||||
#define DC edx
|
||||
#define DC_WORD dx
|
||||
mov SRC,_src
|
||||
mov BUF,bufp
|
||||
mov YSTRIDE,_ystride
|
||||
/* src4 = src+4*ystride */
|
||||
lea SRC4,[SRC+YSTRIDE*4]
|
||||
/* ystride3 = 3*ystride */
|
||||
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
|
||||
OC_LOAD_8x4(0x00)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x00)
|
||||
/*Finish swapping out this 8x4 block to make room for the next one.
|
||||
mm0...mm3 have been swapped out already.*/
|
||||
movq [0x00+BUF],mm4
|
||||
movq [0x10+BUF],mm5
|
||||
movq [0x20+BUF],mm6
|
||||
movq [0x30+BUF],mm7
|
||||
OC_LOAD_8x4(0x04)
|
||||
OC_HADAMARD_8x4
|
||||
OC_TRANSPOSE_4x4x2(0x08)
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place, so
|
||||
we only have to do half the loads.*/
|
||||
movq mm1,[0x10+BUF]
|
||||
movq mm2,[0x20+BUF]
|
||||
movq mm3,[0x30+BUF]
|
||||
movq mm0,[0x00+BUF]
|
||||
/*We split out the stages here so we can save the DC coefficient in the
|
||||
middle.*/
|
||||
OC_HADAMARD_AB_8x4
|
||||
OC_HADAMARD_C_ABS_ACCUM_A_8x4(0x28,0x38)
|
||||
movd DC,mm1
|
||||
OC_HADAMARD_C_ABS_ACCUM_B_8x4(0x28,0x38)
|
||||
/*Up to this point, everything fit in 16 bits (8 input + 1 for the
|
||||
difference + 2*3 for the two 8-point 1-D Hadamards - 1 for the abs - 1
|
||||
for the factor of two we dropped + 3 for the vertical accumulation).
|
||||
Now we finally have to promote things to dwords.
|
||||
We break this part out of OC_HADAMARD_ABS_ACCUM_8x4 to hide the long
|
||||
latency of pmaddwd by starting the next series of loads now.*/
|
||||
pmaddwd mm0,mm7
|
||||
movq mm1,[0x50+BUF]
|
||||
movq mm5,[0x58+BUF]
|
||||
movq mm2,[0x60+BUF]
|
||||
movq mm4,mm0
|
||||
movq mm6,[0x68+BUF]
|
||||
punpckhdq mm0,mm0
|
||||
movq mm3,[0x70+BUF]
|
||||
paddd mm4,mm0
|
||||
movq mm7,[0x78+BUF]
|
||||
movd RET,mm4
|
||||
movq mm0,[0x40+BUF]
|
||||
movq mm4,[0x48+BUF]
|
||||
OC_HADAMARD_ABS_ACCUM_8x4(0x68,0x78)
|
||||
pmaddwd mm0,mm7
|
||||
/*We assume that the DC coefficient is always positive (which is true,
|
||||
because the input to the INTRA transform was not a difference).*/
|
||||
movzx DC,DC_WORD
|
||||
add RET,RET
|
||||
sub RET,DC
|
||||
movq mm4,mm0
|
||||
punpckhdq mm0,mm0
|
||||
paddd mm4,mm0
|
||||
movd RET2,mm4
|
||||
lea RET,[-64+RET+RET2*2]
|
||||
mov [dc],DC
|
||||
mov [ret1],RET
|
||||
#undef SRC
|
||||
#undef SRC4
|
||||
#undef BUF
|
||||
#undef YSTRIDE
|
||||
#undef YSTRIDE3
|
||||
#undef RET
|
||||
#undef RET2
|
||||
#undef DC
|
||||
#undef DC_WORD
|
||||
}
|
||||
*_dc=dc;
|
||||
return ret1;
|
||||
}
|
||||
|
||||
/*Computes the 8x8 residue _src-_ref as 16-bit values.
  Each pass of the loop handles two rows: the bytes of both blocks are
   zero-extended to words against mm7 (which holds zero), subtracted, and 32
   bytes (two rows of eight words) are stored to _residue.
  _residue: The destination; 64 words are written in row order.
  _src:     The source block; rows are _ystride bytes apart.
  _ref:     The reference block; rows are _ystride bytes apart.
  _ystride: The distance in bytes between rows of both blocks.
  The pointer parameters are used as the loop state: every pass reloads them
   into registers and stores the advanced values back.*/
void oc_enc_frag_sub_mmx(ogg_int16_t _residue[64],
|
||||
const unsigned char *_src, const unsigned char *_ref,int _ystride){
|
||||
int i;
|
||||
/*mm7=0; the __asm block in the loop relies on it still being zero.*/
__asm pxor mm7,mm7
|
||||
/*Four passes of two rows each.*/
for(i=4;i-->0;){
|
||||
__asm{
|
||||
#define SRC edx
|
||||
#define YSTRIDE esi
|
||||
#define RESIDUE eax
|
||||
#define REF ecx
|
||||
mov YSTRIDE,_ystride
|
||||
mov RESIDUE,_residue
|
||||
mov SRC,_src
|
||||
mov REF,_ref
|
||||
/*mm0=[src]*/
|
||||
movq mm0,[SRC]
|
||||
/*mm1=[ref]*/
|
||||
movq mm1,[REF]
|
||||
/*mm4=[src+ystride]*/
|
||||
movq mm4,[SRC+YSTRIDE]
|
||||
/*mm5=[ref+ystride]*/
|
||||
movq mm5,[REF+YSTRIDE]
|
||||
/*Compute [src]-[ref] (low four pixels in mm0, high four in mm2).*/
|
||||
movq mm2,mm0
|
||||
punpcklbw mm0,mm7
|
||||
movq mm3,mm1
|
||||
punpckhbw mm2,mm7
|
||||
punpcklbw mm1,mm7
|
||||
punpckhbw mm3,mm7
|
||||
psubw mm0,mm1
|
||||
psubw mm2,mm3
|
||||
/*Compute [src+ystride]-[ref+ystride] (low four in mm4, high four in mm1).*/
|
||||
movq mm1,mm4
|
||||
punpcklbw mm4,mm7
|
||||
movq mm3,mm5
|
||||
punpckhbw mm1,mm7
|
||||
lea SRC,[SRC+YSTRIDE*2]
|
||||
punpcklbw mm5,mm7
|
||||
lea REF,[REF+YSTRIDE*2]
|
||||
punpckhbw mm3,mm7
|
||||
psubw mm4,mm5
|
||||
psubw mm1,mm3
|
||||
/*Write the answer out.*/
|
||||
movq [RESIDUE+0x00],mm0
|
||||
movq [RESIDUE+0x08],mm2
|
||||
movq [RESIDUE+0x10],mm4
|
||||
movq [RESIDUE+0x18],mm1
|
||||
lea RESIDUE,[RESIDUE+0x20]
|
||||
/*Store the advanced pointers back so the next pass continues from here.*/
mov _residue,RESIDUE
|
||||
mov _src,SRC
|
||||
mov _ref,REF
|
||||
#undef SRC
|
||||
#undef YSTRIDE
|
||||
#undef RESIDUE
|
||||
#undef REF
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*Computes the 8x8 residue _src-128 as 16-bit values.
  Every source byte is zero-extended to a word against mm7 (zero) and the
   constant 128 held in mm6 is subtracted from it.
  _residue: The destination; 64 words (0x80 bytes) are written in row order.
  _src:     The source block; rows are _ystride bytes apart.
  _ystride: The distance in bytes between rows of _src.
  The loads for the next pair of rows are issued while the current pair is
   being processed, so the comments below refer to row offsets relative to
   the original value of _src.*/
void oc_enc_frag_sub_128_mmx(ogg_int16_t _residue[64],
|
||||
const unsigned char *_src,int _ystride){
|
||||
__asm{
|
||||
#define YSTRIDE edx
|
||||
#define YSTRIDE3 edi
|
||||
#define RESIDUE ecx
|
||||
#define SRC eax
|
||||
mov YSTRIDE,_ystride
|
||||
mov RESIDUE,_residue
|
||||
mov SRC,_src
|
||||
/*mm0=[src]*/
|
||||
movq mm0,[SRC]
|
||||
/*mm1=[src+ystride]*/
|
||||
movq mm1,[SRC+YSTRIDE]
|
||||
/*mm6={-1}x4*/
|
||||
pcmpeqw mm6,mm6
|
||||
/*mm2=[src+2*ystride]*/
|
||||
movq mm2,[SRC+YSTRIDE*2]
|
||||
/*[ystride3]=3*[ystride]*/
|
||||
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
|
||||
/*mm6={0x8000}x4*/
|
||||
psllw mm6,15
|
||||
/*mm3=[src+3*ystride]*/
|
||||
movq mm3,[SRC+YSTRIDE3]
|
||||
/*mm6={128}x4*/
|
||||
psrlw mm6,8
|
||||
/*mm7=0*/
|
||||
pxor mm7,mm7
|
||||
/*[src]=[src]+4*[ystride]*/
|
||||
lea SRC,[SRC+YSTRIDE*4]
|
||||
/*Compute [src]-128 and [src+ystride]-128*/
|
||||
movq mm4,mm0
|
||||
punpcklbw mm0,mm7
|
||||
movq mm5,mm1
|
||||
punpckhbw mm4,mm7
|
||||
psubw mm0,mm6
|
||||
punpcklbw mm1,mm7
|
||||
psubw mm4,mm6
|
||||
punpckhbw mm5,mm7
|
||||
psubw mm1,mm6
|
||||
psubw mm5,mm6
|
||||
/*Write the answer out.*/
|
||||
movq [RESIDUE+0x00],mm0
|
||||
movq [RESIDUE+0x08],mm4
|
||||
movq [RESIDUE+0x10],mm1
|
||||
movq [RESIDUE+0x18],mm5
|
||||
/*mm0=[src+4*ystride]*/
|
||||
movq mm0,[SRC]
|
||||
/*mm1=[src+5*ystride]*/
|
||||
movq mm1,[SRC+YSTRIDE]
|
||||
/*Compute [src+2*ystride]-128 and [src+3*ystride]-128*/
|
||||
movq mm4,mm2
|
||||
punpcklbw mm2,mm7
|
||||
movq mm5,mm3
|
||||
punpckhbw mm4,mm7
|
||||
psubw mm2,mm6
|
||||
punpcklbw mm3,mm7
|
||||
psubw mm4,mm6
|
||||
punpckhbw mm5,mm7
|
||||
psubw mm3,mm6
|
||||
psubw mm5,mm6
|
||||
/*Write the answer out.*/
|
||||
movq [RESIDUE+0x20],mm2
|
||||
movq [RESIDUE+0x28],mm4
|
||||
movq [RESIDUE+0x30],mm3
|
||||
movq [RESIDUE+0x38],mm5
|
||||
/*Load mm2=[src+6*ystride] and mm3=[src+7*ystride], then compute [src+4*ystride]-128 and [src+5*ystride]-128*/
|
||||
movq mm2,[SRC+YSTRIDE*2]
|
||||
movq mm3,[SRC+YSTRIDE3]
|
||||
movq mm4,mm0
|
||||
punpcklbw mm0,mm7
|
||||
movq mm5,mm1
|
||||
punpckhbw mm4,mm7
|
||||
psubw mm0,mm6
|
||||
punpcklbw mm1,mm7
|
||||
psubw mm4,mm6
|
||||
punpckhbw mm5,mm7
|
||||
psubw mm1,mm6
|
||||
psubw mm5,mm6
|
||||
/*Write the answer out.*/
|
||||
movq [RESIDUE+0x40],mm0
|
||||
movq [RESIDUE+0x48],mm4
|
||||
movq [RESIDUE+0x50],mm1
|
||||
movq [RESIDUE+0x58],mm5
|
||||
/*Compute [src+6*ystride]-128 and [src+7*ystride]-128*/
|
||||
movq mm4,mm2
|
||||
punpcklbw mm2,mm7
|
||||
movq mm5,mm3
|
||||
punpckhbw mm4,mm7
|
||||
psubw mm2,mm6
|
||||
punpcklbw mm3,mm7
|
||||
psubw mm4,mm6
|
||||
punpckhbw mm5,mm7
|
||||
psubw mm3,mm6
|
||||
psubw mm5,mm6
|
||||
/*Write the answer out.*/
|
||||
movq [RESIDUE+0x60],mm2
|
||||
movq [RESIDUE+0x68],mm4
|
||||
movq [RESIDUE+0x70],mm3
|
||||
movq [RESIDUE+0x78],mm5
|
||||
#undef YSTRIDE
|
||||
#undef YSTRIDE3
|
||||
#undef RESIDUE
|
||||
#undef SRC
|
||||
}
|
||||
}
|
||||
|
||||
/*Public two-source copy for the case where the destination and both sources
   share a single stride.
  Forwards to oc_int_frag_copy2_mmxext(), passing _ystride as both the
   destination stride and the source stride.*/
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
 const unsigned char *_src1,const unsigned char *_src2,int _ystride){
  int dst_ystride;
  int src_ystride;
  dst_ystride=_ystride;
  src_ystride=_ystride;
  oc_int_frag_copy2_mmxext(_dst,dst_ystride,_src1,_src2,src_ystride);
}
|
||||
|
||||
#endif
|
||||
686
engine/thirdparty/libtheora/x86_vc/mmxfdct.c
vendored
Normal file
686
engine/thirdparty/libtheora/x86_vc/mmxfdct.c
vendored
Normal file
|
|
@ -0,0 +1,686 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *
|
||||
* by the Xiph.Org Foundation http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************/
|
||||
/*MMX fDCT implementation for x86_32*/
|
||||
/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/
|
||||
#include "x86enc.h"
|
||||
#include "x86zigzag.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Stage 1 of the fDCT for four columns at once (four words per register).
  On input mm0...mm7 hold t0...t7.
  On output mm7...mm4 hold the sums t0'...t3' and mm3...mm0 hold the
   differences t4'...t7', as spelled out line by line below.
  Each sum is formed as 2*b+(a-b), so no scratch register is needed.*/
#define OC_FDCT_STAGE1_8x4 __asm{ \
|
||||
/*Stage 1:*/ \
|
||||
/*mm0=t7'=t0-t7*/ \
|
||||
__asm psubw mm0,mm7 \
|
||||
__asm paddw mm7,mm7 \
|
||||
/*mm1=t6'=t1-t6*/ \
|
||||
__asm psubw mm1, mm6 \
|
||||
__asm paddw mm6,mm6 \
|
||||
/*mm2=t5'=t2-t5*/ \
|
||||
__asm psubw mm2,mm5 \
|
||||
__asm paddw mm5,mm5 \
|
||||
/*mm3=t4'=t3-t4*/ \
|
||||
__asm psubw mm3,mm4 \
|
||||
__asm paddw mm4,mm4 \
|
||||
/*mm7=t0'=t0+t7*/ \
|
||||
__asm paddw mm7,mm0 \
|
||||
/*mm6=t1'=t1+t6*/ \
|
||||
__asm paddw mm6,mm1 \
|
||||
/*mm5=t2'=t2+t5*/ \
|
||||
__asm paddw mm5,mm2 \
|
||||
/*mm4=t3'=t3+t4*/ \
|
||||
__asm paddw mm4,mm3\
|
||||
}
|
||||
|
||||
/*Stages 2 through 4 of the fDCT for four columns at once.
  The expansion site must #define Y to the register holding the output
   pointer and A to a scratch general-purpose register (A is used to build
   the multiplier and rounding constants).
  _r0..._r7 are byte offsets from Y; all eight slots are written, first as
   spill space for intermediate values and then for results.
  On input the registers hold the stage 1 results: mm7=t0', mm6=t1', mm5=t2',
   mm4=t3', mm3=t4', mm2=t5', mm1=t6', mm0=t7'.
  On output mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6] and
   mm1=_y[7], while _y[1] has been stored at [Y+_r1] and _y[3] at [Y+_r3].
  The register-state comments describe the values being set up by the
   instructions that follow them.*/
#define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
|
||||
/*Stage 2:*/ \
|
||||
/*mm7=t3''=t0'-t3'*/ \
|
||||
__asm psubw mm7,mm4 \
|
||||
__asm paddw mm4,mm4 \
|
||||
/*mm6=t2''=t1'-t2'*/ \
|
||||
__asm psubw mm6,mm5 \
|
||||
__asm movq [Y+_r6],mm7 \
|
||||
__asm paddw mm5,mm5 \
|
||||
/*mm1=t5''=t6'-t5'*/ \
|
||||
__asm psubw mm1,mm2 \
|
||||
__asm movq [Y+_r2],mm6 \
|
||||
/*mm4=t0''=t0'+t3'*/ \
|
||||
__asm paddw mm4,mm7 \
|
||||
__asm paddw mm2,mm2 \
|
||||
/*mm5=t1''=t1'+t2'*/ \
|
||||
__asm movq [Y+_r0],mm4 \
|
||||
__asm paddw mm5,mm6 \
|
||||
/*mm2=t6''=t6'+t5'*/ \
|
||||
__asm paddw mm2,mm1 \
|
||||
__asm movq [Y+_r4],mm5 \
|
||||
/*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \
|
||||
/*mm4, mm5, mm6, mm7 are free.*/ \
|
||||
/*Stage 3:*/ \
|
||||
/*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \
|
||||
__asm mov A,0x5A806A0A \
|
||||
__asm pcmpeqb mm6,mm6 \
|
||||
__asm movd mm7,A \
|
||||
__asm psrlw mm6,15 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm paddw mm6,mm6 \
|
||||
/*mm0=0, mm2={-1}x4 \
|
||||
mm5:mm4=t5''*27146+0xB500*/ \
|
||||
__asm movq mm4,mm1 \
|
||||
__asm movq mm5,mm1 \
|
||||
__asm punpcklwd mm4,mm6 \
|
||||
__asm movq [Y+_r3],mm2 \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm movq [Y+_r7],mm0 \
|
||||
__asm punpckhwd mm5,mm6 \
|
||||
__asm pxor mm0,mm0 \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm pcmpeqb mm2,mm2 \
|
||||
/*mm2=t6'', mm1=t5''+(t5''!=0) \
|
||||
mm4=(t5''*27146+0xB500>>16)*/ \
|
||||
__asm pcmpeqw mm0,mm1 \
|
||||
__asm psrad mm4,16 \
|
||||
__asm psubw mm0,mm2 \
|
||||
__asm movq mm2, [Y+_r3] \
|
||||
__asm psrad mm5,16 \
|
||||
__asm paddw mm1,mm0 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
/*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \
|
||||
__asm paddw mm4,mm1 \
|
||||
__asm movq mm0, [Y+_r7] \
|
||||
__asm psraw mm4,1 \
|
||||
__asm movq mm1,mm3 \
|
||||
/*mm3=t4''=t4'+s*/ \
|
||||
__asm paddw mm3,mm4 \
|
||||
/*mm1=t5'''=t4'-s*/ \
|
||||
__asm psubw mm1,mm4 \
|
||||
/*mm1=0, mm3={-1}x4 \
|
||||
mm5:mm4=t6''*27146+0xB500*/ \
|
||||
__asm movq mm4,mm2 \
|
||||
__asm movq mm5,mm2 \
|
||||
__asm punpcklwd mm4,mm6 \
|
||||
__asm movq [Y+_r5],mm1 \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm movq [Y+_r1],mm3 \
|
||||
__asm punpckhwd mm5,mm6 \
|
||||
__asm pxor mm1,mm1 \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm pcmpeqb mm3,mm3 \
|
||||
/*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \
|
||||
__asm psrad mm4,16 \
|
||||
__asm pcmpeqw mm1,mm2 \
|
||||
__asm psrad mm5,16 \
|
||||
__asm psubw mm1,mm3 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm paddw mm2,mm1 \
|
||||
/*mm1=t1'' \
|
||||
mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \
|
||||
__asm paddw mm4,mm2 \
|
||||
__asm movq mm1,[Y+_r4] \
|
||||
__asm psraw mm4,1 \
|
||||
__asm movq mm2,mm0 \
|
||||
/*mm7={54491-0x7FFF,0x7FFF}x2 \
|
||||
mm0=t7''=t7'+s*/ \
|
||||
__asm paddw mm0,mm4 \
|
||||
/*mm2=t6'''=t7'-s*/ \
|
||||
__asm psubw mm2,mm4 \
|
||||
/*Stage 4:*/ \
|
||||
/*mm0=0, mm2=t0'' \
|
||||
mm5:mm4=t1''*27146+0xB500*/ \
|
||||
__asm movq mm4,mm1 \
|
||||
__asm movq mm5,mm1 \
|
||||
__asm punpcklwd mm4,mm6 \
|
||||
__asm movq [Y+_r3],mm2 \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm movq mm2,[Y+_r0] \
|
||||
__asm punpckhwd mm5,mm6 \
|
||||
__asm movq [Y+_r7],mm0 \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm pxor mm0,mm0 \
|
||||
/*mm7={27146,0x4000>>1}x2 \
|
||||
mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \
|
||||
__asm psrad mm4,16 \
|
||||
__asm mov A,0x20006A0A \
|
||||
__asm pcmpeqw mm0,mm1 \
|
||||
__asm movd mm7,A \
|
||||
__asm psrad mm5,16 \
|
||||
__asm psubw mm0,mm3 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm paddw mm0,mm1 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm paddw mm0,mm4 \
|
||||
/*mm6={0x00000E3D}x2 \
|
||||
mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \
|
||||
__asm movq mm4,mm2 \
|
||||
__asm movq mm5,mm2 \
|
||||
__asm punpcklwd mm4,mm6 \
|
||||
__asm mov A,0x0E3D \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm punpckhwd mm5,mm6 \
|
||||
__asm movd mm6,A \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm pxor mm1,mm1 \
|
||||
__asm punpckldq mm6,mm6 \
|
||||
__asm pcmpeqw mm1,mm2 \
|
||||
/*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \
|
||||
__asm psrad mm4,16 \
|
||||
__asm psubw mm1,mm3 \
|
||||
__asm psrad mm5,16 \
|
||||
__asm paddw mm2,mm1 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm movq mm1,[Y+_r5] \
|
||||
__asm paddw mm4,mm2 \
|
||||
/*mm2=t6'', mm0=_y[0]=u=r+s>>1 \
|
||||
The naive implementation could cause overflow, so we use \
|
||||
u=(r&s)+((r^s)>>1).*/ \
|
||||
__asm movq mm2,[Y+_r3] \
|
||||
__asm movq mm7,mm0 \
|
||||
__asm pxor mm0,mm4 \
|
||||
__asm pand mm7,mm4 \
|
||||
__asm psraw mm0,1 \
|
||||
__asm mov A,0x7FFF54DC \
|
||||
__asm paddw mm0,mm7 \
|
||||
__asm movd mm7,A \
|
||||
/*mm7={54491-0x7FFF,0x7FFF}x2 \
|
||||
mm4=_y[4]=v=r-u*/ \
|
||||
__asm psubw mm4,mm0 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm movq [Y+_r4],mm4 \
|
||||
/*mm0=0, mm7={36410}x4 \
|
||||
mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \
|
||||
__asm movq mm4,mm1 \
|
||||
__asm movq mm5,mm1 \
|
||||
__asm punpcklwd mm4,mm1 \
|
||||
__asm mov A,0x8E3A8E3A \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm movq [Y+_r0],mm0 \
|
||||
__asm punpckhwd mm5,mm1 \
|
||||
__asm pxor mm0,mm0 \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm pcmpeqw mm1,mm0 \
|
||||
__asm movd mm7,A \
|
||||
__asm psubw mm1,mm3 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm paddd mm4,mm6 \
|
||||
__asm paddd mm5,mm6 \
|
||||
/*mm0=0 \
|
||||
mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \
|
||||
__asm movq mm6,mm2 \
|
||||
__asm movq mm3,mm2 \
|
||||
__asm pmulhw mm6,mm7 \
|
||||
__asm paddw mm1,mm2 \
|
||||
__asm pmullw mm3,mm7 \
|
||||
__asm pxor mm0,mm0 \
|
||||
__asm paddw mm6,mm1 \
|
||||
__asm movq mm1,mm3 \
|
||||
__asm punpckhwd mm3,mm6 \
|
||||
__asm punpcklwd mm1,mm6 \
|
||||
/*mm3={-1}x4, mm6={1}x4 \
|
||||
mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \
|
||||
__asm paddd mm5,mm3 \
|
||||
__asm paddd mm4,mm1 \
|
||||
__asm psrad mm5,16 \
|
||||
__asm pxor mm6,mm6 \
|
||||
__asm psrad mm4,16 \
|
||||
__asm pcmpeqb mm3,mm3 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm psubw mm6,mm3 \
|
||||
/*mm1=t7'', mm7={26568,0x3400}x2 \
|
||||
mm2=s=t6'''-(36410*u>>16)*/ \
|
||||
__asm movq mm1,mm4 \
|
||||
__asm mov A,0x340067C8 \
|
||||
__asm pmulhw mm4,mm7 \
|
||||
__asm movd mm7,A \
|
||||
__asm movq [Y+_r5],mm1 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm paddw mm4,mm1 \
|
||||
__asm movq mm1,[Y+_r7] \
|
||||
__asm psubw mm2,mm4 \
|
||||
/*mm6={0x00007B1B}x2 \
|
||||
mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \
|
||||
__asm movq mm4,mm2 \
|
||||
__asm movq mm5,mm2 \
|
||||
__asm punpcklwd mm4,mm6 \
|
||||
__asm pcmpeqw mm0,mm2 \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm mov A,0x7B1B \
|
||||
__asm punpckhwd mm5,mm6 \
|
||||
__asm movd mm6,A \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm psubw mm0,mm3 \
|
||||
__asm punpckldq mm6,mm6 \
|
||||
/*mm7={64277-0x7FFF,0x7FFF}x2 \
|
||||
mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \
|
||||
__asm psrad mm4,17 \
|
||||
__asm paddw mm2,mm0 \
|
||||
__asm psrad mm5,17 \
|
||||
__asm mov A,0x7FFF7B16 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm movd mm7,A \
|
||||
__asm paddw mm2,mm4 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
/*mm0=0, mm7={12785}x4 \
|
||||
mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \
|
||||
__asm movq mm4,mm1 \
|
||||
__asm movq mm5,mm1 \
|
||||
__asm movq [Y+_r3],mm2 \
|
||||
__asm punpcklwd mm4,mm1 \
|
||||
__asm movq mm2,[Y+_r1] \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm mov A,0x31F131F1 \
|
||||
__asm punpckhwd mm5,mm1 \
|
||||
__asm pxor mm0,mm0 \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm pcmpeqw mm1,mm0 \
|
||||
__asm movd mm7,A \
|
||||
__asm psubw mm1,mm3 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm paddd mm4,mm6 \
|
||||
__asm paddd mm5,mm6 \
|
||||
/*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \
|
||||
__asm movq mm6,mm2 \
|
||||
__asm movq mm3,mm2 \
|
||||
__asm pmulhw mm6,mm7 \
|
||||
__asm pmullw mm3,mm7 \
|
||||
__asm paddw mm6,mm1 \
|
||||
__asm movq mm1,mm3 \
|
||||
__asm punpckhwd mm3,mm6 \
|
||||
__asm punpcklwd mm1,mm6 \
|
||||
/*mm3={-1}x4, mm6={1}x4 \
|
||||
mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \
|
||||
__asm paddd mm5,mm3 \
|
||||
__asm paddd mm4,mm1 \
|
||||
__asm psrad mm5,16 \
|
||||
__asm pxor mm6,mm6 \
|
||||
__asm psrad mm4,16 \
|
||||
__asm pcmpeqb mm3,mm3 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm psubw mm6,mm3 \
|
||||
/*mm1=t3'', mm7={20539,0x3000}x2 \
|
||||
mm4=s=(12785*u>>16)-t4''*/ \
|
||||
__asm movq [Y+_r1],mm4 \
|
||||
__asm pmulhw mm4,mm7 \
|
||||
__asm mov A,0x3000503B \
|
||||
__asm movq mm1,[Y+_r6] \
|
||||
__asm movd mm7,A \
|
||||
__asm psubw mm4,mm2 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
/*mm6={0x00006CB7}x2 \
|
||||
mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \
|
||||
__asm movq mm5,mm4 \
|
||||
__asm movq mm2,mm4 \
|
||||
__asm punpcklwd mm4,mm6 \
|
||||
__asm pcmpeqw mm0,mm2 \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm mov A,0x6CB7 \
|
||||
__asm punpckhwd mm5,mm6 \
|
||||
__asm movd mm6,A \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm psubw mm0,mm3 \
|
||||
__asm punpckldq mm6,mm6 \
|
||||
/*mm7={60547-0x7FFF,0x7FFF}x2 \
|
||||
mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \
|
||||
__asm psrad mm4,20 \
|
||||
__asm paddw mm2,mm0 \
|
||||
__asm psrad mm5,20 \
|
||||
__asm mov A,0x7FFF6C84 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
__asm movd mm7,A \
|
||||
__asm paddw mm2,mm4 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
/*mm0=0, mm7={25080}x4 \
|
||||
mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \
|
||||
__asm movq mm4,mm1 \
|
||||
__asm movq mm5,mm1 \
|
||||
__asm movq [Y+_r7],mm2 \
|
||||
__asm punpcklwd mm4,mm1 \
|
||||
__asm movq mm2,[Y+_r2] \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm mov A,0x61F861F8 \
|
||||
__asm punpckhwd mm5,mm1 \
|
||||
__asm pxor mm0,mm0 \
|
||||
__asm pmaddwd mm5,mm7 \
|
||||
__asm movd mm7,A \
|
||||
__asm pcmpeqw mm1,mm0 \
|
||||
__asm psubw mm1,mm3 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm paddd mm4,mm6 \
|
||||
__asm paddd mm5,mm6 \
|
||||
/*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \
|
||||
__asm movq mm6,mm2 \
|
||||
__asm movq mm3,mm2 \
|
||||
__asm pmulhw mm6,mm7 \
|
||||
__asm pmullw mm3,mm7 \
|
||||
__asm paddw mm6,mm1 \
|
||||
__asm movq mm1,mm3 \
|
||||
__asm punpckhwd mm3,mm6 \
|
||||
__asm punpcklwd mm1,mm6 \
|
||||
/*mm1={-1}x4 \
|
||||
mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \
|
||||
__asm paddd mm5,mm3 \
|
||||
__asm paddd mm4,mm1 \
|
||||
__asm psrad mm5,16 \
|
||||
__asm mov A,0x28005460 \
|
||||
__asm psrad mm4,16 \
|
||||
__asm pcmpeqb mm1,mm1 \
|
||||
__asm packssdw mm4,mm5 \
|
||||
/*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \
|
||||
mm4=s=(25080*u>>16)-t2''*/ \
|
||||
__asm movq mm6,mm4 \
|
||||
__asm pmulhw mm4,mm7 \
|
||||
__asm pxor mm5,mm5 \
|
||||
__asm movd mm7,A \
|
||||
__asm psubw mm5,mm1 \
|
||||
__asm punpckldq mm7,mm7 \
|
||||
__asm psubw mm4,mm2 \
|
||||
/*mm2=s+(s!=0) \
|
||||
mm4:mm3=s*21600+0x2800*/ \
|
||||
__asm movq mm3,mm4 \
|
||||
__asm movq mm2,mm4 \
|
||||
__asm punpckhwd mm4,mm5 \
|
||||
__asm pcmpeqw mm0,mm2 \
|
||||
__asm pmaddwd mm4,mm7 \
|
||||
__asm psubw mm0,mm1 \
|
||||
__asm punpcklwd mm3,mm5 \
|
||||
__asm paddw mm2,mm0 \
|
||||
__asm pmaddwd mm3,mm7 \
|
||||
/*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \
|
||||
mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \
|
||||
__asm movq mm0,[Y+_r4] \
|
||||
__asm psrad mm4,18 \
|
||||
__asm movq mm5,[Y+_r5] \
|
||||
__asm psrad mm3,18 \
|
||||
__asm movq mm1,[Y+_r7] \
|
||||
__asm packssdw mm3,mm4 \
|
||||
__asm movq mm4,[Y+_r0] \
|
||||
__asm paddw mm3,mm2 \
|
||||
}
|
||||
|
||||
/*Transposes an 8x4 block of words as two independent 4x4 transposes.
  The expansion site must #define Y to the register holding the output
   pointer; of the offsets passed in, only _r1, _r3 and _r4 are used.
  On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7],
   while _y[1] and _y[3] are read from [Y+_r1] and [Y+_r3].
  On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] (its
   first row is stored at [Y+_r4] rather than kept in a register) and
   {mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/
|
||||
#define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) __asm{ \
|
||||
/*First 4x4 transpose:*/ \
|
||||
/*mm0 = e3 e2 e1 e0 \
|
||||
mm5 = f3 f2 f1 f0 \
|
||||
mm3 = g3 g2 g1 g0 \
|
||||
mm1 = h3 h2 h1 h0*/ \
|
||||
__asm movq mm2,mm0 \
|
||||
__asm punpcklwd mm0,mm5 \
|
||||
__asm punpckhwd mm2,mm5 \
|
||||
__asm movq mm5,mm3 \
|
||||
__asm punpcklwd mm3,mm1 \
|
||||
__asm punpckhwd mm5,mm1 \
|
||||
/*mm0 = f1 e1 f0 e0 \
|
||||
mm2 = f3 e3 f2 e2 \
|
||||
mm3 = h1 g1 h0 g0 \
|
||||
mm5 = h3 g3 h2 g2*/ \
|
||||
__asm movq mm1,mm0 \
|
||||
__asm punpckldq mm0,mm3 \
|
||||
__asm movq [Y+_r4],mm0 \
|
||||
__asm punpckhdq mm1,mm3 \
|
||||
__asm movq mm0,[Y+_r1] \
|
||||
__asm movq mm3,mm2 \
|
||||
__asm punpckldq mm2,mm5 \
|
||||
__asm punpckhdq mm3,mm5 \
|
||||
__asm movq mm5,[Y+_r3] \
|
||||
/*_y[4] = h0 g0 f0 e0 \
|
||||
mm1 = h1 g1 f1 e1 \
|
||||
mm2 = h2 g2 f2 e2 \
|
||||
mm3 = h3 g3 f3 e3*/ \
|
||||
/*Second 4x4 transpose:*/ \
|
||||
/*mm4 = a3 a2 a1 a0 \
|
||||
mm0 = b3 b2 b1 b0 \
|
||||
mm6 = c3 c2 c1 c0 \
|
||||
mm5 = d3 d2 d1 d0*/ \
|
||||
__asm movq mm7,mm4 \
|
||||
__asm punpcklwd mm4,mm0 \
|
||||
__asm punpckhwd mm7,mm0 \
|
||||
__asm movq mm0,mm6 \
|
||||
__asm punpcklwd mm6,mm5 \
|
||||
__asm punpckhwd mm0,mm5 \
|
||||
/*mm4 = b1 a1 b0 a0 \
|
||||
mm7 = b3 a3 b2 a2 \
|
||||
mm6 = d1 c1 d0 c0 \
|
||||
mm0 = d3 c3 d2 c2*/ \
|
||||
__asm movq mm5,mm4 \
|
||||
__asm punpckldq mm4,mm6 \
|
||||
__asm punpckhdq mm5,mm6 \
|
||||
__asm movq mm6,mm7 \
|
||||
__asm punpckhdq mm7,mm0 \
|
||||
__asm punpckldq mm6,mm0 \
|
||||
/*mm4 = d0 c0 b0 a0 \
|
||||
mm5 = d1 c1 b1 a1 \
|
||||
mm6 = d2 c2 b2 a2 \
|
||||
mm7 = d3 c3 b3 a3*/ \
|
||||
}
|
||||
|
||||
/*MMX implementation of the fDCT.*/
|
||||
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
|
||||
OC_ALIGN8(ogg_int16_t buf[64]);
|
||||
ogg_int16_t *bufp;
|
||||
bufp=buf;
|
||||
__asm{
|
||||
#define X edx
|
||||
#define Y eax
|
||||
#define A ecx
|
||||
#define BUF esi
|
||||
/*Add two extra bits of working precision to improve accuracy; any more and
|
||||
we could overflow.*/
|
||||
/*We also add biases to correct for some systematic error that remains in
|
||||
the full fDCT->iDCT round trip.*/
|
||||
mov X, _x
|
||||
mov Y, _y
|
||||
mov BUF, bufp
|
||||
movq mm0,[0x00+X]
|
||||
movq mm1,[0x10+X]
|
||||
movq mm2,[0x20+X]
|
||||
movq mm3,[0x30+X]
|
||||
pcmpeqb mm4,mm4
|
||||
pxor mm7,mm7
|
||||
movq mm5,mm0
|
||||
psllw mm0,2
|
||||
pcmpeqw mm5,mm7
|
||||
movq mm7,[0x70+X]
|
||||
psllw mm1,2
|
||||
psubw mm5,mm4
|
||||
psllw mm2,2
|
||||
mov A,1
|
||||
pslld mm5,16
|
||||
movd mm6,A
|
||||
psllq mm5,16
|
||||
mov A,0x10001
|
||||
psllw mm3,2
|
||||
movd mm4,A
|
||||
punpckhwd mm5,mm6
|
||||
psubw mm1,mm6
|
||||
movq mm6,[0x60+X]
|
||||
paddw mm0,mm5
|
||||
movq mm5,[0x50+X]
|
||||
paddw mm0,mm4
|
||||
movq mm4,[0x40+X]
|
||||
/*We inline stage1 of the transform here so we can get better instruction
|
||||
scheduling with the shifts.*/
|
||||
/*mm0=t7'=t0-t7*/
|
||||
psllw mm7,2
|
||||
psubw mm0,mm7
|
||||
psllw mm6,2
|
||||
paddw mm7,mm7
|
||||
/*mm1=t6'=t1-t6*/
|
||||
psllw mm5,2
|
||||
psubw mm1,mm6
|
||||
psllw mm4,2
|
||||
paddw mm6,mm6
|
||||
/*mm2=t5'=t2-t5*/
|
||||
psubw mm2,mm5
|
||||
paddw mm5,mm5
|
||||
/*mm3=t4'=t3-t4*/
|
||||
psubw mm3,mm4
|
||||
paddw mm4,mm4
|
||||
/*mm7=t0'=t0+t7*/
|
||||
paddw mm7,mm0
|
||||
/*mm6=t1'=t1+t6*/
|
||||
paddw mm6,mm1
|
||||
/*mm5=t2'=t2+t5*/
|
||||
paddw mm5,mm2
|
||||
/*mm4=t3'=t3+t4*/
|
||||
paddw mm4,mm3
|
||||
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
|
||||
OC_TRANSPOSE8x4(0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70)
|
||||
/*Swap out this 8x4 block for the next one.*/
|
||||
movq mm0,[0x08+X]
|
||||
movq [0x30+Y],mm7
|
||||
movq mm7,[0x78+X]
|
||||
movq [0x50+Y],mm1
|
||||
movq mm1,[0x18+X]
|
||||
movq [0x20+Y],mm6
|
||||
movq mm6,[0x68+X]
|
||||
movq [0x60+Y],mm2
|
||||
movq mm2,[0x28+X]
|
||||
movq [0x10+Y],mm5
|
||||
movq mm5,[0x58+X]
|
||||
movq [0x70+Y],mm3
|
||||
movq mm3,[0x38+X]
|
||||
/*And increase its working precision, too.*/
|
||||
psllw mm0,2
|
||||
movq [0x00+Y],mm4
|
||||
psllw mm7,2
|
||||
movq mm4,[0x48+X]
|
||||
/*We inline stage1 of the transform here so we can get better instruction
|
||||
scheduling with the shifts.*/
|
||||
/*mm0=t7'=t0-t7*/
|
||||
psubw mm0,mm7
|
||||
psllw mm1,2
|
||||
paddw mm7,mm7
|
||||
psllw mm6,2
|
||||
/*mm1=t6'=t1-t6*/
|
||||
psubw mm1,mm6
|
||||
psllw mm2,2
|
||||
paddw mm6,mm6
|
||||
psllw mm5,2
|
||||
/*mm2=t5'=t2-t5*/
|
||||
psubw mm2,mm5
|
||||
psllw mm3,2
|
||||
paddw mm5,mm5
|
||||
psllw mm4,2
|
||||
/*mm3=t4'=t3-t4*/
|
||||
psubw mm3,mm4
|
||||
paddw mm4,mm4
|
||||
/*mm7=t0'=t0+t7*/
|
||||
paddw mm7,mm0
|
||||
/*mm6=t1'=t1+t6*/
|
||||
paddw mm6,mm1
|
||||
/*mm5=t2'=t2+t5*/
|
||||
paddw mm5,mm2
|
||||
/*mm4=t3'=t3+t4*/
|
||||
paddw mm4,mm3
|
||||
OC_FDCT8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
|
||||
OC_TRANSPOSE8x4(0x08,0x18,0x28,0x38,0x48,0x58,0x68,0x78)
|
||||
/*Here the first 4x4 block of output from the last transpose is the second
|
||||
4x4 block of input for the next transform.
|
||||
We have cleverly arranged that it already be in the appropriate place,
|
||||
so we only have to do half the stores and loads.*/
|
||||
movq mm0,[0x00+Y]
|
||||
movq [0x58+Y],mm1
|
||||
movq mm1,[0x10+Y]
|
||||
movq [0x68+Y],mm2
|
||||
movq mm2,[0x20+Y]
|
||||
movq [0x78+Y],mm3
|
||||
movq mm3,[0x30+Y]
|
||||
OC_FDCT_STAGE1_8x4
|
||||
OC_FDCT8x4(0x00,0x10,0x20,0x30,0x08,0x18,0x28,0x38)
|
||||
/*mm0={-2}x4*/
|
||||
pcmpeqw mm2,mm2
|
||||
paddw mm2,mm2
|
||||
/*Round and store the results (no transpose).*/
|
||||
movq mm7,[Y+0x10]
|
||||
psubw mm4,mm2
|
||||
psubw mm6,mm2
|
||||
psraw mm4,2
|
||||
psubw mm0,mm2
|
||||
movq [BUF+0x00],mm4
|
||||
movq mm4,[Y+0x30]
|
||||
psraw mm6,2
|
||||
psubw mm5,mm2
|
||||
movq [BUF+0x20],mm6
|
||||
psraw mm0,2
|
||||
psubw mm3,mm2
|
||||
movq [BUF+0x40],mm0
|
||||
psraw mm5,2
|
||||
psubw mm1,mm2
|
||||
movq [BUF+0x50],mm5
|
||||
psraw mm3,2
|
||||
psubw mm7,mm2
|
||||
movq [BUF+0x60],mm3
|
||||
psraw mm1,2
|
||||
psubw mm4,mm2
|
||||
movq [BUF+0x70],mm1
|
||||
psraw mm7,2
|
||||
movq [BUF+0x10],mm7
|
||||
psraw mm4,2
|
||||
movq [BUF+0x30],mm4
|
||||
/*Load the next block.*/
|
||||
movq mm0,[0x40+Y]
|
||||
movq mm7,[0x78+Y]
|
||||
movq mm1,[0x50+Y]
|
||||
movq mm6,[0x68+Y]
|
||||
movq mm2,[0x60+Y]
|
||||
movq mm5,[0x58+Y]
|
||||
movq mm3,[0x70+Y]
|
||||
movq mm4,[0x48+Y]
|
||||
OC_FDCT_STAGE1_8x4
|
||||
OC_FDCT8x4(0x40,0x50,0x60,0x70,0x48,0x58,0x68,0x78)
|
||||
/*mm0={-2}x4*/
|
||||
pcmpeqw mm2,mm2
|
||||
paddw mm2,mm2
|
||||
/*Round and store the results (no transpose).*/
|
||||
movq mm7,[Y+0x50]
|
||||
psubw mm4,mm2
|
||||
psubw mm6,mm2
|
||||
psraw mm4,2
|
||||
psubw mm0,mm2
|
||||
movq [BUF+0x08],mm4
|
||||
movq mm4,[Y+0x70]
|
||||
psraw mm6,2
|
||||
psubw mm5,mm2
|
||||
movq [BUF+0x28],mm6
|
||||
psraw mm0,2
|
||||
psubw mm3,mm2
|
||||
movq [BUF+0x48],mm0
|
||||
psraw mm5,2
|
||||
psubw mm1,mm2
|
||||
movq [BUF+0x58],mm5
|
||||
psraw mm3,2
|
||||
psubw mm7,mm2
|
||||
movq [BUF+0x68],mm3
|
||||
psraw mm1,2
|
||||
psubw mm4,mm2
|
||||
movq [BUF+0x78],mm1
|
||||
psraw mm7,2
|
||||
movq [BUF+0x18],mm7
|
||||
psraw mm4,2
|
||||
movq [BUF+0x38],mm4
|
||||
#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \
|
||||
__asm movq _reg,[BUF+16*(_row)] \
|
||||
|
||||
#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \
|
||||
__asm movq _reg,[BUF+16*(_row)+8] \
|
||||
|
||||
OC_TRANSPOSE_ZIG_ZAG_MMXEXT
|
||||
#undef OC_ZZ_LOAD_ROW_LO
|
||||
#undef OC_ZZ_LOAD_ROW_HI
|
||||
#undef X
|
||||
#undef Y
|
||||
#undef A
|
||||
#undef BUF
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
416
engine/thirdparty/libtheora/x86_vc/mmxfrag.c
vendored
Normal file
416
engine/thirdparty/libtheora/x86_vc/mmxfrag.c
vendored
Normal file
|
|
@ -0,0 +1,416 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*MMX acceleration of fragment reconstruction for motion compensation.
|
||||
Originally written by Rudolf Marek.
|
||||
Additional optimization by Nils Pipenbrinck.
|
||||
Note: Loops are unrolled for best performance.
|
||||
The iteration each instruction belongs to is marked in the comments as #i.*/
|
||||
#include <stddef.h>
|
||||
#include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  The expansion site must #define SRC, DST, YSTRIDE and YSTRIDE3 to the
   registers to use; all four are overwritten.
  _src and _dst are evaluated once into locals so they may be arbitrary
   expressions; _ystride is used directly as an inline assembly operand.
  The copy is done as two groups of four rows, 8 bytes per row.*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)
|
||||
|
||||
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     The destination block.
  _src:     The source block.
  _ystride: The distance in bytes between rows, used for both blocks.
  The register names below are the ones OC_FRAG_COPY_MMX expects to find
   defined at its expansion site.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
|
||||
|
||||
/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.
  Each listed fragment is copied as one 8x8 block at the same offset in both
   frames.
  Nothing is copied when _nfragis is not positive.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    /*Map the fragment index to its byte offset within the frames.*/
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}
|
||||
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue){
|
||||
__asm{
|
||||
#define DST edx
|
||||
#define DST4 esi
|
||||
#define YSTRIDE eax
|
||||
#define YSTRIDE3 edi
|
||||
#define RESIDUE ecx
|
||||
mov DST,_dst
|
||||
mov YSTRIDE,_ystride
|
||||
mov RESIDUE,_residue
|
||||
lea DST4,[DST+YSTRIDE*4]
|
||||
lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
|
||||
/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
|
||||
pcmpeqw mm0,mm0
|
||||
/*#0 Load low residue.*/
|
||||
movq mm1,[0*8+RESIDUE]
|
||||
/*#0 Load high residue.*/
|
||||
movq mm2,[1*8+RESIDUE]
|
||||
/*Set mm0 to 0x8000800080008000.*/
|
||||
psllw mm0,15
|
||||
/*#1 Load low residue.*/
|
||||
movq mm3,[2*8+RESIDUE]
|
||||
/*#1 Load high residue.*/
|
||||
movq mm4,[3*8+RESIDUE]
|
||||
/*Set mm0 to 0x0080008000800080.*/
|
||||
psrlw mm0,8
|
||||
/*#2 Load low residue.*/
|
||||
movq mm5,[4*8+RESIDUE]
|
||||
/*#2 Load high residue.*/
|
||||
movq mm6,[5*8+RESIDUE]
|
||||
/*#0 Bias low residue.*/
|
||||
paddsw mm1,mm0
|
||||
/*#0 Bias high residue.*/
|
||||
paddsw mm2,mm0
|
||||
/*#0 Pack to byte.*/
|
||||
packuswb mm1,mm2
|
||||
/*#1 Bias low residue.*/
|
||||
paddsw mm3,mm0
|
||||
/*#1 Bias high residue.*/
|
||||
paddsw mm4,mm0
|
||||
/*#1 Pack to byte.*/
|
||||
packuswb mm3,mm4
|
||||
/*#2 Bias low residue.*/
|
||||
paddsw mm5,mm0
|
||||
/*#2 Bias high residue.*/
|
||||
paddsw mm6,mm0
|
||||
/*#2 Pack to byte.*/
|
||||
packuswb mm5,mm6
|
||||
/*#0 Write row.*/
|
||||
movq [DST],mm1
|
||||
/*#1 Write row.*/
|
||||
movq [DST+YSTRIDE],mm3
|
||||
/*#2 Write row.*/
|
||||
movq [DST+YSTRIDE*2],mm5
|
||||
/*#3 Load low residue.*/
|
||||
movq mm1,[6*8+RESIDUE]
|
||||
/*#3 Load high residue.*/
|
||||
movq mm2,[7*8+RESIDUE]
|
||||
/*#4 Load high residue.*/
|
||||
movq mm3,[8*8+RESIDUE]
|
||||
/*#4 Load high residue.*/
|
||||
movq mm4,[9*8+RESIDUE]
|
||||
/*#5 Load high residue.*/
|
||||
movq mm5,[10*8+RESIDUE]
|
||||
/*#5 Load high residue.*/
|
||||
movq mm6,[11*8+RESIDUE]
|
||||
/*#3 Bias low residue.*/
|
||||
paddsw mm1,mm0
|
||||
/*#3 Bias high residue.*/
|
||||
paddsw mm2,mm0
|
||||
/*#3 Pack to byte.*/
|
||||
packuswb mm1,mm2
|
||||
/*#4 Bias low residue.*/
|
||||
paddsw mm3,mm0
|
||||
/*#4 Bias high residue.*/
|
||||
paddsw mm4,mm0
|
||||
/*#4 Pack to byte.*/
|
||||
packuswb mm3,mm4
|
||||
/*#5 Bias low residue.*/
|
||||
paddsw mm5,mm0
|
||||
/*#5 Bias high residue.*/
|
||||
paddsw mm6,mm0
|
||||
/*#5 Pack to byte.*/
|
||||
packuswb mm5,mm6
|
||||
/*#3 Write row.*/
|
||||
movq [DST+YSTRIDE3],mm1
|
||||
/*#4 Write row.*/
|
||||
movq [DST4],mm3
|
||||
/*#5 Write row.*/
|
||||
movq [DST4+YSTRIDE],mm5
|
||||
/*#6 Load low residue.*/
|
||||
movq mm1,[12*8+RESIDUE]
|
||||
/*#6 Load high residue.*/
|
||||
movq mm2,[13*8+RESIDUE]
|
||||
/*#7 Load low residue.*/
|
||||
movq mm3,[14*8+RESIDUE]
|
||||
/*#7 Load high residue.*/
|
||||
movq mm4,[15*8+RESIDUE]
|
||||
/*#6 Bias low residue.*/
|
||||
paddsw mm1,mm0
|
||||
/*#6 Bias high residue.*/
|
||||
paddsw mm2,mm0
|
||||
/*#6 Pack to byte.*/
|
||||
packuswb mm1,mm2
|
||||
/*#7 Bias low residue.*/
|
||||
paddsw mm3,mm0
|
||||
/*#7 Bias high residue.*/
|
||||
paddsw mm4,mm0
|
||||
/*#7 Pack to byte.*/
|
||||
packuswb mm3,mm4
|
||||
/*#6 Write row.*/
|
||||
movq [DST4+YSTRIDE*2],mm1
|
||||
/*#7 Write row.*/
|
||||
movq [DST4+YSTRIDE3],mm3
|
||||
#undef DST
|
||||
#undef DST4
|
||||
#undef YSTRIDE
|
||||
#undef YSTRIDE3
|
||||
#undef RESIDUE
|
||||
}
|
||||
}
|
||||
|
||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
|
||||
int _ystride,const ogg_int16_t *_residue){
|
||||
int i;
|
||||
/*Zero mm0.*/
|
||||
__asm pxor mm0,mm0;
|
||||
for(i=4;i-->0;){
|
||||
__asm{
|
||||
#define DST edx
|
||||
#define SRC ecx
|
||||
#define YSTRIDE edi
|
||||
#define RESIDUE eax
|
||||
mov DST,_dst
|
||||
mov SRC,_src
|
||||
mov YSTRIDE,_ystride
|
||||
mov RESIDUE,_residue
|
||||
/*#0 Load source.*/
|
||||
movq mm3,[SRC]
|
||||
/*#1 Load source.*/
|
||||
movq mm7,[SRC+YSTRIDE]
|
||||
/*#0 Get copy of src.*/
|
||||
movq mm4,mm3
|
||||
/*#0 Expand high source.*/
|
||||
punpckhbw mm4,mm0
|
||||
/*#0 Expand low source.*/
|
||||
punpcklbw mm3,mm0
|
||||
/*#0 Add residue high.*/
|
||||
paddsw mm4,[8+RESIDUE]
|
||||
/*#1 Get copy of src.*/
|
||||
movq mm2,mm7
|
||||
/*#0 Add residue low.*/
|
||||
paddsw mm3,[RESIDUE]
|
||||
/*#1 Expand high source.*/
|
||||
punpckhbw mm2,mm0
|
||||
/*#0 Pack final row pixels.*/
|
||||
packuswb mm3,mm4
|
||||
/*#1 Expand low source.*/
|
||||
punpcklbw mm7,mm0
|
||||
/*#1 Add residue low.*/
|
||||
paddsw mm7,[16+RESIDUE]
|
||||
/*#1 Add residue high.*/
|
||||
paddsw mm2,[24+RESIDUE]
|
||||
/*Advance residue.*/
|
||||
lea RESIDUE,[32+RESIDUE]
|
||||
/*#1 Pack final row pixels.*/
|
||||
packuswb mm7,mm2
|
||||
/*Advance src.*/
|
||||
lea SRC,[SRC+YSTRIDE*2]
|
||||
/*#0 Write row.*/
|
||||
movq [DST],mm3
|
||||
/*#1 Write row.*/
|
||||
movq [DST+YSTRIDE],mm7
|
||||
/*Advance dst.*/
|
||||
lea DST,[DST+YSTRIDE*2]
|
||||
mov _residue,RESIDUE
|
||||
mov _dst,DST
|
||||
mov _src,SRC
|
||||
#undef DST
|
||||
#undef SRC
|
||||
#undef YSTRIDE
|
||||
#undef RESIDUE
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
|
||||
int i;
|
||||
/*Zero mm7.*/
|
||||
__asm pxor mm7,mm7;
|
||||
for(i=4;i-->0;){
|
||||
__asm{
|
||||
#define SRC1 ecx
|
||||
#define SRC2 edi
|
||||
#define YSTRIDE esi
|
||||
#define RESIDUE edx
|
||||
#define DST eax
|
||||
mov YSTRIDE,_ystride
|
||||
mov DST,_dst
|
||||
mov RESIDUE,_residue
|
||||
mov SRC1,_src1
|
||||
mov SRC2,_src2
|
||||
/*#0 Load src1.*/
|
||||
movq mm0,[SRC1]
|
||||
/*#0 Load src2.*/
|
||||
movq mm2,[SRC2]
|
||||
/*#0 Copy src1.*/
|
||||
movq mm1,mm0
|
||||
/*#0 Copy src2.*/
|
||||
movq mm3,mm2
|
||||
/*#1 Load src1.*/
|
||||
movq mm4,[SRC1+YSTRIDE]
|
||||
/*#0 Unpack lower src1.*/
|
||||
punpcklbw mm0,mm7
|
||||
/*#1 Load src2.*/
|
||||
movq mm5,[SRC2+YSTRIDE]
|
||||
/*#0 Unpack higher src1.*/
|
||||
punpckhbw mm1,mm7
|
||||
/*#0 Unpack lower src2.*/
|
||||
punpcklbw mm2,mm7
|
||||
/*#0 Unpack higher src2.*/
|
||||
punpckhbw mm3,mm7
|
||||
/*Advance src1 ptr.*/
|
||||
lea SRC1,[SRC1+YSTRIDE*2]
|
||||
/*Advance src2 ptr.*/
|
||||
lea SRC2,[SRC2+YSTRIDE*2]
|
||||
/*#0 Lower src1+src2.*/
|
||||
paddsw mm0,mm2
|
||||
/*#0 Higher src1+src2.*/
|
||||
paddsw mm1,mm3
|
||||
/*#1 Copy src1.*/
|
||||
movq mm2,mm4
|
||||
/*#0 Build lo average.*/
|
||||
psraw mm0,1
|
||||
/*#1 Copy src2.*/
|
||||
movq mm3,mm5
|
||||
/*#1 Unpack lower src1.*/
|
||||
punpcklbw mm4,mm7
|
||||
/*#0 Build hi average.*/
|
||||
psraw mm1,1
|
||||
/*#1 Unpack higher src1.*/
|
||||
punpckhbw mm2,mm7
|
||||
/*#0 low+=residue.*/
|
||||
paddsw mm0,[RESIDUE]
|
||||
/*#1 Unpack lower src2.*/
|
||||
punpcklbw mm5,mm7
|
||||
/*#0 high+=residue.*/
|
||||
paddsw mm1,[8+RESIDUE]
|
||||
/*#1 Unpack higher src2.*/
|
||||
punpckhbw mm3,mm7
|
||||
/*#1 Lower src1+src2.*/
|
||||
paddsw mm5,mm4
|
||||
/*#0 Pack and saturate.*/
|
||||
packuswb mm0,mm1
|
||||
/*#1 Higher src1+src2.*/
|
||||
paddsw mm3,mm2
|
||||
/*#0 Write row.*/
|
||||
movq [DST],mm0
|
||||
/*#1 Build lo average.*/
|
||||
psraw mm5,1
|
||||
/*#1 Build hi average.*/
|
||||
psraw mm3,1
|
||||
/*#1 low+=residue.*/
|
||||
paddsw mm5,[16+RESIDUE]
|
||||
/*#1 high+=residue.*/
|
||||
paddsw mm3,[24+RESIDUE]
|
||||
/*#1 Pack and saturate.*/
|
||||
packuswb mm5,mm3
|
||||
/*#1 Write row ptr.*/
|
||||
movq [DST+YSTRIDE],mm5
|
||||
/*Advance residue ptr.*/
|
||||
add RESIDUE,32
|
||||
/*Advance dest ptr.*/
|
||||
lea DST,[DST+YSTRIDE*2]
|
||||
mov _dst,DST
|
||||
mov _residue,RESIDUE
|
||||
mov _src1,SRC1
|
||||
mov _src2,SRC2
|
||||
#undef SRC1
|
||||
#undef SRC2
|
||||
#undef YSTRIDE
|
||||
#undef RESIDUE
|
||||
#undef DST
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void oc_restore_fpu_mmx(void){
|
||||
__asm emms;
|
||||
}
|
||||
|
||||
#endif
|
||||
592
engine/thirdparty/libtheora/x86_vc/mmxidct.c
vendored
Normal file
592
engine/thirdparty/libtheora/x86_vc/mmxidct.c
vendored
Normal file
|
|
@ -0,0 +1,592 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*MMX acceleration of Theora's iDCT.
|
||||
Originally written by Rudolf Marek, based on code from On2's VP3.*/
|
||||
#include "x86int.h"
|
||||
#include "../dct.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*These are offsets into the table of constants below.
  They are byte offsets: OC_MID() adds them directly to the table address.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).
  They follow the single 8-byte row of 8's.*/
#define OC_COSINE_OFFSET (8)
/*A row of 8's.*/
#define OC_EIGHT_OFFSET  (0)
|
||||
|
||||
|
||||
|
||||
/*A table of constants used by the MMX routines.*/
|
||||
static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
|
||||
8, 8, 8, 8,
|
||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||
(ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
|
||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
||||
(ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
|
||||
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
|
||||
(ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
|
||||
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
|
||||
(ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
|
||||
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
|
||||
(ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
|
||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||
(ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
|
||||
(ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
|
||||
};
|
||||
|
||||
/*Common first part of the row and column iDCT on four lanes at once.
  It reads OC_I(0...3,_x) and OC_J(4...7,_x), multiplies by the constants
   OC_C(1...7), and uses OC_I(1,_y) and OC_I(2,_y) as temporary storage.
  The intermediate values it leaves in mm0...mm7 and in OC_I(2,_y) are
   consumed by the tail of OC_ROW_IDCT or OC_COLUMN_IDCT.
  OC_I, OC_J and OC_C must be defined at the point of expansion.*/
/*38 cycles*/
#define OC_IDCT_BEGIN(_y,_x) __asm{ \
  __asm movq mm2,OC_I(3,_x) \
  __asm movq mm6,OC_C(3) \
  __asm movq mm4,mm2 \
  __asm movq mm7,OC_J(5,_x) \
  __asm pmulhw mm4,mm6 \
  __asm movq mm1,OC_C(5) \
  __asm pmulhw mm6,mm7 \
  __asm movq mm5,mm1 \
  __asm pmulhw mm1,mm2 \
  __asm movq mm3,OC_I(1,_x) \
  __asm pmulhw mm5,mm7 \
  __asm movq mm0,OC_C(1) \
  __asm paddw mm4,mm2 \
  __asm paddw mm6,mm7 \
  __asm paddw mm2,mm1 \
  __asm movq mm1,OC_J(7,_x) \
  __asm paddw mm7,mm5 \
  __asm movq mm5,mm0 \
  __asm pmulhw mm0,mm3 \
  __asm paddw mm4,mm7 \
  __asm pmulhw mm5,mm1 \
  __asm movq mm7,OC_C(7) \
  __asm psubw mm6,mm2 \
  __asm paddw mm0,mm3 \
  __asm pmulhw mm3,mm7 \
  __asm movq mm2,OC_I(2,_x) \
  __asm pmulhw mm7,mm1 \
  __asm paddw mm5,mm1 \
  __asm movq mm1,mm2 \
  __asm pmulhw mm2,OC_C(2) \
  __asm psubw mm3,mm5 \
  __asm movq mm5,OC_J(6,_x) \
  __asm paddw mm0,mm7 \
  __asm movq mm7,mm5 \
  __asm psubw mm0,mm4 \
  __asm pmulhw mm5,OC_C(2) \
  __asm paddw mm2,mm1 \
  __asm pmulhw mm1,OC_C(6) \
  __asm paddw mm4,mm4 \
  __asm paddw mm4,mm0 \
  __asm psubw mm3,mm6 \
  __asm paddw mm5,mm7 \
  __asm paddw mm6,mm6 \
  __asm pmulhw mm7,OC_C(6) \
  __asm paddw mm6,mm3 \
  __asm movq OC_I(1,_y),mm4 \
  __asm psubw mm1,mm5 \
  __asm movq mm4,OC_C(4) \
  __asm movq mm5,mm3 \
  __asm pmulhw mm3,mm4 \
  __asm paddw mm7,mm2 \
  __asm movq OC_I(2,_y),mm6 \
  __asm movq mm2,mm0 \
  __asm movq mm6,OC_I(0,_x) \
  __asm pmulhw mm0,mm4 \
  __asm paddw mm5,mm3 \
  __asm movq mm3,OC_J(4,_x) \
  __asm psubw mm5,mm1 \
  __asm paddw mm2,mm0 \
  __asm psubw mm6,mm3 \
  __asm movq mm0,mm6 \
  __asm pmulhw mm6,mm4 \
  __asm paddw mm3,mm3 \
  __asm paddw mm1,mm1 \
  __asm paddw mm3,mm0 \
  __asm paddw mm1,mm5 \
  __asm pmulhw mm4,mm3 \
  __asm paddw mm6,mm0 \
  __asm psubw mm6,mm2 \
  __asm paddw mm2,mm2 \
  __asm movq mm0,OC_I(1,_y) \
  __asm paddw mm2,mm6 \
  __asm paddw mm4,mm3 \
  __asm psubw mm2,mm1 \
}
|
||||
|
||||
/*Row pass of the iDCT: OC_IDCT_BEGIN followed by the final butterflies.
  No rounding or shifting is applied here.
  On exit R1 has been stored to OC_I(1,_y) and the other results are left in
   registers (r0, r2...r7 in the comments are mm0, mm2...mm7), which is the
   layout OC_TRANSPOSE expects on entry.*/
/*38+8=46 cycles.*/
#define OC_ROW_IDCT(_y,_x) __asm{ \
  OC_IDCT_BEGIN(_y,_x) \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm3,mm3 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  __asm paddw mm5,mm5 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm0,mm0 \
  /*Save R1.*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
}
|
||||
|
||||
/*The following macro does two 4x4 transposes in place.
  At entry, we assume:
    r0 = a3 a2 a1 a0
  I(1) = b3 b2 b1 b0
    r2 = c3 c2 c1 c0
    r3 = d3 d2 d1 d0

    r4 = e3 e2 e1 e0
    r5 = f3 f2 f1 f0
    r6 = g3 g2 g1 g0
    r7 = h3 h2 h1 h0

  At exit, we have:
  I(0) = d0 c0 b0 a0
  I(1) = d1 c1 b1 a1
  I(2) = d2 c2 b2 a2
  I(3) = d3 c3 b3 a3

  J(4) = h0 g0 f0 e0
  J(5) = h1 g1 f1 e1
  J(6) = h2 g2 f2 e2
  J(7) = h3 g3 f3 e3

  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
  J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.

  Since r1 is free at entry, we calculate the Js first.
  r0 is parked in I(0) at the start to free one more register, and reloaded
   for the second transpose.*/
/*19 cycles.*/
#define OC_TRANSPOSE(_y) __asm{ \
  __asm movq mm1,mm4 \
  __asm punpcklwd mm4,mm5 \
  __asm movq OC_I(0,_y),mm0 \
  __asm punpckhwd mm1,mm5 \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm1 \
  __asm movq OC_J(4,_y),mm4 \
  __asm punpckhwd mm0,mm7 \
  __asm movq OC_J(5,_y),mm5 \
  __asm punpckhdq mm6,mm0 \
  __asm movq mm4,OC_I(0,_y) \
  __asm punpckldq mm1,mm0 \
  __asm movq mm5,OC_I(1,_y) \
  __asm movq mm0,mm4 \
  __asm movq OC_J(7,_y),mm6 \
  __asm punpcklwd mm0,mm5 \
  __asm movq OC_J(6,_y),mm1 \
  __asm punpckhwd mm4,mm5 \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm4 \
  __asm movq OC_I(0,_y),mm0 \
  __asm punpckhwd mm5,mm3 \
  __asm movq OC_I(1,_y),mm1 \
  __asm punpckhdq mm4,mm5 \
  __asm punpckldq mm2,mm5 \
  __asm movq OC_I(3,_y),mm4 \
  __asm movq OC_I(2,_y),mm2 \
}
|
||||
|
||||
/*Column pass of the iDCT, operating in place on _y.
  It runs OC_IDCT_BEGIN and the final butterflies, then normalizes every
   result by adding the rounding constant OC_8 and shifting right
   arithmetically by 4 (the NRn values in the comments).
  All eight results are stored back to OC_I(0...3,_y) and OC_J(4...7,_y).*/
/*38+19=57 cycles.*/
#define OC_COLUMN_IDCT(_y) __asm{ \
  OC_IDCT_BEGIN(_y,_y) \
  __asm paddw mm2,OC_8 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r2=NR2*/ \
  __asm psraw mm2,4 \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=NR1*/ \
  __asm psraw mm1,4 \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*Store NR2 at I(2).*/ \
  __asm movq OC_I(2,_y),mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*Store NR1 at I(1).*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm4,OC_8 \
  /*r3=D'+D'*/ \
  __asm paddw mm3,mm3 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r4=NR4*/ \
  __asm psraw mm4,4 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  /*r3=NR3*/ \
  __asm psraw mm3,4 \
  __asm paddw mm6,OC_8 \
  /*r5=B''+B''*/ \
  __asm paddw mm5,mm5 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r6=NR6*/ \
  __asm psraw mm6,4 \
  /*Store NR4 at J(4).*/ \
  __asm movq OC_J(4,_y),mm4 \
  /*r5=NR5*/ \
  __asm psraw mm5,4 \
  /*Store NR3 at I(3).*/ \
  __asm movq OC_I(3,_y),mm3 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm7,OC_8 \
  /*r0=C'+C'*/ \
  __asm paddw mm0,mm0 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
  /*r7=NR7*/ \
  __asm psraw mm7,4 \
  /*Store NR6 at J(6).*/ \
  __asm movq OC_J(6,_y),mm6 \
  /*r0=NR0*/ \
  __asm psraw mm0,4 \
  /*Store NR5 at J(5).*/ \
  __asm movq OC_J(5,_y),mm5 \
  /*Store NR7 at J(7).*/ \
  __asm movq OC_J(7,_y),mm7 \
  /*Store NR0 at I(0).*/ \
  __asm movq OC_I(0,_y),mm0 \
}
|
||||
|
||||
/*Memory operand for row _i of the constant table, starting at byte offset _m.
  CONSTS must name the register holding the table address; rows are 8 bytes.*/
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
/*The row holding cosine number _i, for _i in 1...7.*/
#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,_i-1)
/*The row of 8's used as the rounding constant.*/
#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
|
||||
|
||||
/*Full 8x8 inverse DCT using all 64 input coefficients.
  _y: The 64-entry output block.
  _x: The 64-entry input block; it is overwritten with zeros before returning.
  The transform runs a row pass on each 8x4 half of _x, transposing the
   results into _y, then a column pass in place on each half of _y.
  The column pass adds 8 and shifts right by 4 to normalize the output.*/
static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  int i;
  /*This routine accepts an 8x8 matrix, but in partially transposed form.
    Every 4x4 block is transposed.*/
  __asm{
#define CONSTS eax
#define Y edx
#define X ecx
    mov CONSTS,offset OC_IDCT_CONSTS
    mov Y,_y
    mov X,_x
    /*Row pass, first half: rows are 16 bytes apart, and the J operands sit
       8 bytes into the row.*/
#define OC_I(_k,_y)   [(_y)+(_k)*16]
#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+8]
    OC_ROW_IDCT(Y,X)
    OC_TRANSPOSE(Y)
#undef  OC_I
#undef  OC_J
    /*Row pass, second half: the same layout, 64 bytes further on.*/
#define OC_I(_k,_y)   [(_y)+(_k)*16+64]
#define OC_J(_k,_y)   [(_y)+((_k)-4)*16+72]
    OC_ROW_IDCT(Y,X)
    OC_TRANSPOSE(Y)
#undef  OC_I
#undef  OC_J
    /*Column pass, low 8 bytes of each row.*/
#define OC_I(_k,_y)   [(_y)+(_k)*16]
#define OC_J(_k,_y)   OC_I(_k,_y)
    OC_COLUMN_IDCT(Y)
#undef  OC_I
#undef  OC_J
    /*Column pass, high 8 bytes of each row.*/
#define OC_I(_k,_y)   [(_y)+(_k)*16+8]
#define OC_J(_k,_y)   OC_I(_k,_y)
    OC_COLUMN_IDCT(Y)
#undef  OC_I
#undef  OC_J
#undef  CONSTS
#undef  Y
#undef  X
  }
  /*Clear the input: each iteration zeros 16 coefficients (32 bytes).*/
  __asm pxor mm0,mm0;
  for(i=0;i<4;i++){
    ogg_int16_t *x;
    x=_x+16*i;
#define X ecx
    __asm{
      mov X,x
      movq [X+0x00],mm0
      movq [X+0x08],mm0
      movq [X+0x10],mm0
      movq [X+0x18],mm0
    }
#undef  X
  }
}
|
||||
|
||||
/*Reduced version of OC_IDCT_BEGIN that reads only OC_I(0...3,_x) and no
   OC_J operands.
  It uses OC_I(1,_y) and OC_I(2,_y) as temporary storage, like the full
   version.
  NOTE(review): presumably intended for blocks whose remaining inputs are
   known to be zero (the "_10" suffix suggests at most 10 coefficients) --
   confirm against the caller.*/
/*25 cycles.*/
#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
  __asm movq mm2,OC_I(3,_x) \
  __asm nop \
  __asm movq mm6,OC_C(3) \
  __asm movq mm4,mm2 \
  __asm movq mm1,OC_C(5) \
  __asm pmulhw mm4,mm6 \
  __asm movq mm3,OC_I(1,_x) \
  __asm pmulhw mm1,mm2 \
  __asm movq mm0,OC_C(1) \
  __asm paddw mm4,mm2 \
  __asm pxor mm6,mm6 \
  __asm paddw mm2,mm1 \
  __asm movq mm5,OC_I(2,_x) \
  __asm pmulhw mm0,mm3 \
  __asm movq mm1,mm5 \
  __asm paddw mm0,mm3 \
  __asm pmulhw mm3,OC_C(7) \
  __asm psubw mm6,mm2 \
  __asm pmulhw mm5,OC_C(2) \
  __asm psubw mm0,mm4 \
  __asm movq mm7,OC_I(2,_x) \
  __asm paddw mm4,mm4 \
  __asm paddw mm7,mm5 \
  __asm paddw mm4,mm0 \
  __asm pmulhw mm1,OC_C(6) \
  __asm psubw mm3,mm6 \
  __asm movq OC_I(1,_y),mm4 \
  __asm paddw mm6,mm6 \
  __asm movq mm4,OC_C(4) \
  __asm paddw mm6,mm3 \
  __asm movq mm5,mm3 \
  __asm pmulhw mm3,mm4 \
  __asm movq OC_I(2,_y),mm6 \
  __asm movq mm2,mm0 \
  __asm movq mm6,OC_I(0,_x) \
  __asm pmulhw mm0,mm4 \
  __asm paddw mm5,mm3 \
  __asm paddw mm2,mm0 \
  __asm psubw mm5,mm1 \
  __asm pmulhw mm6,mm4 \
  __asm paddw mm6,OC_I(0,_x) \
  __asm paddw mm1,mm1 \
  __asm movq mm4,mm6 \
  __asm paddw mm1,mm5 \
  __asm psubw mm6,mm2 \
  __asm paddw mm2,mm2 \
  __asm movq mm0,OC_I(1,_y) \
  __asm paddw mm2,mm6 \
  __asm psubw mm2,mm1 \
  __asm nop \
}
|
||||
|
||||
/*Row pass built on OC_IDCT_BEGIN_10, followed by the final butterflies.
  No rounding or shifting is applied here.
  On exit R1 has been stored to OC_I(1,_y) and the other results are left in
   registers (r0, r2...r7 in the comments are mm0, mm2...mm7).*/
/*25+8=33 cycles.*/
#define OC_ROW_IDCT_10(_y,_x) __asm{ \
  OC_IDCT_BEGIN_10(_y,_x) \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm3,mm3 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  __asm paddw mm5,mm5 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm0,mm0 \
  /*Save R1.*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
}
|
||||
|
||||
/*25+19=44 cycles'*/
|
||||
/*Column pass of the reduced iDCT used by oc_idct8x8_10().
  Expands OC_IDCT_BEGIN_10(_y,_y) and then finishes the butterflies.
  Every result has OC_8 added (directly or through the term it is built from)
   and is arithmetically shifted right by 4 before it is stored; these
   normalized values are called NR0...NR7 below.
  NR0...NR3 are stored through OC_I() and NR4...NR7 through OC_J(), both of
   which must be defined where this macro is expanded.
  OC_8 is defined elsewhere (presumably 8 in every 16-bit lane, i.e. the
   rounding term for the shift by 4).*/
#define OC_COLUMN_IDCT_10(_y) __asm{ \
  OC_IDCT_BEGIN_10(_y,_y) \
  /*Bias mm2 (R2) with OC_8 ahead of its shift.*/ \
  __asm paddw mm2,OC_8 \
  /*mm1=H'+H'.*/ \
  __asm paddw mm1,mm1 \
  /*mm1=R1=A''+H' (inherits the bias from mm2).*/ \
  __asm paddw mm1,mm2 \
  /*mm2=NR2.*/ \
  __asm psraw mm2,4 \
  /*mm4=E'=E-G.*/ \
  __asm psubw mm4,mm7 \
  /*mm1=NR1.*/ \
  __asm psraw mm1,4 \
  /*mm3=D', reloaded from OC_I(2,_y) before that slot is overwritten.*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*mm7=G+G.*/ \
  __asm paddw mm7,mm7 \
  /*OC_I(2)=NR2.*/ \
  __asm movq OC_I(2,_y),mm2 \
  /*mm7=G'=E+G.*/ \
  __asm paddw mm7,mm4 \
  /*OC_I(1)=NR1.*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*mm4=R4=E'-D', then biased with OC_8.*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm4,OC_8 \
  /*mm3=D'+D'.*/ \
  __asm paddw mm3,mm3 \
  /*mm3=R3=E'+D' (inherits the bias from mm4).*/ \
  __asm paddw mm3,mm4 \
  /*mm4=NR4.*/ \
  __asm psraw mm4,4 \
  /*mm6=R6=F'-B''.*/ \
  __asm psubw mm6,mm5 \
  /*mm3=NR3.*/ \
  __asm psraw mm3,4 \
  /*Bias mm6 with OC_8.*/ \
  __asm paddw mm6,OC_8 \
  /*mm5=B''+B''.*/ \
  __asm paddw mm5,mm5 \
  /*mm5=R5=F'+B'' (inherits the bias from mm6).*/ \
  __asm paddw mm5,mm6 \
  /*mm6=NR6.*/ \
  __asm psraw mm6,4 \
  /*OC_J(4)=NR4.*/ \
  __asm movq OC_J(4,_y),mm4 \
  /*mm5=NR5.*/ \
  __asm psraw mm5,4 \
  /*OC_I(3)=NR3.*/ \
  __asm movq OC_I(3,_y),mm3 \
  /*mm7=R7=G'-C', then biased with OC_8.*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm7,OC_8 \
  /*mm0=C'+C'.*/ \
  __asm paddw mm0,mm0 \
  /*mm0=R0=G'+C' (inherits the bias from mm7).*/ \
  __asm paddw mm0,mm7 \
  /*mm7=NR7.*/ \
  __asm psraw mm7,4 \
  /*OC_J(6)=NR6.*/ \
  __asm movq OC_J(6,_y),mm6 \
  /*mm0=NR0.*/ \
  __asm psraw mm0,4 \
  /*OC_J(5)=NR5.*/ \
  __asm movq OC_J(5,_y),mm5 \
  /*OC_J(7)=NR7.*/ \
  __asm movq OC_J(7,_y),mm7 \
  /*OC_I(0)=NR0.*/ \
  __asm movq OC_I(0,_y),mm0 \
}
|
||||
|
||||
/*Reduced iDCT selected by oc_idct8x8_mmx() when _last_zzi<=10.
  _y: Destination for the transformed block.
  _x: Source coefficients.
      Before returning, 8 bytes at each of the byte offsets 0x00, 0x10, 0x20
       and 0x30 of _x are overwritten with zeros.*/
static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
  __asm{
    /*Register assignments for this block.
      CONSTS is loaded with the address of OC_IDCT_CONSTS, presumably for the
       OC_C()/OC_8 operands used by the iDCT macros (defined elsewhere).*/
#define CONSTS eax
#define Y edx
#define X ecx
    mov CONSTS,offset OC_IDCT_CONSTS
    mov Y,_y
    mov X,_x
    /*Row pass: OC_I() steps 16 bytes per index, OC_J() addresses the second
       8-byte half, four indices back.*/
#define OC_I(_k,_y) [(_y)+(_k)*16]
#define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
    /*Done with dequant, descramble, and partial transpose.
      Now do the iDCT itself.*/
    OC_ROW_IDCT_10(Y,X)
    OC_TRANSPOSE(Y)
#undef OC_I
#undef OC_J
    /*Column pass over the first 8-byte half of each 16-byte row of _y.*/
#define OC_I(_k,_y) [(_y)+(_k)*16]
#define OC_J(_k,_y) OC_I(_k,_y)
    OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
    /*Column pass over the second 8-byte half of each 16-byte row of _y.*/
#define OC_I(_k,_y) [(_y)+(_k)*16+8]
#define OC_J(_k,_y) OC_I(_k,_y)
    OC_COLUMN_IDCT_10(Y)
#undef OC_I
#undef OC_J
#undef CONSTS
#undef Y
#undef X
  }
  /*Zero the part of the input block described in the header comment.*/
#define X ecx
  __asm{
    pxor mm0,mm0;
    mov X,_x
    movq [X+0x00],mm0
    movq [X+0x10],mm0
    movq [X+0x20],mm0
    movq [X+0x30],mm0
  }
#undef X
}
|
||||
|
||||
/*Performs an inverse 8x8 Type-II DCT transform.
|
||||
The input is assumed to be scaled by a factor of 4 relative to the orthonormal
|
||||
version of the transform.*/
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
  /*_last_zzi is NOT simply the number of coefficients decoded for this block:
     it is the value of zzi BEFORE the final token of the block was decoded.
    Usually that final token is an EOB token (the continuation of an EOB run
     from a previous block counts), and then the two values agree.
    They differ when the final token was not an EOB token but filled the block
     up to exactly 64 coefficients, in which case _last_zzi is less than 64:
    - If that token was not a pure zero run, _last_zzi is at least 46, which
       does not affect any of the cases in this routine.
    - If that token WAS a pure zero run of length 63, _last_zzi is 1 although
       64 coefficients were decoded, so the special case below is taken where
       the real coefficient count would not have taken it.
    - A zero run of length 64 gives _last_zzi a value of 0, but the DC
       coefficient is still processed, since DC prediction may have made it
       non-zero.
    Although convoluted, this is arguably the correct behavior: it allows a
     smaller transform to be used when the block ends with a long zero run
     instead of a normal EOB token.
    It could be smarter... multiple separate zero runs at the end of a block
     will fool it, but an encoder that generates these really deserves what it
     gets.
    Needless to say we inherited this approach from VP3.*/
  /*Anything past the first 10 positions needs the full transform.*/
  if(_last_zzi>10){
    oc_idct8x8_slow(_y,_x);
    return;
  }
  oc_idct8x8_10(_y,_x);
}
|
||||
|
||||
#endif
|
||||
219
engine/thirdparty/libtheora/x86_vc/mmxloop.h
vendored
Normal file
219
engine/thirdparty/libtheora/x86_vc/mmxloop.h
vendored
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
#if !defined(_x86_vc_mmxloop_H)
|
||||
# define _x86_vc_mmxloop_H (1)
|
||||
# include <stddef.h>
|
||||
# include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*On entry, mm0={a0,...,a7}, mm1={b0,...,b7}, mm2={c0,...,c7}, mm3={d0,...d7}.
|
||||
On exit, mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} and
|
||||
mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}; mm0 and mm3 are clobbered.*/
|
||||
/*Core of the loop filter for 8 pixels, expanded by both OC_LOOP_FILTER_V_MMX
   and OC_LOOP_FILTER_H_MMX.
  LL must name a register pointing at 8 bytes that each hold the filter limit
   L: they are loaded below and subtracted twice from 0xFF to form 255-2*L.
  Besides producing the results in mm1 and mm2, the instructions below
   overwrite mm0 and mm3...mm7.*/
#define OC_LOOP_FILTER8_MMX __asm{ \
  /*mm7=0, used to widen bytes to words.*/ \
  __asm pxor mm7,mm7 \
  /*mm6:mm0={a0,...,a7} as 16-bit words.*/ \
  __asm movq mm6,mm0 \
  __asm punpcklbw mm0,mm7 \
  __asm punpckhbw mm6,mm7 \
  /*mm5:mm3={d0,...,d7} as 16-bit words.*/ \
  __asm movq mm5,mm3 \
  __asm punpcklbw mm3,mm7 \
  __asm punpckhbw mm5,mm7 \
  /*mm6:mm0={a0-d0,...,a7-d7}*/ \
  __asm psubw mm0,mm3 \
  __asm psubw mm6,mm5 \
  /*mm3:mm1={b0,...,b7} as 16-bit words (the first copy of c, into mm4, is \
     interleaved here).*/ \
  __asm movq mm3,mm1 \
  __asm punpcklbw mm1,mm7 \
  __asm movq mm4,mm2 \
  __asm punpckhbw mm3,mm7 \
  /*mm5:mm4={c0,...,c7} as 16-bit words; mm2 keeps the packed bytes.*/ \
  __asm movq mm5,mm2 \
  __asm punpcklbw mm4,mm7 \
  __asm punpckhbw mm5,mm7 \
  /*mm7={3}x4 (all ones shifted right by 14) \
    mm5:mm4={c0-b0,...,c7-b7}*/ \
  __asm pcmpeqw mm7,mm7 \
  __asm psubw mm4,mm1 \
  __asm psrlw mm7,14 \
  __asm psubw mm5,mm3 \
  /*Multiply the differences by 3.*/ \
  __asm pmullw mm4,mm7 \
  __asm pmullw mm5,mm7 \
  /*mm7={4}x4 (3>>1<<2) \
    mm5:mm4=f={a0-d0+3*(c0-b0),...,a7-d7+3*(c7-b7)} \
    mm0=the 8 limit bytes at [LL]*/ \
  __asm psrlw mm7,1 \
  __asm paddw mm4,mm0 \
  __asm psllw mm7,2 \
  __asm movq mm0,[LL] \
  __asm paddw mm5,mm6 \
  /*R_i has the range [-127,128], so -R_i is computed instead. \
    mm4=-R_i=-(f+4>>3)=0xFF^(f-4>>3)*/ \
  __asm psubw mm4,mm7 \
  __asm psubw mm5,mm7 \
  __asm psraw mm4,3 \
  __asm psraw mm5,3 \
  __asm pcmpeqb mm7,mm7 \
  __asm packsswb mm4,mm5 \
  __asm pxor mm6,mm6 \
  __asm pxor mm4,mm7 \
  /*mm1={b0,...,b7} repacked into bytes.*/ \
  __asm packuswb mm1,mm3 \
  /*Now compute lflim of -mm4, cf. Section 7.10 of the spec.*/ \
  /*No instruction adds a signed byte to an unsigned byte with unsigned \
     saturation, so the work is split by sign (the other option is to work \
     in 16 bits, but working in 8 bits gives much better parallelism). \
    abs(R_i) is computed, with a mask of the negative terms saved in mm6. \
    Then mm4=abs(lflim(R_i,L))=min(abs(R_i),max(2*L-abs(R_i),0)). \
    Finally, mm4 is split into positive and negative pieces using the mask \
     in mm6, and these are added and subtracted as appropriate.*/ \
  /*mm6=sign mask of -R_i \
    mm4=abs(-R_i) \
    mm7=255-2*L*/ \
  __asm pcmpgtb mm6,mm4 \
  __asm psubb mm7,mm0 \
  __asm pxor mm4,mm6 \
  __asm psubb mm7,mm0 \
  __asm psubb mm4,mm6 \
  /*mm7=255-max(2*L-abs(R_i),0)*/ \
  __asm paddusb mm7,mm4 \
  /*mm4=min(abs(R_i),max(2*L-abs(R_i),0))*/ \
  __asm paddusb mm4,mm7 \
  __asm psubusb mm4,mm7 \
  /*Split mm4 by the original sign of -R_i: mm4 keeps the bytes selected by \
     the mask in mm6, and mm6 receives the remaining ones.*/ \
  __asm movq mm5,mm4 \
  __asm pand mm4,mm6 \
  __asm pandn mm6,mm5 \
  /*mm1={b0+lflim(R_0,L),...,b7+lflim(R_7,L)} \
    mm2={c0-lflim(R_0,L),...,c7-lflim(R_7,L)}*/ \
  __asm paddusb mm1,mm4 \
  __asm psubusb mm2,mm4 \
  __asm psubusb mm1,mm6 \
  __asm paddusb mm2,mm6 \
}
|
||||
|
||||
/*Applies the loop filter across a horizontal edge, 8 pixels wide.
  The 8-byte rows at _pix-2*_ystride, _pix-_ystride, _pix and _pix+_ystride
   are loaded as a, b, c and d, OC_LOOP_FILTER8_MMX is run, and the updated b
   and c rows are stored back.
  PIX, YSTRIDE, YSTRIDE3 and LL must be #defined to register names where this
   macro is expanded.*/
#define OC_LOOP_FILTER_V_MMX(_pix,_ystride,_ll) \
  do{ \
    /*The pointer arguments are copied into locals first; using _pix directly \
       produced compilation errors like: \
      "error C2425: 'SHL' : non-constant expression in 'second operand'".*/ \
    unsigned char *pix__; \
    unsigned char *ll__; \
    ll__=(_ll); \
    pix__=(_pix); \
    __asm mov YSTRIDE,_ystride \
    __asm mov LL,ll__ \
    __asm mov PIX,pix__ \
    /*PIX=_pix-2*_ystride, the a row.*/ \
    __asm sub PIX,YSTRIDE \
    __asm sub PIX,YSTRIDE \
    /*mm0={a0,...,a7}*/ \
    __asm movq mm0,[PIX] \
    /*YSTRIDE3=_ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*mm3={d0,...,d7}*/ \
    __asm movq mm3,[PIX+YSTRIDE3] \
    /*mm1={b0,...,b7}*/ \
    __asm movq mm1,[PIX+YSTRIDE] \
    /*mm2={c0,...,c7}*/ \
    __asm movq mm2,[PIX+YSTRIDE*2] \
    OC_LOOP_FILTER8_MMX \
    /*Store the filtered b and c rows.*/ \
    __asm movq [PIX+YSTRIDE],mm1 \
    __asm movq [PIX+YSTRIDE*2],mm2 \
  } \
  while(0)
|
||||
|
||||
/*Applies the loop filter across a vertical edge, 8 pixels tall.
  Four bytes are read from each of 8 consecutive rows starting at _pix-2 and
   transposed into the a, b, c and d vectors, OC_LOOP_FILTER8_MMX is run, and
   the two filtered middle bytes (b and c) of every row are written back at
   _pix-1.
  PIX, YSTRIDE, YSTRIDE3, LL, D and D_WORD must be #defined to register names
   where this macro is expanded; D_WORD has to be the 16-bit name of the low
   half of D.*/
#define OC_LOOP_FILTER_H_MMX(_pix,_ystride,_ll) \
  do{ \
    /*The pointer arguments are copied into locals first; using _ll directly \
       produced compilation errors like: \
      "error C2443: operand size conflict".*/ \
    unsigned char *ll__; \
    unsigned char *pix__; \
    ll__=(_ll); \
    pix__=(_pix)-2; \
    __asm mov PIX,pix__ \
    __asm mov YSTRIDE,_ystride \
    __asm mov LL,ll__ \
    /*mm0=x x x x d0 c0 b0 a0*/ \
    __asm movd mm0,[PIX] \
    /*mm1=x x x x d1 c1 b1 a1*/ \
    __asm movd mm1,[PIX+YSTRIDE] \
    /*YSTRIDE3=_ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*mm2=x x x x d2 c2 b2 a2*/ \
    __asm movd mm2,[PIX+YSTRIDE*2] \
    /*D=address of row 4 \
      mm3=x x x x d3 c3 b3 a3*/ \
    __asm lea D,[PIX+YSTRIDE*4] \
    __asm movd mm3,[PIX+YSTRIDE3] \
    /*mm4=x x x x d4 c4 b4 a4*/ \
    __asm movd mm4,[D] \
    /*mm5=x x x x d5 c5 b5 a5*/ \
    __asm movd mm5,[D+YSTRIDE] \
    /*mm6=x x x x d6 c6 b6 a6*/ \
    __asm movd mm6,[D+YSTRIDE*2] \
    /*mm7=x x x x d7 c7 b7 a7*/ \
    __asm movd mm7,[D+YSTRIDE3] \
    /*mm0=d1 d0 c1 c0 b1 b0 a1 a0*/ \
    __asm punpcklbw mm0,mm1 \
    /*mm2=d3 d2 c3 c2 b3 b2 a3 a2*/ \
    __asm punpcklbw mm2,mm3 \
    /*mm3=d1 d0 c1 c0 b1 b0 a1 a0*/ \
    __asm movq mm3,mm0 \
    /*mm0=b3 b2 b1 b0 a3 a2 a1 a0*/ \
    __asm punpcklwd mm0,mm2 \
    /*mm3=d3 d2 d1 d0 c3 c2 c1 c0*/ \
    __asm punpckhwd mm3,mm2 \
    /*mm1=b3 b2 b1 b0 a3 a2 a1 a0*/ \
    __asm movq mm1,mm0 \
    /*mm4=d5 d4 c5 c4 b5 b4 a5 a4*/ \
    __asm punpcklbw mm4,mm5 \
    /*mm6=d7 d6 c7 c6 b7 b6 a7 a6*/ \
    __asm punpcklbw mm6,mm7 \
    /*mm5=d5 d4 c5 c4 b5 b4 a5 a4*/ \
    __asm movq mm5,mm4 \
    /*mm4=b7 b6 b5 b4 a7 a6 a5 a4*/ \
    __asm punpcklwd mm4,mm6 \
    /*mm5=d7 d6 d5 d4 c7 c6 c5 c4*/ \
    __asm punpckhwd mm5,mm6 \
    /*mm2=d3 d2 d1 d0 c3 c2 c1 c0*/ \
    __asm movq mm2,mm3 \
    /*mm0=a7 a6 a5 a4 a3 a2 a1 a0*/ \
    __asm punpckldq mm0,mm4 \
    /*mm1=b7 b6 b5 b4 b3 b2 b1 b0*/ \
    __asm punpckhdq mm1,mm4 \
    /*mm2=c7 c6 c5 c4 c3 c2 c1 c0*/ \
    __asm punpckldq mm2,mm5 \
    /*mm3=d7 d6 d5 d4 d3 d2 d1 d0*/ \
    __asm punpckhdq mm3,mm5 \
    OC_LOOP_FILTER8_MMX \
    /*mm0={b0+R_0'',...,b7+R_7''} (copy of the filtered b vector)*/ \
    __asm movq mm0,mm1 \
    /*mm1={b0+R_0'',c0-R_0'',...,b3+R_3'',c3-R_3''}*/ \
    __asm punpcklbw mm1,mm2 \
    /*mm0={b4+R_4'',c4-R_4'',...,b7+R_7'',c7-R_7''}*/ \
    __asm punpckhbw mm0,mm2 \
    /*D=c1 b1 c0 b0: write rows 0 and 1.*/ \
    __asm movd D,mm1 \
    __asm mov [PIX+1],D_WORD \
    __asm psrlq mm1,32 \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE+1],D_WORD \
    /*D=c3 b3 c2 b2: write rows 2 and 3.*/ \
    __asm movd D,mm1 \
    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE3+1],D_WORD \
    /*Advance PIX to row 4.*/ \
    __asm lea PIX,[PIX+YSTRIDE*4] \
    /*D=c5 b5 c4 b4: write rows 4 and 5.*/ \
    __asm movd D,mm0 \
    __asm mov [PIX+1],D_WORD \
    __asm psrlq mm0,32 \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE+1],D_WORD \
    /*D=c7 b7 c6 b6: write rows 6 and 7.*/ \
    __asm movd D,mm0 \
    __asm mov [PIX+YSTRIDE*2+1],D_WORD \
    __asm shr D,16 \
    __asm mov [PIX+YSTRIDE3+1],D_WORD \
  } \
  while(0)
|
||||
|
||||
# endif
|
||||
#endif
|
||||
176
engine/thirdparty/libtheora/x86_vc/mmxstate.c
vendored
Normal file
176
engine/thirdparty/libtheora/x86_vc/mmxstate.c
vendored
Normal file
|
|
@ -0,0 +1,176 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
/*MMX acceleration of complete fragment reconstruction algorithm.
|
||||
Originally written by Rudolf Marek.*/
|
||||
#include <string.h>
|
||||
#include "x86int.h"
|
||||
#include "mmxloop.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Reconstructs one fragment: inverse-transforms its coefficients and combines
   the residual with the prediction selected by the fragment's reference
   index.
  _state:      The decoder/encoder state holding the frame buffers.
  _fragi:      The index of the fragment to reconstruct.
  _pli:        The color plane the fragment belongs to.
  _dct_coeffs: The coefficients in entries 0...63; entries 64...127 receive the
                residual that is passed on to the reconstruction routines.
  _last_zzi:   See oc_idct8x8_mmx(); values below 2 take the DC-only path.
  _dc_quant:   The quantizer applied to the DC coefficient.*/
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
  unsigned char *dst;
  ptrdiff_t      frag_buf_off;
  int            ystride;
  int            refi;
  /*Apply the inverse transform.*/
  if(_last_zzi>=2){
    /*General case: dequantize the DC coefficient, then run the iDCT.*/
    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi);
  }
  else{
    /*Special case: only a DC component.
      p must be unsigned so that it is not sign-extended when the __asm block
       below moves it into a register.*/
    ogg_uint16_t p;
    /*This dequant product is rounded (and none of the others are) because no
       iDCT rounding takes place on this path.*/
    p=(ogg_int16_t)((_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15)>>5);
    /*Replicate p into the 128 bytes starting at _dct_coeffs+64.*/
    __asm{
#define Y eax
#define P ecx
      mov Y,_dct_coeffs
      movzx P,p
      /*Y=byte address of _dct_coeffs[64].*/
      lea Y,[Y+128]
      /*mm0=0000 0000 0000 AAAA*/
      movd mm0,P
      /*mm0=0000 0000 AAAA AAAA*/
      punpcklwd mm0,mm0
      /*mm0=AAAA AAAA AAAA AAAA*/
      punpckldq mm0,mm0
      movq [Y],mm0
      movq [8+Y],mm0
      movq [16+Y],mm0
      movq [24+Y],mm0
      movq [32+Y],mm0
      movq [40+Y],mm0
      movq [48+Y],mm0
      movq [56+Y],mm0
      movq [64+Y],mm0
      movq [72+Y],mm0
      movq [80+Y],mm0
      movq [88+Y],mm0
      movq [96+Y],mm0
      movq [104+Y],mm0
      movq [112+Y],mm0
      movq [120+Y],mm0
#undef Y
#undef P
    }
  }
  /*Fill in the target buffer.*/
  ystride=_state->ref_ystride[_pli];
  refi=_state->frags[_fragi].refi;
  frag_buf_off=_state->frag_buf_offs[_fragi];
  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
  if(refi!=OC_FRAME_SELF){
    const unsigned char *ref;
    int                  mvoffsets[2];
    ref=_state->ref_frame_data[refi]+frag_buf_off;
    /*A result above 1 means two reference offsets have to be combined.*/
    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
     _state->frag_mvs[_fragi])>1){
      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
       _dct_coeffs+64);
    }
    else{
      oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
    }
  }
  else{
    oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
  }
}
|
||||
|
||||
/*We copy this entire function to inline the actual MMX routines so that we
|
||||
use only a single indirect call.*/
|
||||
|
||||
/*Initializes the bounding-value storage read by the MMX loop filter.
  OC_LOOP_FILTER8_MMX loads 8 bytes from this buffer and subtracts them twice
   from 0xFF to form 255-2*L, so each of those bytes has to hold the plain
   limit L.
  Storing the pre-complemented value ~(2*L) instead would make the filter
   clamp against (4*L+1)&0xFF.
  _bv:     The bounding values array; only its first 8 bytes are written.
  _flimit: The loop filter limit L.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  memset(_bv,_flimit,8);
}
|
||||
|
||||
/*Apply the loop filter to a given set of fragment rows in the given plane.
|
||||
The filter may be run on the bottom edge, affecting pixels in the next row of
|
||||
fragments, so this row also needs to be available.
|
||||
_bv: The bounding values array.
|
||||
_refi: The index of the frame buffer to filter.
|
||||
_pli: The color plane to filter.
|
||||
_fragy0: The Y coordinate of the first fragment row to filter.
|
||||
_fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
|
||||
const oc_fragment_plane *fplane;
|
||||
const oc_fragment *frags;
|
||||
const ptrdiff_t *frag_buf_offs;
|
||||
unsigned char *ref_frame_data;
|
||||
ptrdiff_t fragi_top;
|
||||
ptrdiff_t fragi_bot;
|
||||
ptrdiff_t fragi0;
|
||||
ptrdiff_t fragi0_end;
|
||||
int ystride;
|
||||
int nhfrags;
|
||||
fplane=_state->fplanes+_pli;
|
||||
nhfrags=fplane->nhfrags;
|
||||
fragi_top=fplane->froffset;
|
||||
fragi_bot=fragi_top+fplane->nfrags;
|
||||
fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
|
||||
fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
|
||||
ystride=_state->ref_ystride[_pli];
|
||||
frags=_state->frags;
|
||||
frag_buf_offs=_state->frag_buf_offs;
|
||||
ref_frame_data=_state->ref_frame_data[_refi];
|
||||
/*The following loops are constructed somewhat non-intuitively on purpose.
|
||||
The main idea is: if a block boundary has at least one coded fragment on
|
||||
it, the filter is applied to it.
|
||||
However, the order that the filters are applied in matters, and VP3 chose
|
||||
the somewhat strange ordering used below.*/
|
||||
while(fragi0<fragi0_end){
|
||||
ptrdiff_t fragi;
|
||||
ptrdiff_t fragi_end;
|
||||
fragi=fragi0;
|
||||
fragi_end=fragi+nhfrags;
|
||||
while(fragi<fragi_end){
|
||||
if(frags[fragi].coded){
|
||||
unsigned char *ref;
|
||||
ref=ref_frame_data+frag_buf_offs[fragi];
|
||||
#define PIX eax
|
||||
#define YSTRIDE3 edi
|
||||
#define YSTRIDE ecx
|
||||
#define LL edx
|
||||
#define D esi
|
||||
#define D_WORD si
|
||||
if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv);
|
||||
if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv);
|
||||
if(fragi+1<fragi_end&&!frags[fragi+1].coded){
|
||||
OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv);
|
||||
}
|
||||
if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
|
||||
OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv);
|
||||
}
|
||||
#undef PIX
|
||||
#undef YSTRIDE3
|
||||
#undef YSTRIDE
|
||||
#undef LL
|
||||
#undef D
|
||||
#undef D_WORD
|
||||
}
|
||||
fragi++;
|
||||
}
|
||||
fragi0+=nhfrags;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
192
engine/thirdparty/libtheora/x86_vc/x86cpu.c
vendored
Normal file
192
engine/thirdparty/libtheora/x86_vc/x86cpu.c
vendored
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
CPU capability detection for x86 processors.
|
||||
Originally written by Rudolf Marek.
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "x86cpu.h"
|
||||
|
||||
#if !defined(OC_X86_ASM)
|
||||
/*Fallback compiled when OC_X86_ASM is not defined: no capability flags are
   reported.*/
ogg_uint32_t oc_cpu_flags_get(void){
  return 0;
}
|
||||
#else
|
||||
/*Why does MSVC need this complicated rigamarole?
|
||||
At this point I honestly do not care.*/
|
||||
|
||||
/*Visual C cpuid helper function.
|
||||
For VS2005 we could as well use the _cpuid builtin, but that wouldn't work
|
||||
for VS2003 users, so we do it in inline assembler.*/
|
||||
static void oc_cpuid_helper(ogg_uint32_t _cpu_info[4],ogg_uint32_t _op){
  /*Executes cpuid with eax=_op and stores the resulting eax, ebx, ecx and edx
     into _cpu_info[0], [1], [2] and [3], in that order.*/
  _asm{
    mov eax,[_op]
    mov esi,_cpu_info
    cpuid
    mov [esi+0],eax
    mov [esi+4],ebx
    mov [esi+8],ecx
    mov [esi+12],edx
  }
}
|
||||
|
||||
/*Runs cpuid function _op through oc_cpuid_helper() and assigns the four
   result registers to the lvalues _eax, _ebx, _ecx and _edx.*/
# define cpuid(_op,_eax,_ebx,_ecx,_edx) \
  do{ \
    ogg_uint32_t oc_cpuid_regs[4]; \
    oc_cpuid_helper(oc_cpuid_regs,_op); \
    (_eax)=oc_cpuid_regs[0]; \
    (_ebx)=oc_cpuid_regs[1]; \
    (_ecx)=oc_cpuid_regs[2]; \
    (_edx)=oc_cpuid_regs[3]; \
  }while(0)
|
||||
|
||||
/*Probes whether bit 0x200000 of EFLAGS can be toggled.
  _eax: Receives EFLAGS as read back after the attempt to flip that bit.
  _ebx: Receives the original EFLAGS value.
  The caller treats equal values as "cpuid is not available".
  The original EFLAGS are restored before returning.*/
static void oc_detect_cpuid_helper(ogg_uint32_t *_eax,ogg_uint32_t *_ebx){
  _asm{
    /*Keep one copy of EFLAGS on the stack for the final restore.*/
    pushfd
    /*eax=ebx=current EFLAGS.*/
    pushfd
    pop eax
    mov ebx,eax
    /*Try to write EFLAGS back with bit 0x200000 flipped.*/
    xor eax,200000h
    push eax
    popfd
    /*eax=EFLAGS as the processor actually accepted them.*/
    pushfd
    pop eax
    /*Restore the saved EFLAGS.*/
    popfd
    /*Hand both values back to the caller.*/
    mov ecx,_eax
    mov [ecx],eax
    mov ecx,_ebx
    mov [ecx],ebx
  }
}
|
||||
|
||||
/*Translates the feature bits of cpuid function 1 into OC_CPU_X86_* flags.
  _edx: The edx value returned by cpuid function 1.
  _ecx: The ecx value returned by cpuid function 1.
  Return: The OR of the detected OC_CPU_X86_* flags, or 0 if MMX is missing.*/
static ogg_uint32_t oc_parse_intel_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t flags;
  /*If there isn't even MMX (edx bit 23), give up.*/
  if(!(_edx&0x00800000))return 0;
  flags=OC_CPU_X86_MMX;
  /*edx bit 25: SSE, which also provides the extended MMX instructions.*/
  if(_edx&0x02000000)flags|=OC_CPU_X86_MMXEXT|OC_CPU_X86_SSE;
  /*edx bit 26: SSE2.*/
  if(_edx&0x04000000)flags|=OC_CPU_X86_SSE2;
  /*ecx bit 0: SSE3 (PNI).*/
  if(_ecx&0x00000001)flags|=OC_CPU_X86_PNI;
  /*ecx bit 9: SSSE3.
    Bit 8 (0x00000100) is Thermal Monitor 2, not SSSE3.*/
  if(_ecx&0x00000200)flags|=OC_CPU_X86_SSSE3;
  /*ecx bit 19: SSE4.1.*/
  if(_ecx&0x00080000)flags|=OC_CPU_X86_SSE4_1;
  /*ecx bit 20: SSE4.2.*/
  if(_ecx&0x00100000)flags|=OC_CPU_X86_SSE4_2;
  return flags;
}
|
||||
|
||||
/*Translates the feature bits of cpuid function 0x80000001 into OC_CPU_X86_*
   flags.
  _edx: The edx value returned by cpuid function 0x80000001.
  _ecx: The ecx value returned by cpuid function 0x80000001.
  Return: The OR of the detected OC_CPU_X86_* flags, or 0 if MMX is missing.*/
static ogg_uint32_t oc_parse_amd_flags(ogg_uint32_t _edx,ogg_uint32_t _ecx){
  ogg_uint32_t caps;
  /*Nothing else is reported unless MMX is present.*/
  if((_edx&0x00800000)==0)return 0;
  caps=OC_CPU_X86_MMX;
  if((_edx&0x00400000)!=0)caps|=OC_CPU_X86_MMXEXT;
  if((_edx&0x80000000)!=0)caps|=OC_CPU_X86_3DNOW;
  if((_edx&0x40000000)!=0)caps|=OC_CPU_X86_3DNOWEXT;
  if((_ecx&0x00000040)!=0)caps|=OC_CPU_X86_SSE4A;
  if((_ecx&0x00000800)!=0)caps|=OC_CPU_X86_SSE5;
  return caps;
}
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void){
|
||||
ogg_uint32_t flags;
|
||||
ogg_uint32_t eax;
|
||||
ogg_uint32_t ebx;
|
||||
ogg_uint32_t ecx;
|
||||
ogg_uint32_t edx;
|
||||
# if !defined(__amd64__)&&!defined(__x86_64__)
|
||||
/*Not all x86-32 chips support cpuid, so we have to check.*/
|
||||
oc_detect_cpuid_helper(&eax,&ebx);
|
||||
/*No cpuid.*/
|
||||
if(eax==ebx)return 0;
|
||||
# endif
|
||||
cpuid(0,eax,ebx,ecx,edx);
|
||||
/* l e t n I e n i u n e G*/
|
||||
if(ecx==0x6C65746E&&edx==0x49656E69&&ebx==0x756E6547||
|
||||
/* 6 8 x M T e n i u n e G*/
|
||||
ecx==0x3638784D&&edx==0x54656E69&&ebx==0x756E6547){
|
||||
int family;
|
||||
int model;
|
||||
/*Intel, Transmeta (tested with Crusoe TM5800):*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
family=(eax>>8)&0xF;
|
||||
model=(eax>>4)&0xF;
|
||||
/*The SSE unit on the Pentium M and Core Duo is much slower than the MMX
|
||||
unit, so don't use it.*/
|
||||
if(family==6&&(model==9||model==13||model==14)){
|
||||
flags&=~(OC_CPU_X86_SSE2|OC_CPU_X86_PNI);
|
||||
}
|
||||
}
|
||||
/* D M A c i t n e h t u A*/
|
||||
else if(ecx==0x444D4163&&edx==0x69746E65&&ebx==0x68747541||
|
||||
/* C S N y b e d o e G*/
|
||||
ecx==0x43534e20&&edx==0x79622065&&ebx==0x646f6547){
|
||||
/*AMD, Geode:*/
|
||||
cpuid(0x80000000,eax,ebx,ecx,edx);
|
||||
if(eax<0x80000001)flags=0;
|
||||
else{
|
||||
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_amd_flags(edx,ecx);
|
||||
}
|
||||
/*Also check for SSE.*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags|=oc_parse_intel_flags(edx,ecx);
|
||||
}
|
||||
/*Technically some VIA chips can be configured in the BIOS to return any
|
||||
string here the user wants.
|
||||
There is a special detection method that can be used to identify such
|
||||
processors, but in my opinion, if the user really wants to change it, they
|
||||
deserve what they get.*/
|
||||
/* s l u a H r u a t n e C*/
|
||||
else if(ecx==0x736C7561&&edx==0x48727561&&ebx==0x746E6543){
|
||||
/*VIA:*/
|
||||
/*I only have documentation for the C7 (Esther) and Isaiah (forthcoming)
|
||||
chips (thanks to the engineers from Centaur Technology who provided it).
|
||||
These chips support Intel-like cpuid info.
|
||||
The C3-2 (Nehemiah) cores appear to, as well.*/
|
||||
cpuid(1,eax,ebx,ecx,edx);
|
||||
flags=oc_parse_intel_flags(edx,ecx);
|
||||
if(eax>=0x80000001){
|
||||
/*The (non-Nehemiah) C3 processors support AMD-like cpuid info.
|
||||
We need to check this even if the Intel test succeeds to pick up 3DNow!
|
||||
support on these processors.
|
||||
Unlike actual AMD processors, we cannot _rely_ on this info, since
|
||||
some cores (e.g., the 693 stepping of the Nehemiah) claim to support
|
||||
this function, yet return edx=0, despite the Intel test indicating
|
||||
MMX support.
|
||||
Therefore the features detected here are strictly added to those
|
||||
detected by the Intel test.*/
|
||||
/*TODO: How about earlier chips?*/
|
||||
cpuid(0x80000001,eax,ebx,ecx,edx);
|
||||
/*Note: As of the C7, this function returns Intel-style extended feature
|
||||
flags, not AMD-style.
|
||||
Currently, this only defines bits 11, 20, and 29 (0x20100800), which
|
||||
do not conflict with any of the AMD flags we inspect.
|
||||
For the remaining bits, Intel tells us, "Do not count on their value",
|
||||
but VIA assures us that they will all be zero (at least on the C7 and
|
||||
Isaiah chips).
|
||||
In the (unlikely) event a future processor uses bits 18, 19, 30, or 31
|
||||
(0xC0C00000) for something else, we will have to add code to detect
|
||||
the model to decide when it is appropriate to inspect them.*/
|
||||
flags|=oc_parse_amd_flags(edx,ecx);
|
||||
}
|
||||
}
|
||||
else{
|
||||
/*Implement me.*/
|
||||
flags=0;
|
||||
}
|
||||
return flags;
|
||||
}
|
||||
#endif
|
||||
36
engine/thirdparty/libtheora/x86_vc/x86cpu.h
vendored
Normal file
36
engine/thirdparty/libtheora/x86_vc/x86cpu.h
vendored
Normal file
|
|
@ -0,0 +1,36 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_vc_x86cpu_H)
|
||||
# define _x86_vc_x86cpu_H (1)
|
||||
#include "../internal.h"
|
||||
|
||||
/*Capability bits that are OR'd together to form the value returned by
   oc_cpu_flags_get().
  Each flag occupies its own bit.*/
#define OC_CPU_X86_MMX      (1<<0)
#define OC_CPU_X86_3DNOW    (1<<1)
#define OC_CPU_X86_3DNOWEXT (1<<2)
#define OC_CPU_X86_MMXEXT   (1<<3)
#define OC_CPU_X86_SSE      (1<<4)
#define OC_CPU_X86_SSE2     (1<<5)
#define OC_CPU_X86_PNI      (1<<6)
#define OC_CPU_X86_SSSE3    (1<<7)
#define OC_CPU_X86_SSE4_1   (1<<8)
#define OC_CPU_X86_SSE4_2   (1<<9)
#define OC_CPU_X86_SSE4A    (1<<10)
#define OC_CPU_X86_SSE5     (1<<11)
|
||||
|
||||
ogg_uint32_t oc_cpu_flags_get(void);
|
||||
|
||||
#endif
|
||||
47
engine/thirdparty/libtheora/x86_vc/x86enc.c
vendored
Normal file
47
engine/thirdparty/libtheora/x86_vc/x86enc.c
vendored
Normal file
|
|
@ -0,0 +1,47 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86state.c 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
#include "x86enc.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*Fills in the encoder's table of optimized routines.
  The table is first initialized by oc_enc_accel_init_c(); entries are then
   overridden according to the capability flags stored in the encoder state.
  _enc: The encoder context to set up.*/
void oc_enc_accel_init_x86(oc_enc_ctx *_enc){
  ogg_uint32_t cpu_flags;
  cpu_flags=_enc->state.cpu_flags;
  oc_enc_accel_init_c(_enc);
  /*Routines that only need baseline MMX.*/
  if((cpu_flags&OC_CPU_X86_MMX)!=0){
    _enc->opt_vtable.frag_sub=oc_enc_frag_sub_mmx;
    _enc->opt_vtable.frag_sub_128=oc_enc_frag_sub_128_mmx;
    _enc->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
    _enc->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
  }
  /*Routines that need the extended MMX instructions.*/
  if((cpu_flags&OC_CPU_X86_MMXEXT)!=0){
    _enc->opt_vtable.frag_sad=oc_enc_frag_sad_mmxext;
    _enc->opt_vtable.frag_sad_thresh=oc_enc_frag_sad_thresh_mmxext;
    _enc->opt_vtable.frag_sad2_thresh=oc_enc_frag_sad2_thresh_mmxext;
    _enc->opt_vtable.frag_satd=oc_enc_frag_satd_mmxext;
    _enc->opt_vtable.frag_satd2=oc_enc_frag_satd2_mmxext;
    _enc->opt_vtable.frag_intra_satd=oc_enc_frag_intra_satd_mmxext;
    _enc->opt_vtable.frag_copy2=oc_enc_frag_copy2_mmxext;
    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_mmxext;
  }
  /*The SSE2 forward DCT is only installed when OC_X86_64_ASM is defined.*/
# if defined(OC_X86_64_ASM)
  if((cpu_flags&OC_CPU_X86_SSE2)!=0){
    _enc->opt_vtable.fdct8x8=oc_enc_fdct8x8_x86_64sse2;
  }
# endif
}
|
||||
#endif
|
||||
51
engine/thirdparty/libtheora/x86_vc/x86enc.h
vendored
Normal file
51
engine/thirdparty/libtheora/x86_vc/x86enc.h
vendored
Normal file
|
|
@ -0,0 +1,51 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: x86int.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_vc_x86enc_H)
|
||||
# define _x86_vc_x86enc_H (1)
|
||||
# include "x86int.h"
|
||||
/*When x86 assembly is enabled, map the generic name oc_enc_accel_init onto
   the x86 implementation and define OC_ENC_USE_VTABLE.
  Both macros are set before "../encint.h" is included below, presumably so
   that header can see them (encint.h is not visible here; confirm).*/
# if defined(OC_X86_ASM)
|
||||
# define oc_enc_accel_init oc_enc_accel_init_x86
|
||||
# define OC_ENC_USE_VTABLE (1)
|
||||
# endif
|
||||
# include "../encint.h"
|
||||
|
||||
/*x86 counterpart of oc_enc_accel_init (see the alias macro above).*/
void oc_enc_accel_init_x86(oc_enc_ctx *_enc);
|
||||
|
||||
/*Prototypes only: the routines below are defined elsewhere.
  The name suffix (_mmx, _mmxext, _x86_64sse2) presumably identifies the
   instruction set each routine requires.*/
unsigned oc_enc_frag_sad_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_sad_thresh_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride,unsigned _thresh);
|
||||
unsigned oc_enc_frag_sad2_thresh_mmxext(const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride,
|
||||
unsigned _thresh);
|
||||
unsigned oc_enc_frag_satd_mmxext(unsigned *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref,int _ystride);
|
||||
unsigned oc_enc_frag_satd2_mmxext(unsigned *_dc,const unsigned char *_src,
|
||||
const unsigned char *_ref1,const unsigned char *_ref2,int _ystride);
|
||||
unsigned oc_enc_frag_intra_satd_mmxext(unsigned *_dc,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_enc_frag_sub_mmx(ogg_int16_t _diff[64],
|
||||
const unsigned char *_x,const unsigned char *_y,int _stride);
|
||||
void oc_enc_frag_sub_128_mmx(ogg_int16_t _diff[64],
|
||||
const unsigned char *_x,int _stride);
|
||||
void oc_enc_frag_copy2_mmxext(unsigned char *_dst,
|
||||
const unsigned char *_src1,const unsigned char *_src2,int _ystride);
|
||||
void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
void oc_enc_fdct8x8_x86_64sse2(ogg_int16_t _y[64],const ogg_int16_t _x[64]);
|
||||
|
||||
#endif
|
||||
49
engine/thirdparty/libtheora/x86_vc/x86int.h
vendored
Normal file
49
engine/thirdparty/libtheora/x86_vc/x86int.h
vendored
Normal file
|
|
@ -0,0 +1,49 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_vc_x86int_H)
|
||||
# define _x86_vc_x86int_H (1)
|
||||
# include "../internal.h"
|
||||
/*When x86 assembly is enabled, map the generic name oc_state_accel_init onto
   the x86 implementation and define OC_STATE_USE_VTABLE.
  Both macros are set before "../state.h" is included below, presumably so
   that header can see them (state.h is not visible here; confirm).*/
# if defined(OC_X86_ASM)
|
||||
# define oc_state_accel_init oc_state_accel_init_x86
|
||||
# define OC_STATE_USE_VTABLE (1)
|
||||
# endif
|
||||
# include "../state.h"
|
||||
# include "x86cpu.h"
|
||||
|
||||
/*Defined in x86state.c: queries the CPU flags and, if MMX is reported, points
   the _state->opt_vtable entries at the _mmx routines declared below;
   otherwise it calls oc_state_accel_init_c().*/
void oc_state_accel_init_x86(oc_theora_state *_state);
|
||||
|
||||
/*Prototypes only: the MMX routines below are defined elsewhere.*/
void oc_frag_copy_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride);
|
||||
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
|
||||
const unsigned char *_src_frame,int _ystride,
|
||||
const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs);
|
||||
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
|
||||
const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter_mmx(unsigned char *_dst,
|
||||
const unsigned char *_src,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
|
||||
const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue);
|
||||
void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi);
|
||||
void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
|
||||
int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant);
|
||||
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit);
|
||||
void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
|
||||
signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end);
|
||||
void oc_restore_fpu_mmx(void);
|
||||
|
||||
#endif
|
||||
61
engine/thirdparty/libtheora/x86_vc/x86state.c
vendored
Normal file
61
engine/thirdparty/libtheora/x86_vc/x86state.c
vendored
Normal file
|
|
@ -0,0 +1,61 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id$
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#include "x86int.h"
|
||||
|
||||
#if defined(OC_X86_ASM)
|
||||
|
||||
/*This table has been modified from OC_FZIG_ZAG by baking a 4x4 transpose into
   each quadrant of the destination.
  It has 128 entries: the first 64 are the modified zig-zag mapping, and
   entries 64 through 127 all hold the value 64.
  NOTE(review): the padding presumably redirects out-of-range zig-zag indices
   to a spare slot just past the 64 real coefficients; confirm against the
   code that reads opt_data.dct_fzig_zag.
  oc_state_accel_init_x86() below installs this table when MMX is available.*/
|
||||
static const unsigned char OC_FZIG_ZAG_MMX[128]={
|
||||
0, 8, 1, 2, 9,16,24,17,
|
||||
10, 3,32,11,18,25, 4,12,
|
||||
5,26,19,40,33,34,41,48,
|
||||
27, 6,13,20,28,21,14, 7,
|
||||
56,49,42,35,43,50,57,36,
|
||||
15,22,29,30,23,44,37,58,
|
||||
51,59,38,45,52,31,60,53,
|
||||
46,39,47,54,61,62,55,63,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
64,64,64,64,64,64,64,64,
|
||||
};
|
||||
|
||||
/*Chooses the low-level routines used by the shared codec state on x86.
  The CPU feature flags are always queried and cached in _state->cpu_flags.
  If MMX is not reported, setup is delegated to oc_state_accel_init_c();
   otherwise every vtable entry assigned below is pointed at its MMX routine
   and the MMX zig-zag table is installed in _state->opt_data.
  _state: The codec state to initialize.*/
void oc_state_accel_init_x86(oc_theora_state *_state){
  _state->cpu_flags=oc_cpu_flags_get();
  /*No MMX: hand the whole job to the C initializer and stop here.*/
  if(!(_state->cpu_flags&OC_CPU_X86_MMX)){
    oc_state_accel_init_c(_state);
    return;
  }
  /*Fragment copy and reconstruction.*/
  _state->opt_vtable.frag_copy=oc_frag_copy_mmx;
  _state->opt_vtable.frag_copy_list=oc_frag_copy_list_mmx;
  _state->opt_vtable.frag_recon_intra=oc_frag_recon_intra_mmx;
  _state->opt_vtable.frag_recon_inter=oc_frag_recon_inter_mmx;
  _state->opt_vtable.frag_recon_inter2=oc_frag_recon_inter2_mmx;
  /*Inverse transform and state-level reconstruction.*/
  _state->opt_vtable.idct8x8=oc_idct8x8_mmx;
  _state->opt_vtable.state_frag_recon=oc_state_frag_recon_mmx;
  /*Loop filter.*/
  _state->opt_vtable.loop_filter_init=oc_loop_filter_init_mmx;
  _state->opt_vtable.state_loop_filter_frag_rows=
   oc_state_loop_filter_frag_rows_mmx;
  _state->opt_vtable.restore_fpu=oc_restore_fpu_mmx;
  /*The MMX routines use the zig-zag table with the transpose baked in.*/
  _state->opt_data.dct_fzig_zag=OC_FZIG_ZAG_MMX;
}
|
||||
#endif
|
||||
244
engine/thirdparty/libtheora/x86_vc/x86zigzag.h
vendored
Normal file
244
engine/thirdparty/libtheora/x86_vc/x86zigzag.h
vendored
Normal file
|
|
@ -0,0 +1,244 @@
|
|||
/********************************************************************
|
||||
* *
|
||||
* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
|
||||
* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
|
||||
* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
|
||||
* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
|
||||
* *
|
||||
* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
|
||||
* by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
|
||||
* *
|
||||
********************************************************************
|
||||
|
||||
function:
|
||||
last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $
|
||||
|
||||
********************************************************************/
|
||||
|
||||
#if !defined(_x86_vc_x86zigzag_H)
|
||||
# define _x86_vc_x86zigzag_H (1)
|
||||
# include "x86enc.h"
|
||||
|
||||
|
||||
/*Converts DCT coefficients from transposed order into zig-zag scan order and
|
||||
stores them in Y.
|
||||
This relies on two macros to load the contents of each row:
|
||||
OC_ZZ_LOAD_ROW_LO(row,reg) and OC_ZZ_LOAD_ROW_HI(row,reg), which load the
|
||||
first four and second four entries of each row into the specified register,
|
||||
respectively.
|
||||
OC_ZZ_LOAD_ROW_LO must be called before OC_ZZ_LOAD_ROW_HI for the same row
|
||||
(because when the rows are already in SSE2 registers, loading the high half
|
||||
destructively modifies the register).
|
||||
The index of each output element in the original 64-element array should wind
|
||||
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||
each 4-tuple below):
|
||||
A 0 8 1 2 9 16 24 17 B
|
||||
C 10 3 4 11 18 25 32 40 E
|
||||
F 33 26 19 12 5 6 13 20 D
|
||||
G 27 34 41 48 56 49 42 35 I
|
||||
L 28 21 14 7 15 22 29 36 M
|
||||
H 43 50 57 58 51 44 37 30 O
|
||||
N 23 31 38 45 52 59 60 53 J
|
||||
P 46 39 47 54 61 62 55 63 K
|
||||
The order of the coefficients within each tuple is reversed in the comments
|
||||
below to reflect the usual MSB to LSB notation.*/
|
||||
/*Usage notes (derived from the expansion below): this macro expands to a
   sequence of MSVC-style __asm statements.
  The expansion site must supply OC_ZZ_LOAD_ROW_LO and OC_ZZ_LOAD_ROW_HI, and a
   symbol Y that is usable as a base in [Y+offset] memory operands.
  It performs sixteen 8-byte stores covering Y+0x00 through Y+0x78 (128 bytes)
   and overwrites all of mm0-mm7.*/
#define OC_TRANSPOSE_ZIG_ZAG_MMXEXT \
|
||||
OC_ZZ_LOAD_ROW_LO(0,mm0) /*mm0=03 02 01 00*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(1,mm1) /*mm1=11 10 09 08*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(2,mm2) /*mm2=19 18 17 16*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(3,mm3) /*mm3=27 26 25 24*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(0,mm4) /*mm4=07 06 05 04*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(1,mm5) /*mm5=15 14 13 12*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(2,mm6) /*mm6=23 22 21 20*/ \
|
||||
__asm movq mm7,mm0 /*mm7=03 02 01 00*/ \
|
||||
__asm punpckhdq mm0,mm1 /*mm0=11 10 03 02*/ \
|
||||
__asm pshufw mm4,mm4,0x39 /*mm4=04 07 06 05*/ \
|
||||
__asm punpcklwd mm1,mm0 /*mm1=03 09 02 08*/ \
|
||||
__asm pshufw mm5,mm5,0x39 /*mm5=12 15 14 13*/ \
|
||||
__asm punpcklwd mm7,mm1 /*mm7=02 01 08 00 *A*/ \
|
||||
__asm movq [Y+0x00],mm7 \
|
||||
__asm punpckhwd mm1,mm4 /*mm1=04 03 07 09*/ \
|
||||
__asm movq mm7,mm2 /*mm7=19 18 17 16*/ \
|
||||
__asm punpckhdq mm0,mm1 /*mm0=04 03 11 10*/ \
|
||||
__asm punpckhwd mm7,mm5 /*mm7=12 19 15 18*/ \
|
||||
__asm punpcklwd mm1,mm3 /*mm1=25 07 24 09*/ \
|
||||
__asm punpcklwd mm5,mm6 /*mm5=21 14 20 13*/ \
|
||||
__asm punpcklwd mm1,mm2 /*mm1=17 24 16 09 *B*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(4,mm2) /*mm2=35 34 33 32*/ \
|
||||
__asm movq [Y+0x08],mm1 \
|
||||
OC_ZZ_LOAD_ROW_LO(5,mm1) /*mm1=43 42 41 40*/ \
|
||||
__asm pshufw mm0,mm0,0x78 /*mm0=11 04 03 10 *C*/ \
|
||||
__asm movq [Y+0x10],mm0 \
|
||||
__asm punpckhdq mm6,mm4 /*mm6=?? 07 23 22*/ \
|
||||
__asm punpckldq mm4,mm5 /*mm4=20 13 06 05 *D*/ \
|
||||
__asm movq [Y+0x28],mm4 \
|
||||
__asm psrlq mm3,16 /*mm3=.. 27 26 25*/ \
|
||||
__asm pshufw mm0,mm2,0x0E /*mm0=?? ?? 35 34*/ \
|
||||
__asm movq mm4,mm7 /*mm4=12 19 15 18*/ \
|
||||
__asm punpcklwd mm2,mm3 /*mm2=26 33 25 32*/ \
|
||||
__asm punpcklwd mm4,mm1 /*mm4=41 15 40 18*/ \
|
||||
__asm punpckhwd mm3,mm1 /*mm3=43 .. 42 27*/ \
|
||||
__asm punpckldq mm4,mm2 /*mm4=25 32 40 18*/ \
|
||||
__asm punpcklwd mm3,mm0 /*mm3=35 42 34 27*/ \
|
||||
OC_ZZ_LOAD_ROW_LO(6,mm0) /*mm0=51 50 49 48*/ \
|
||||
__asm pshufw mm4,mm4,0x6C /*mm4=40 32 25 18 *E*/ \
|
||||
__asm movq [Y+0x18],mm4 \
|
||||
OC_ZZ_LOAD_ROW_LO(7,mm4) /*mm4=59 58 57 56*/ \
|
||||
__asm punpckhdq mm2,mm7 /*mm2=12 19 26 33 *F*/ \
|
||||
__asm movq [Y+0x20],mm2 \
|
||||
__asm pshufw mm1,mm1,0xD0 /*mm1=43 41 ?? ??*/ \
|
||||
__asm pshufw mm0,mm0,0x87 /*mm0=50 48 49 51*/ \
|
||||
__asm movq mm2,mm3 /*mm2=35 42 34 27*/ \
|
||||
__asm punpckhwd mm1,mm0 /*mm1=50 43 48 41*/ \
|
||||
__asm pshufw mm4,mm4,0x93 /*mm4=58 57 56 59*/ \
|
||||
__asm punpckldq mm3,mm1 /*mm3=48 41 34 27 *G*/ \
|
||||
__asm movq [Y+0x30],mm3 \
|
||||
__asm punpckhdq mm1,mm4 /*mm1=58 57 50 43 *H*/ \
|
||||
__asm movq [Y+0x50],mm1 \
|
||||
OC_ZZ_LOAD_ROW_HI(7,mm1) /*mm1=63 62 61 60*/ \
|
||||
__asm punpcklwd mm4,mm0 /*mm4=49 56 51 59*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(6,mm0) /*mm0=55 54 53 52*/ \
|
||||
__asm psllq mm6,16 /*mm6=07 23 22 ..*/ \
|
||||
__asm movq mm3,mm4 /*mm3=49 56 51 59*/ \
|
||||
__asm punpckhdq mm4,mm2 /*mm4=35 42 49 56 *I*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(3,mm2) /*mm2=31 30 29 28*/ \
|
||||
__asm movq [Y+0x38],mm4 \
|
||||
__asm punpcklwd mm3,mm1 /*mm3=61 51 60 59*/ \
|
||||
__asm punpcklwd mm7,mm6 /*mm7=22 15 .. ??*/ \
|
||||
__asm movq mm4,mm3 /*mm4=61 51 60 59*/ \
|
||||
__asm punpcklwd mm3,mm0 /*mm3=53 60 52 59*/ \
|
||||
__asm punpckhwd mm4,mm0 /*mm4=55 61 54 51*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(4,mm0) /*mm0=39 38 37 36*/ \
|
||||
__asm pshufw mm3,mm3,0xE1 /*mm3=53 60 59 52 *J*/ \
|
||||
__asm movq [Y+0x68],mm3 \
|
||||
__asm movq mm3,mm4 /*mm3=?? ?? 54 51*/ \
|
||||
__asm pshufw mm2,mm2,0x39 /*mm2=28 31 30 29*/ \
|
||||
__asm punpckhwd mm4,mm1 /*mm4=63 55 62 61 *K*/ \
|
||||
OC_ZZ_LOAD_ROW_HI(5,mm1) /*mm1=47 46 45 44*/ \
|
||||
__asm movq [Y+0x78],mm4 \
|
||||
__asm punpckhwd mm6,mm2 /*mm6=28 07 31 23*/ \
|
||||
__asm punpcklwd mm2,mm0 /*mm2=37 30 36 29*/ \
|
||||
__asm punpckhdq mm5,mm6 /*mm5=28 07 21 14*/ \
|
||||
__asm pshufw mm2,mm2,0x4B /*mm2=36 29 30 37*/ \
|
||||
__asm pshufw mm5,mm5,0x87 /*mm5=07 14 21 28 *L*/ \
|
||||
__asm movq [Y+0x40],mm5 \
|
||||
__asm punpckhdq mm7,mm2 /*mm7=36 29 22 15 *M*/ \
|
||||
__asm movq [Y+0x48],mm7 \
|
||||
__asm pshufw mm1,mm1,0x9C /*mm1=46 45 47 44*/ \
|
||||
__asm punpckhwd mm0,mm1 /*mm0=46 39 45 38*/ \
|
||||
__asm punpcklwd mm3,mm1 /*mm3=47 54 44 51*/ \
|
||||
__asm punpckldq mm6,mm0 /*mm6=45 38 31 23 *N*/ \
|
||||
__asm movq [Y+0x60],mm6 \
|
||||
__asm punpckhdq mm0,mm3 /*mm0=47 54 46 39*/ \
|
||||
__asm punpckldq mm3,mm2 /*mm3=30 37 44 51 *O*/ \
|
||||
__asm movq [Y+0x58],mm3 \
|
||||
__asm pshufw mm0,mm0,0xB1 /*mm0=54 47 39 46 *P*/ \
|
||||
__asm movq [Y+0x70],mm0 \
|
||||
|
||||
/*Converts DCT coefficients in %[dct] from natural order into zig-zag scan
|
||||
order and stores them in %[qdct].
|
||||
The index of each output element in the original 64-element array should wind
|
||||
up in the following 8x8 matrix (the letters indicate the order we compute
|
||||
each 4-tuple below):
|
||||
A 0 1 8 16 9 2 3 10 B
|
||||
D 17 24 32 25 18 11 4 5 C
|
||||
E 12 19 26 33 40 48 41 34 I
|
||||
H 27 20 13 6 7 14 21 28 G
|
||||
K 35 42 49 56 57 50 43 36 J
|
||||
F 29 22 15 23 30 37 44 51 M
|
||||
P 58 59 52 45 38 31 39 46 L
|
||||
N 53 60 61 54 47 55 62 63 O
|
||||
The order of the coefficients within each tuple is reversed in the comments
|
||||
below to reflect the usual MSB to LSB notation.*/
|
||||
/*Usage notes (derived from the expansion below): unlike
   OC_TRANSPOSE_ZIG_ZAG_MMXEXT above, this macro expands to a sequence of
   string literals in GCC extended-asm (AT&T operand order) syntax, meant to be
   pasted into an asm template that defines the named operands %[dct] and
   %[qdct].
  It loads sixteen quadwords at offsets 0x00-0x78 from %[dct], stores sixteen
   quadwords at the same offsets from %[qdct], and overwrites all of mm0-mm7.
  NOTE(review): no user of this macro is visible in this header; confirm
   whether the MSVC build uses it at all.*/
#define OC_ZIG_ZAG_MMXEXT \
|
||||
"movq 0x00(%[dct]),%%mm0\n\t" /*mm0=03 02 01 00*/ \
|
||||
"movq 0x08(%[dct]),%%mm1\n\t" /*mm1=07 06 05 04*/ \
|
||||
"movq 0x10(%[dct]),%%mm2\n\t" /*mm2=11 10 09 08*/ \
|
||||
"movq 0x20(%[dct]),%%mm3\n\t" /*mm3=19 18 17 16*/ \
|
||||
"movq 0x30(%[dct]),%%mm4\n\t" /*mm4=27 26 25 24*/ \
|
||||
"movq 0x40(%[dct]),%%mm5\n\t" /*mm5=35 34 33 32*/ \
|
||||
"movq %%mm2,%%mm7\n\t" /*mm7=11 10 09 08*/ \
|
||||
"punpcklwd %%mm3,%%mm2\n\t" /*mm2=17 09 16 08*/ \
|
||||
"movq %%mm0,%%mm6\n\t" /*mm6=03 02 01 00*/ \
|
||||
"punpckldq %%mm2,%%mm0\n\t" /*mm0=16 08 01 00 *A*/ \
|
||||
"movq %%mm0,0x00(%[qdct])\n\t" \
|
||||
"movq 0x18(%[dct]),%%mm0\n\t" /*mm0=15 14 13 12*/ \
|
||||
"punpckhdq %%mm6,%%mm6\n\t" /*mm6=03 02 03 02*/ \
|
||||
"psrlq $16,%%mm7\n\t" /*mm7=.. 11 10 09*/ \
|
||||
"punpckldq %%mm7,%%mm6\n\t" /*mm6=10 09 03 02*/ \
|
||||
"punpckhwd %%mm7,%%mm3\n\t" /*mm3=.. 19 11 18*/ \
|
||||
"pshufw $0xD2,%%mm6,%%mm6\n\t" /*mm6=10 03 02 09 *B*/ \
|
||||
"movq %%mm6,0x08(%[qdct])\n\t" \
|
||||
"psrlq $48,%%mm2\n\t" /*mm2=.. .. .. 17*/ \
|
||||
"movq %%mm1,%%mm6\n\t" /*mm6=07 06 05 04*/ \
|
||||
"punpcklwd %%mm5,%%mm2\n\t" /*mm2=33 .. 32 17*/ \
|
||||
"movq %%mm3,%%mm7\n\t" /*mm7=.. 19 11 18*/ \
|
||||
"punpckldq %%mm1,%%mm3\n\t" /*mm3=05 04 11 18 *C*/ \
|
||||
"por %%mm2,%%mm7\n\t" /*mm7=33 19 ?? ??*/ \
|
||||
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=25 32 24 17 *D*/ \
|
||||
"movq %%mm2,0x10(%[qdct])\n\t" \
|
||||
"movq %%mm3,0x18(%[qdct])\n\t" \
|
||||
"movq 0x28(%[dct]),%%mm2\n\t" /*mm2=23 22 21 20*/ \
|
||||
"movq 0x38(%[dct]),%%mm1\n\t" /*mm1=31 30 29 28*/ \
|
||||
"pshufw $0x9C,%%mm0,%%mm3\n\t" /*mm3=14 13 15 12*/ \
|
||||
"punpckhdq %%mm7,%%mm7\n\t" /*mm7=33 19 33 19*/ \
|
||||
"punpckhwd %%mm3,%%mm6\n\t" /*mm6=14 07 13 06*/ \
|
||||
"punpckldq %%mm0,%%mm0\n\t" /*mm0=13 12 13 12*/ \
|
||||
"punpcklwd %%mm1,%%mm3\n\t" /*mm3=29 15 28 12*/ \
|
||||
"punpckhwd %%mm4,%%mm0\n\t" /*mm0=27 13 26 12*/ \
|
||||
"pshufw $0xB4,%%mm3,%%mm3\n\t" /*mm3=15 29 28 12*/ \
|
||||
"psrlq $48,%%mm4\n\t" /*mm4=.. .. .. 27*/ \
|
||||
"punpcklwd %%mm7,%%mm0\n\t" /*mm0=33 26 19 12 *E*/ \
|
||||
"punpcklwd %%mm1,%%mm4\n\t" /*mm4=29 .. 28 27*/ \
|
||||
"punpckhwd %%mm2,%%mm3\n\t" /*mm3=23 15 22 29 *F*/ \
|
||||
"movq %%mm0,0x20(%[qdct])\n\t" \
|
||||
"movq %%mm3,0x50(%[qdct])\n\t" \
|
||||
"movq 0x60(%[dct]),%%mm3\n\t" /*mm3=51 50 49 48*/ \
|
||||
"movq 0x70(%[dct]),%%mm7\n\t" /*mm7=59 58 57 56*/ \
|
||||
"movq 0x50(%[dct]),%%mm0\n\t" /*mm0=43 42 41 40*/ \
|
||||
"punpcklwd %%mm4,%%mm2\n\t" /*mm2=28 21 27 20*/ \
|
||||
"psrlq $32,%%mm5\n\t" /*mm5=.. .. 35 34*/ \
|
||||
"movq %%mm2,%%mm4\n\t" /*mm4=28 21 27 20*/ \
|
||||
"punpckldq %%mm6,%%mm2\n\t" /*mm2=13 06 27 20*/ \
|
||||
"punpckhdq %%mm4,%%mm6\n\t" /*mm6=28 21 14 07 *G*/ \
|
||||
"movq %%mm3,%%mm4\n\t" /*mm4=51 50 49 48*/ \
|
||||
"pshufw $0xB1,%%mm2,%%mm2\n\t" /*mm2=06 13 20 27 *H*/ \
|
||||
"movq %%mm2,0x30(%[qdct])\n\t" \
|
||||
"movq %%mm6,0x38(%[qdct])\n\t" \
|
||||
"movq 0x48(%[dct]),%%mm2\n\t" /*mm2=39 38 37 36*/ \
|
||||
"punpcklwd %%mm5,%%mm4\n\t" /*mm4=35 49 34 48*/ \
|
||||
"movq 0x58(%[dct]),%%mm5\n\t" /*mm5=47 46 45 44*/ \
|
||||
"punpckldq %%mm7,%%mm6\n\t" /*mm6=57 56 14 07*/ \
|
||||
"psrlq $32,%%mm3\n\t" /*mm3=.. .. 51 50*/ \
|
||||
"punpckhwd %%mm0,%%mm6\n\t" /*mm6=43 57 42 56*/ \
|
||||
"punpcklwd %%mm4,%%mm0\n\t" /*mm0=34 41 48 40 *I*/ \
|
||||
"pshufw $0x4E,%%mm6,%%mm6\n\t" /*mm6=42 56 43 57*/ \
|
||||
"movq %%mm0,0x28(%[qdct])\n\t" \
|
||||
"punpcklwd %%mm2,%%mm3\n\t" /*mm3=37 51 36 50*/ \
|
||||
"punpckhwd %%mm6,%%mm4\n\t" /*mm4=42 35 56 49*/ \
|
||||
"punpcklwd %%mm3,%%mm6\n\t" /*mm6=36 43 50 57 *J*/ \
|
||||
"pshufw $0x4E,%%mm4,%%mm4\n\t" /*mm4=56 49 42 35 *K*/ \
|
||||
"movq %%mm4,0x40(%[qdct])\n\t" \
|
||||
"movq %%mm6,0x48(%[qdct])\n\t" \
|
||||
"movq 0x68(%[dct]),%%mm6\n\t" /*mm6=55 54 53 52*/ \
|
||||
"movq 0x78(%[dct]),%%mm0\n\t" /*mm0=63 62 61 60*/ \
|
||||
"psrlq $32,%%mm1\n\t" /*mm1=.. .. 31 30*/ \
|
||||
"pshufw $0xD8,%%mm5,%%mm5\n\t" /*mm5=47 45 46 44*/ \
|
||||
"pshufw $0x0B,%%mm3,%%mm3\n\t" /*mm3=50 50 51 37*/ \
|
||||
"punpcklwd %%mm5,%%mm1\n\t" /*mm1=46 31 44 30*/ \
|
||||
"pshufw $0xC9,%%mm6,%%mm6\n\t" /*mm6=55 52 54 53*/ \
|
||||
"punpckhwd %%mm1,%%mm2\n\t" /*mm2=46 39 31 38 *L*/ \
|
||||
"punpcklwd %%mm3,%%mm1\n\t" /*mm1=51 44 37 30 *M*/ \
|
||||
"movq %%mm2,0x68(%[qdct])\n\t" \
|
||||
"movq %%mm1,0x58(%[qdct])\n\t" \
|
||||
"punpckhwd %%mm6,%%mm5\n\t" /*mm5=55 47 52 45*/ \
|
||||
"punpckldq %%mm0,%%mm6\n\t" /*mm6=61 60 54 53*/ \
|
||||
"pshufw $0x10,%%mm5,%%mm4\n\t" /*mm4=45 52 45 45*/ \
|
||||
"pshufw $0x78,%%mm6,%%mm6\n\t" /*mm6=54 61 60 53 *N*/ \
|
||||
"punpckhdq %%mm0,%%mm5\n\t" /*mm5=63 62 55 47 *O*/ \
|
||||
"punpckhdq %%mm4,%%mm7\n\t" /*mm7=45 52 59 58 *P*/ \
|
||||
"movq %%mm6,0x70(%[qdct])\n\t" \
|
||||
"movq %%mm5,0x78(%[qdct])\n\t" \
|
||||
"movq %%mm7,0x60(%[qdct])\n\t" \
|
||||
|
||||
#endif
|
||||
Loading…
Add table
Add a link
Reference in a new issue