libbpg-0.9.6

2015-10-27 11:46:00 +01:00 · 2015-10-27 11:46:00 +01:00 · 35a8402710
commit 35a8402710
parent 3035b41edf
248 changed files with 232891 additions and 100 deletions
--- a/x265/source/encoder/CMakeLists.txt
+++ b/x265/source/encoder/CMakeLists.txt
@ -0,0 +1,48 @@
+# vim: syntax=cmake
+
+# Compiler-specific warning suppression for the sources in this directory.
+if(GCC)
+    # Applies to every source compiled from this CMakeLists.txt.
+    add_definitions(-Wno-uninitialized)
+    if(CC_HAS_NO_STRICT_OVERFLOW)
+        # GCC 4.9.2 gives warnings we know we can ignore in this file
+        # (the flag is attached to slicetype.cpp only, not the whole target)
+        set_source_files_properties(slicetype.cpp PROPERTIES COMPILE_FLAGS -Wno-strict-overflow)
+    endif(CC_HAS_NO_STRICT_OVERFLOW)
+endif()
+if(MSVC)
+   add_definitions(/wd4701) # potentially uninitialized local variable 'foo' used
+endif()
+
+# Forward each enabled multilib / shared-library switch to api.cpp (and only
+# api.cpp) as a -D<NAME>=1 definition, in the order listed here.
+foreach(API_SWITCH LINKED_8BIT LINKED_10BIT LINKED_12BIT ENABLE_SHARED)
+  if(${API_SWITCH})
+    list(APPEND APIFLAGS "-D${API_SWITCH}=1")
+  endif()
+endforeach()
+
+# COMPILE_FLAGS expects one space-separated string rather than a CMake list.
+string(REPLACE ";" " " APIFLAGSTR "${APIFLAGS}")
+set_source_files_properties(api.cpp PROPERTIES COMPILE_FLAGS "${APIFLAGSTR}")
+
+# The encoder sources are built as an OBJECT library named "encoder": the
+# files are compiled but not archived or linked here. Headers are listed next
+# to their sources; rdcost.h, api.cpp and weightPrediction.cpp have no
+# matching .cpp/.h partner in this list.
+add_library(encoder OBJECT ../x265.h
+    analysis.cpp analysis.h
+    search.cpp search.h
+    bitcost.cpp bitcost.h rdcost.h
+    motion.cpp motion.h
+    slicetype.cpp slicetype.h
+    frameencoder.cpp frameencoder.h
+    framefilter.cpp framefilter.h
+    level.cpp level.h
+    nal.cpp nal.h
+    sei.cpp sei.h
+    sao.cpp sao.h
+    entropy.cpp entropy.h
+    dpb.cpp dpb.h
+    ratecontrol.cpp ratecontrol.h
+    reference.cpp reference.h
+    encoder.cpp encoder.h
+    api.cpp
+    weightPrediction.cpp)
--- a/x265/source/encoder/analysis.cpp
+++ b/x265/source/encoder/analysis.cpp
--- a/x265/source/encoder/analysis.h
+++ b/x265/source/encoder/analysis.h
@ -0,0 +1,171 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Deepthi Nandakumar <deepthi@multicorewareinc.com>
+*          Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_ANALYSIS_H
+#define X265_ANALYSIS_H
+
+#include "common.h"
+#include "predict.h"
+#include "quant.h"
+#include "yuv.h"
+#include "shortyuv.h"
+#include "cudata.h"
+
+#include "entropy.h"
+#include "search.h"
+
+namespace X265_NS {
+// private namespace
+
+class Entropy;
+
+/* CU mode-decision layer built on top of Search. The public entry point is
+ * compressCTU(); the protected members are the per-slice-type and per-RD-level
+ * helpers it is declared alongside (their definitions live in analysis.cpp,
+ * which is not part of this header). */
+class Analysis : public Search
+{
+public:
+
+    /* Prediction candidate identifiers. They size and index ModeDepth::pred[]
+     * and PMODE::modes[]; MAX_PRED_TYPES is the count, not a real mode. */
+    enum {
+        PRED_MERGE,
+        PRED_SKIP,
+        PRED_INTRA,
+        PRED_2Nx2N,
+        PRED_BIDIR,
+        PRED_Nx2N,
+        PRED_2NxN,
+        PRED_SPLIT,
+        PRED_2NxnU,
+        PRED_2NxnD,
+        PRED_nLx2N,
+        PRED_nRx2N,
+        PRED_INTRA_NxN, /* 4x4 intra PU blocks for 8x8 CU */
+        PRED_LOSSLESS,  /* lossless encode of best mode */
+        MAX_PRED_TYPES
+    };
+
+    /* Working set kept once per CU depth (see m_modeDepth below). */
+    struct ModeDepth
+    {
+        Mode           pred[MAX_PRED_TYPES]; /* one candidate slot per prediction type */
+        Mode*          bestMode;             /* best candidate so far; NULL means none chosen yet (see checkBestMode) */
+        Yuv            fencYuv;
+        CUDataMemPool  cuMemPool;
+    };
+
+    /* Task group describing a set of prediction modes to evaluate for one CU.
+     * It only holds references, so the master Analysis and the CUGeom must
+     * outlive it. */
+    class PMODE : public BondedTaskGroup
+    {
+    public:
+
+        Analysis&     master;
+        const CUGeom& cuGeom;
+        int           modes[MAX_PRED_TYPES];
+
+        PMODE(Analysis& m, const CUGeom& g) : master(m), cuGeom(g) {}
+
+        void processTasks(int workerThreadId);
+
+    protected:
+
+        /* NOTE(review): declared protected and returning by value; presumably
+         * left undefined to forbid assignment (the reference members make a
+         * default assignment impossible anyway) - confirm in analysis.cpp. */
+        PMODE operator=(const PMODE&);
+    };
+
+    void processPmode(PMODE& pmode, Analysis& slave);
+
+    ModeDepth m_modeDepth[NUM_CU_DEPTH]; /* indexed by CU depth */
+    bool      m_bTryLossless;
+    bool      m_bChromaSa8d;
+
+    Analysis();
+
+    bool create(ThreadLocalData* tld);
+    void destroy();
+
+    Mode& compressCTU(CUData& ctu, Frame& frame, const CUGeom& cuGeom, const Entropy& initialContext);
+
+protected:
+
+    /* Analysis data for load/save modes, keeps getting incremented as CTU analysis proceeds and data is consumed or read */
+    analysis_intra_data* m_reuseIntraDataCTU;
+    analysis_inter_data* m_reuseInterDataCTU;
+    int32_t*             m_reuseRef;
+    uint32_t*            m_reuseBestMergeCand;
+
+    uint32_t m_splitRefIdx[4];
+
+    /* full analysis for an I-slice CU */
+    void compressIntraCU(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+
+    /* full analysis for a P or B slice CU */
+    uint32_t compressInterCU_dist(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    uint32_t compressInterCU_rd0_4(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp);
+    uint32_t compressInterCU_rd5_6(const CUData& parentCTU, const CUGeom& cuGeom, uint32_t &zOrder, int32_t qp);
+
+    /* measure merge and skip */
+    void checkMerge2Nx2N_rd0_4(Mode& skip, Mode& merge, const CUGeom& cuGeom);
+    void checkMerge2Nx2N_rd5_6(Mode& skip, Mode& merge, const CUGeom& cuGeom, bool isShareMergeCand);
+
+    /* measure inter options */
+    void checkInter_rd0_4(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
+    void checkInter_rd5_6(Mode& interMode, const CUGeom& cuGeom, PartSize partSize, uint32_t refmask[2]);
+
+    void checkBidir2Nx2N(Mode& inter2Nx2N, Mode& bidir2Nx2N, const CUGeom& cuGeom);
+
+    /* encode current bestMode losslessly, pick best RD cost */
+    void tryLossless(const CUGeom& cuGeom);
+
+    /* add the RD cost of coding a split flag (0 or 1) to the given mode */
+    void addSplitFlagCost(Mode& mode, uint32_t depth);
+
+    /* work-avoidance heuristics for RD levels < 5 */
+    uint32_t topSkipMinDepth(const CUData& parentCTU, const CUGeom& cuGeom);
+    bool recursionDepthCheck(const CUData& parentCTU, const CUGeom& cuGeom, const Mode& bestMode);
+
+    /* generate residual and recon pixels for an entire CTU recursively (RD0) */
+    void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom);
+
+    int calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom);
+
+    /* check whether current mode is the new best: the first candidate seen at
+     * this depth is accepted unconditionally, later ones only when their
+     * rdCost is strictly lower (ties keep the earlier candidate). Stores the
+     * address of `mode`, so the caller's Mode must outlive the comparison. */
+    inline void checkBestMode(Mode& mode, uint32_t depth)
+    {
+        X265_CHECK(mode.ok(), "mode costs are uninitialized\n");
+
+        ModeDepth& md = m_modeDepth[depth];
+        if (md.bestMode)
+        {
+            if (mode.rdCost < md.bestMode->rdCost)
+                md.bestMode = &mode;
+        }
+        else
+            md.bestMode = &mode;
+    }
+};
+
+/* Bundle holding one Analysis instance (one bundle per worker thread, judging
+ * by the name; Analysis::create() receives a pointer to it). */
+struct ThreadLocalData
+{
+    Analysis analysis;
+
+    /* Release whatever the contained Analysis allocated. */
+    void destroy()
+    {
+        analysis.destroy();
+    }
+};
+
+}
+
+#endif // ifndef X265_ANALYSIS_H
--- a/x265/source/encoder/api.cpp
+++ b/x265/source/encoder/api.cpp
@ -0,0 +1,523 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "bitstream.h"
+#include "param.h"
+
+#include "encoder.h"
+#include "entropy.h"
+#include "level.h"
+#include "nal.h"
+#include "bitcost.h"
+
+/* multilib namespace reflectors
+ *
+ * When another bit-depth build is linked into the same binary (LINKED_8BIT,
+ * LINKED_10BIT, LINKED_12BIT - set from CMake for this file only), its API
+ * entry points live in a depth-specific namespace. They are declared here so
+ * x265_api_get() / x265_api_query() below can forward to them directly. */
+#if LINKED_8BIT
+namespace x265_8bit {
+const x265_api* x265_api_get(int bitDepth);
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+}
+#endif
+
+#if LINKED_10BIT
+namespace x265_10bit {
+const x265_api* x265_api_get(int bitDepth);
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+}
+#endif
+
+#if LINKED_12BIT
+namespace x265_12bit {
+const x265_api* x265_api_get(int bitDepth);
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err);
+}
+#endif
+
+#if EXPORT_C_API
+/* these functions are exported as C functions (default) */
+using namespace X265_NS;
+extern "C" {
+#else
+/* these functions exist within private namespace (multilib) */
+namespace X265_NS {
+#endif
+
+/* Create and configure an encoder from the caller's parameters.
+ *
+ * The caller's struct is never modified: it is copied into a private `param`
+ * that the configuration steps below are free to adjust. A second private
+ * copy, `latestParam`, is taken after create() and attached to the encoder
+ * for later reconfiguration.
+ *
+ * Returns the new encoder, or NULL when p is NULL, the build's bit depth is
+ * inconsistent, an allocation or validation step fails, the stream would be
+ * non-conformant and that is not allowed, or the encoder reports m_aborted
+ * after create(). Every failure after the allocations goes through the
+ * `fail` label, which deletes the encoder (if any) and frees both copies. */
+x265_encoder *x265_encoder_open(x265_param *p)
+{
+    if (!p)
+        return NULL;
+
+#if _MSC_VER
+#pragma warning(disable: 4127) // conditional expression is constant, yes I know
+#endif
+
+    /* compile-time sanity check: X265_DEPTH must agree with HIGH_BIT_DEPTH */
+#if HIGH_BIT_DEPTH
+    if (X265_DEPTH != 10 && X265_DEPTH != 12)
+#else
+    if (X265_DEPTH != 8)
+#endif
+    {
+        x265_log(p, X265_LOG_ERROR, "Build error, internal bit depth mismatch\n");
+        return NULL;
+    }
+
+    /* declared before the first goto so the fail path can always delete/free them */
+    Encoder* encoder = NULL;
+    x265_param* param = PARAM_NS::x265_param_alloc();
+    x265_param* latestParam = PARAM_NS::x265_param_alloc();
+    if (!param || !latestParam)
+        goto fail;
+
+    memcpy(param, p, sizeof(x265_param));
+    x265_log(param, X265_LOG_INFO, "HEVC encoder version %s\n", PFX(version_str));
+    x265_log(param, X265_LOG_INFO, "build info %s\n", PFX(build_info_str));
+
+    x265_setup_primitives(param);
+
+    /* both helpers return non-zero on failure */
+    if (x265_check_params(param))
+        goto fail;
+
+    if (x265_set_globals(param))
+        goto fail;
+
+    encoder = new Encoder;
+    if (!param->rc.bEnableSlowFirstPass)
+        PARAM_NS::x265_param_apply_fastfirstpass(param);
+
+    // may change params for auto-detect, etc
+    encoder->configure(param);
+    // may change rate control and CPB params
+    if (!enforceLevel(*param, encoder->m_vps))
+        goto fail;
+
+    // will detect and set profile/tier/level in VPS
+    determineLevel(*param, encoder->m_vps);
+
+    if (!param->bAllowNonConformance && encoder->m_vps.ptl.profileIdc == Profile::NONE)
+    {
+        x265_log(param, X265_LOG_INFO, "non-conformant bitstreams not allowed (--allow-non-conformance)\n");
+        goto fail;
+    }
+
+    encoder->create();
+    /* snapshot taken after configure()/enforceLevel() so it holds the adjusted values */
+    encoder->m_latestParam = latestParam;
+    memcpy(latestParam, param, sizeof(x265_param));
+    if (encoder->m_aborted)
+        goto fail;
+
+    x265_print_params(param);
+    return encoder;
+
+fail:
+    /* encoder may still be NULL here; delete on NULL is a no-op */
+    delete encoder;
+    PARAM_NS::x265_param_free(param);
+    PARAM_NS::x265_param_free(latestParam);
+    return NULL;
+}
+
+/* Write the stream headers into the encoder's NAL list and expose them.
+ *
+ * *pp_nal receives the address of the first NAL in that list and, when pi_nal
+ * is non-NULL, *pi_nal receives the NAL count. Returns the list's m_occupancy
+ * value, or -1 when enc or pp_nal is NULL. */
+int x265_encoder_headers(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal)
+{
+    if (!pp_nal || !enc)
+        return -1;
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    Entropy sbacCoder;
+    Bitstream bs;
+
+    encoder->getStreamHeaders(encoder->m_nalList, sbacCoder, bs);
+
+    *pp_nal = &encoder->m_nalList.m_nal[0];
+    if (pi_nal)
+        *pi_nal = encoder->m_nalList.m_numNal;
+
+    return encoder->m_nalList.m_occupancy;
+}
+
+/* Copy the encoder's current parameter set (m_param) into *out.
+ * Does nothing when either pointer is NULL. */
+void x265_encoder_parameters(x265_encoder *enc, x265_param *out)
+{
+    if (!enc || !out)
+        return;
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    memcpy(out, encoder->m_param, sizeof(x265_param));
+}
+
+/* Apply param_in to the encoder's pending parameter set (m_latestParam).
+ *
+ * The pending set is snapshotted first. If reconfigureParam() reports failure
+ * (non-zero) the snapshot is restored; on success the encoder is flagged as
+ * reconfigured and the differences are printed. Returns -1 for NULL
+ * arguments, otherwise whatever reconfigureParam() returned. */
+int x265_encoder_reconfig(x265_encoder* enc, x265_param* param_in)
+{
+    if (!enc || !param_in)
+        return -1;
+
+    x265_param previous;
+    Encoder* encoder = static_cast<Encoder*>(enc);
+    memcpy(&previous, encoder->m_latestParam, sizeof(x265_param));
+
+    int status = encoder->reconfigureParam(encoder->m_latestParam, param_in);
+    if (status == 0)
+    {
+        encoder->m_reconfigured = true;
+        x265_print_reconfigured_params(&previous, encoder->m_latestParam);
+    }
+    else
+    {
+        /* reconfigure failed, roll back to the snapshot */
+        memcpy(encoder->m_latestParam, &previous, sizeof(x265_param));
+    }
+
+    return status;
+}
+
+/* Submit one picture (or NULL to flush) and collect any encoded output.
+ *
+ * Returns Encoder::encode()'s result, or -1 when enc is NULL. When that
+ * result is positive and pp_nal is non-NULL, *pp_nal is pointed at the
+ * encoder's NAL list and *pi_nal (if given) receives the NAL count; in every
+ * other case *pi_nal (if given) is set to 0 and *pp_nal is left untouched. */
+int x265_encoder_encode(x265_encoder *enc, x265_nal **pp_nal, uint32_t *pi_nal, x265_picture *pic_in, x265_picture *pic_out)
+{
+    if (!enc)
+        return -1;
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    int numEncoded;
+
+    // While flushing (no input picture), a zero result must not be returned
+    // until the encoder has no delayed pictures left, so keep calling.
+    for (;;)
+    {
+        numEncoded = encoder->encode(pic_in, pic_out);
+        if (numEncoded != 0 || pic_in || !encoder->m_numDelayedPic)
+            break;
+    }
+
+    // The encoder now owns these analysisData buffers; clear the caller's
+    // pointers so they cannot be reused for another picture.
+    if (pic_in)
+    {
+        pic_in->analysisData.intraData = NULL;
+        pic_in->analysisData.interData = NULL;
+    }
+
+    if (pp_nal && numEncoded > 0)
+    {
+        *pp_nal = &encoder->m_nalList.m_nal[0];
+        if (pi_nal)
+            *pi_nal = encoder->m_nalList.m_numNal;
+    }
+    else
+    {
+        if (pi_nal)
+            *pi_nal = 0;
+    }
+
+    return numEncoded;
+}
+
+/* Have the encoder fill *outputStats; statsSizeBytes is passed through to
+ * Encoder::fetchStats(). Does nothing when enc or outputStats is NULL. */
+void x265_encoder_get_stats(x265_encoder *enc, x265_stats *outputStats, uint32_t statsSizeBytes)
+{
+    if (!enc || !outputStats)
+        return;
+
+    static_cast<Encoder*>(enc)->fetchStats(outputStats, statsSizeBytes);
+}
+
+/* Deprecated entry point kept in the API table: the two unnamed arguments
+ * are ignored and the only effect is a deprecation warning. */
+void x265_encoder_log(x265_encoder* enc, int, char **)
+{
+    if (!enc)
+        return;
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+    x265_log(encoder->m_param, X265_LOG_WARNING, "x265_encoder_log is now deprecated\n");
+}
+
+/* Shut an encoder down: stop its jobs, print the summary, destroy and delete
+ * it, then decrement the global g_ctuSizeConfigured counter that
+ * x265_cleanup() consults. A NULL handle is ignored. */
+void x265_encoder_close(x265_encoder *enc)
+{
+    if (!enc)
+        return;
+
+    Encoder *encoder = static_cast<Encoder*>(enc);
+
+    encoder->stopJobs();
+    encoder->printSummary();
+    encoder->destroy();
+    delete encoder;
+
+    ATOMIC_DEC(&g_ctuSizeConfigured);
+}
+
+/* Release process-wide tables, but only while g_ctuSizeConfigured is zero
+ * (x265_encoder_close() decrements it for each encoder it closes). */
+void x265_cleanup(void)
+{
+    if (g_ctuSizeConfigured)
+        return;
+
+    BitCost::destroy();
+    CUData::s_partSet[0] = NULL; /* allow CUData to adjust to new CTU size */
+}
+
+/* Allocate storage for one x265_picture with x265_malloc(). The memory is
+ * returned as-is (not initialised here); the result of x265_malloc() is
+ * passed straight back to the caller. */
+x265_picture *x265_picture_alloc()
+{
+    void* storage = x265_malloc(sizeof(x265_picture));
+    return (x265_picture*)storage;
+}
+
+/* Reset *pic to all-zero and then fill in the fields derived from *param:
+ * bit depth, colour space, automatic QP, no quant offsets and - when an
+ * analysis mode is enabled - the CU/partition counts of the analysis data.
+ *
+ * Both arguments are required. Like the other entry points in this file, a
+ * NULL argument is tolerated: the call is then a no-op instead of a crash. */
+void x265_picture_init(x265_param *param, x265_picture *pic)
+{
+    if (!param || !pic)
+        return;
+
+    memset(pic, 0, sizeof(x265_picture));
+
+    pic->bitDepth = param->internalBitDepth;
+    pic->colorSpace = param->internalCsp;
+    pic->forceqp = X265_QP_AUTO;
+    pic->quantOffsets = NULL;
+    if (param->analysisMode)
+    {
+        /* round the picture size up to whole CUs; g_maxCUSize and
+         * g_maxLog2CUSize are globals defined outside this file */
+        uint32_t widthInCU       = (param->sourceWidth  + g_maxCUSize - 1) >> g_maxLog2CUSize;
+        uint32_t heightInCU      = (param->sourceHeight + g_maxCUSize - 1) >> g_maxLog2CUSize;
+
+        uint32_t numCUsInFrame   = widthInCU * heightInCU;
+        pic->analysisData.numCUsInFrame = numCUsInFrame;
+        pic->analysisData.numPartitions = NUM_4x4_PARTITIONS;
+    }
+}
+
+/* Release a picture with x265_free(), the counterpart of the x265_malloc()
+ * call in x265_picture_alloc(). */
+void x265_picture_free(x265_picture *p)
+{
+    x265_free(p);
+}
+
+/* The API table of this build, handed out by x265_api_get() and
+ * x265_api_query() when the requested bit depth matches (or is 0).
+ * This is a positional aggregate initialiser, so the entries must stay in the
+ * same order as the members of x265_api (declared in x265.h, not shown here):
+ * version/build numbers, struct sizes, build strings, then the function
+ * pointers, and finally the size of x265_frame_stats. */
+static const x265_api libapi =
+{
+    X265_MAJOR_VERSION,
+    X265_BUILD,
+    sizeof(x265_param),
+    sizeof(x265_picture),
+    sizeof(x265_analysis_data),
+    sizeof(x265_zone),
+    sizeof(x265_stats),
+
+    PFX(max_bit_depth),
+    PFX(version_str),
+    PFX(build_info_str),
+
+    &PARAM_NS::x265_param_alloc,
+    &PARAM_NS::x265_param_free,
+    &PARAM_NS::x265_param_default,
+    &PARAM_NS::x265_param_parse,
+    &PARAM_NS::x265_param_apply_profile,
+    &PARAM_NS::x265_param_default_preset,
+    &x265_picture_alloc,
+    &x265_picture_free,
+    &x265_picture_init,
+    &x265_encoder_open,
+    &x265_encoder_parameters,
+    &x265_encoder_reconfig,
+    &x265_encoder_headers,
+    &x265_encoder_encode,
+    &x265_encoder_get_stats,
+    &x265_encoder_log,
+    &x265_encoder_close,
+    &x265_cleanup,
+
+    sizeof(x265_frame_stats),
+};
+
+/* Signatures of the two entry points looked up in a dynamically loaded
+ * library by x265_api_get() / x265_api_query(). */
+typedef const x265_api* (*api_get_func)(int bitDepth);
+typedef const x265_api* (*api_query_func)(int bitDepth, int apiVersion, int* err);
+
+/* Two-step stringification: xstr(X) expands X first, then str() quotes the
+ * result (used to append the value of X265_BUILD to a symbol name). */
+#define xstr(s) str(s)
+#define str(s) #s
+
+/* Shared-library file extension for the current platform. */
+#if _WIN32
+#define ext ".dll"
+#elif MACOS
+#include <dlfcn.h>
+#define ext ".dylib"
+#else
+#include <dlfcn.h>
+#define ext ".so"
+#endif
+
+/* Depth counter for nested dynamic-library lookups; the API getters refuse to
+ * go deeper once it exceeds 1. A plain int with static storage, so it starts
+ * at zero and is not protected against concurrent callers. */
+#if ENABLE_SHARED
+static int g_recursion /* = 0 */;
+#endif
+  
+/* Return the API table for the requested bit depth.
+ *
+ * bitDepth 0, or the depth this file was compiled for (X265_DEPTH), yields
+ * this build's own table. Any other depth is resolved, in order, through:
+ *   1. a build of that depth linked into this binary (LINKED_*BIT);
+ *   2. with ENABLE_SHARED, a dynamically loaded library: first the
+ *      depth-specific one (libx265_main / _main10 / _main12), then the generic
+ *      libx265, which is asked for the depth explicitly.
+ * Returns NULL when the depth is unsupported, nothing could be loaded, the
+ * lookup recursed too deep, or the loaded library reports another bit depth.
+ * The library handle is intentionally never closed: the returned table points
+ * into the loaded library. */
+const x265_api* x265_api_get(int bitDepth)
+{
+    if (bitDepth && bitDepth != X265_DEPTH)
+    {
+#if LINKED_8BIT
+        if (bitDepth == 8) return x265_8bit::x265_api_get(0);
+#endif
+#if LINKED_10BIT
+        if (bitDepth == 10) return x265_10bit::x265_api_get(0);
+#endif
+#if LINKED_12BIT
+        if (bitDepth == 12) return x265_12bit::x265_api_get(0);
+#endif
+#if ENABLE_SHARED
+        const char* libname = NULL;
+        const char* method = "x265_api_get_" xstr(X265_BUILD);
+        const char* multilibname = "libx265" ext;
+
+        if (bitDepth == 12)
+            libname = "libx265_main12" ext;
+        else if (bitDepth == 10)
+            libname = "libx265_main10" ext;
+        else if (bitDepth == 8)
+            libname = "libx265_main" ext;
+        else
+            return NULL;
+
+        const x265_api* api = NULL;
+        /* 0 asks a depth-specific library for its native table; the generic
+         * library must be told which depth is wanted */
+        int reqDepth = 0;
+
+        /* the loaded library may itself come back here; bound the nesting */
+        if (g_recursion > 1)
+            return NULL;
+        else
+            g_recursion++;
+
+#if _WIN32
+        HMODULE h = LoadLibraryA(libname);
+        if (!h)
+        {
+            h = LoadLibraryA(multilibname);
+            /* from here on, diagnostics must name the library actually used */
+            libname = multilibname;
+            reqDepth = bitDepth;
+        }
+        if (h)
+        {
+            api_get_func get = (api_get_func)GetProcAddress(h, method);
+            if (get)
+                api = get(reqDepth);
+        }
+#else
+        void* h = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
+        if (!h)
+        {
+            h = dlopen(multilibname, RTLD_LAZY | RTLD_LOCAL);
+            /* from here on, diagnostics must name the library actually used */
+            libname = multilibname;
+            reqDepth = bitDepth;
+        }
+        if (h)
+        {
+            api_get_func get = (api_get_func)dlsym(h, method);
+            if (get)
+                api = get(reqDepth);
+        }
+#endif
+
+        g_recursion--;
+
+        if (api && bitDepth != api->bit_depth)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "%s does not support requested bitDepth %d\n", libname, bitDepth);
+            return NULL;
+        }
+
+        return api;
+#else
+        return NULL;
+#endif
+    }
+
+    return &libapi;
+}
+
+/* Version-checked variant of x265_api_get().
+ *
+ * apiVersion below 51 is refused outright. Otherwise the table is resolved
+ * exactly as in x265_api_get(): own table for depth 0 / X265_DEPTH, then a
+ * linked build of the requested depth, then (ENABLE_SHARED) a dynamically
+ * loaded depth-specific library with the generic libx265 as fallback.
+ *
+ * err is optional. When given it receives X265_API_QUERY_ERR_NONE on success
+ * or the reason for the NULL return: VER_REFUSED, LIB_NOT_FOUND,
+ * FUNC_NOT_FOUND, WRONG_BITDEPTH, or - when the loaded library's own query
+ * rejected the request - whatever error that library reported. */
+const x265_api* x265_api_query(int bitDepth, int apiVersion, int* err)
+{
+    if (apiVersion < 51)
+    {
+        /* builds before 1.6 had re-ordered public structs */
+        if (err) *err = X265_API_QUERY_ERR_VER_REFUSED;
+        return NULL;
+    }
+
+    if (err) *err = X265_API_QUERY_ERR_NONE;
+
+    if (bitDepth && bitDepth != X265_DEPTH)
+    {
+#if LINKED_8BIT
+        if (bitDepth == 8) return x265_8bit::x265_api_query(0, apiVersion, err);
+#endif
+#if LINKED_10BIT
+        if (bitDepth == 10) return x265_10bit::x265_api_query(0, apiVersion, err);
+#endif
+#if LINKED_12BIT
+        if (bitDepth == 12) return x265_12bit::x265_api_query(0, apiVersion, err);
+#endif
+#if ENABLE_SHARED
+        const char* libname = NULL;
+        const char* method = "x265_api_query";
+        const char* multilibname = "libx265" ext;
+
+        if (bitDepth == 12)
+            libname = "libx265_main12" ext;
+        else if (bitDepth == 10)
+            libname = "libx265_main10" ext;
+        else if (bitDepth == 8)
+            libname = "libx265_main" ext;
+        else
+        {
+            if (err) *err = X265_API_QUERY_ERR_LIB_NOT_FOUND;
+            return NULL;
+        }
+
+        const x265_api* api = NULL;
+        /* 0 asks a depth-specific library for its native table; the generic
+         * library must be told which depth is wanted */
+        int reqDepth = 0;
+        /* error reported if no table is obtained; refined as the lookup advances */
+        int e = X265_API_QUERY_ERR_LIB_NOT_FOUND;
+
+        /* the loaded library may itself come back here; bound the nesting */
+        if (g_recursion > 1)
+        {
+            if (err) *err = X265_API_QUERY_ERR_LIB_NOT_FOUND;
+            return NULL;
+        }
+        else
+            g_recursion++;
+
+#if _WIN32
+        HMODULE h = LoadLibraryA(libname);
+        if (!h)
+        {
+            h = LoadLibraryA(multilibname);
+            /* from here on, diagnostics must name the library actually used */
+            libname = multilibname;
+            reqDepth = bitDepth;
+        }
+        if (h)
+        {
+            e = X265_API_QUERY_ERR_FUNC_NOT_FOUND;
+            api_query_func query = (api_query_func)GetProcAddress(h, method);
+            if (query)
+            {
+                /* collect the library's own verdict so a refusal is not
+                 * misreported as a missing function */
+                int queryErr = X265_API_QUERY_ERR_NONE;
+                api = query(reqDepth, apiVersion, &queryErr);
+                if (!api && queryErr != X265_API_QUERY_ERR_NONE)
+                    e = queryErr;
+            }
+        }
+#else
+        void* h = dlopen(libname, RTLD_LAZY | RTLD_LOCAL);
+        if (!h)
+        {
+            h = dlopen(multilibname, RTLD_LAZY | RTLD_LOCAL);
+            /* from here on, diagnostics must name the library actually used */
+            libname = multilibname;
+            reqDepth = bitDepth;
+        }
+        if (h)
+        {
+            e = X265_API_QUERY_ERR_FUNC_NOT_FOUND;
+            api_query_func query = (api_query_func)dlsym(h, method);
+            if (query)
+            {
+                /* collect the library's own verdict so a refusal is not
+                 * misreported as a missing function */
+                int queryErr = X265_API_QUERY_ERR_NONE;
+                api = query(reqDepth, apiVersion, &queryErr);
+                if (!api && queryErr != X265_API_QUERY_ERR_NONE)
+                    e = queryErr;
+            }
+        }
+#endif
+
+        g_recursion--;
+
+        if (api && bitDepth != api->bit_depth)
+        {
+            x265_log(NULL, X265_LOG_WARNING, "%s does not support requested bitDepth %d\n", libname, bitDepth);
+            if (err) *err = X265_API_QUERY_ERR_WRONG_BITDEPTH;
+            return NULL;
+        }
+
+        if (err) *err = api ? X265_API_QUERY_ERR_NONE : e;
+        return api;
+#else
+        if (err) *err = X265_API_QUERY_ERR_WRONG_BITDEPTH;
+        return NULL;
+#endif
+    }
+
+    return &libapi;
+}
+
+} /* end namespace or extern "C" */
--- a/x265/source/encoder/bitcost.cpp
+++ b/x265/source/encoder/bitcost.cpp
@ -0,0 +1,91 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "bitcost.h"
+
+using namespace X265_NS;
+
+/* Select the motion-vector cost table for the given QP, building it on first
+ * use. The table is shared by all BitCost instances (s_costs[qp]); this
+ * instance just points m_cost at it.
+ *
+ * qp is used directly as an index into s_costs[] and x265_lambda_tab[]; the
+ * caller is responsible for keeping it in range.
+ *
+ * Each table holds 4 * BC_MAX_MV + 1 entries and the stored pointer addresses
+ * the middle one, so it can be indexed with signed offsets in
+ * [-2 * BC_MAX_MV, 2 * BC_MAX_MV]. */
+void BitCost::setQP(unsigned int qp)
+{
+    if (!s_costs[qp])
+    {
+        ScopedLock s(s_costCalcLock);
+
+        // Now that we have acquired the lock, check again if another thread calculated
+        // this row while we were blocked
+        if (!s_costs[qp])
+        {
+            x265_emms(); // just to be safe
+
+            CalculateLogs();
+
+            // Fill the table through a local pointer and publish it only once
+            // it is complete: the unlocked test above lets other threads use
+            // s_costs[qp] as soon as it is non-NULL, so storing the pointer
+            // before the loop would expose a table that is still unfilled.
+            // NOTE(review): this is still a plain store; if the target needs
+            // an explicit release barrier for the unlocked fast path, add one.
+            uint16_t* costs = new uint16_t[4 * BC_MAX_MV + 1] + 2 * BC_MAX_MV;
+            double lambda = x265_lambda_tab[qp];
+
+            // estimate same cost for negative and positive MVD;
+            // each entry is clamped to 15 bits
+            for (int i = 0; i <= 2 * BC_MAX_MV; i++)
+                costs[i] = costs[-i] = (uint16_t)X265_MIN(s_bitsizes[i] * lambda + 0.5f, (1 << 15) - 1);
+
+            s_costs[qp] = costs;
+        }
+    }
+
+    m_cost = s_costs[qp];
+}
+
+/***
+ * Class static data and methods
+ *
+ * All three objects have static storage duration, so the pointers start out
+ * as NULL; setQP() and CalculateLogs() rely on that to detect tables that
+ * have not been built yet, and destroy() resets them to that state.
+ */
+
+uint16_t *BitCost::s_costs[BC_MAX_QP]; // per-QP cost tables, built lazily by setQP()
+
+float *BitCost::s_bitsizes; // bit-size table, built by CalculateLogs()
+
+Lock BitCost::s_costCalcLock; // taken by setQP() while a table is being built
+
+/* Build the shared bit-size table once; later calls return immediately.
+ * Entry 0 is the constant 0.718; entry i (1 <= i <= 2 * BC_MAX_MV) is
+ * log(i + 1) * 2 / log(2) + 1.718, i.e. 2 * log2(i + 1) + 1.718. */
+void BitCost::CalculateLogs()
+{
+    if (s_bitsizes)
+        return;
+
+    s_bitsizes = new float[2 * BC_MAX_MV + 1];
+    s_bitsizes[0] = 0.718f;
+
+    float twoOverLn2 = 2.0f / log(2.0f);  // 2 x 1/log(2)
+    for (int dist = 1; dist <= 2 * BC_MAX_MV; dist++)
+        s_bitsizes[dist] = log((float)(dist + 1)) * twoOverLn2 + 1.718f;
+}
+
+/* Free every cost table and the bit-size table, resetting the pointers to
+ * zero so the tables can be rebuilt on the next setQP(). */
+void BitCost::destroy()
+{
+    for (int qp = 0; qp < BC_MAX_QP; qp++)
+    {
+        if (!s_costs[qp])
+            continue;
+
+        /* the stored pointer addresses the middle of the allocation (see
+         * setQP); step back to the start before releasing it */
+        uint16_t* allocStart = s_costs[qp] - 2 * BC_MAX_MV;
+        delete [] allocStart;
+
+        s_costs[qp] = 0;
+    }
+
+    delete [] s_bitsizes;
+    s_bitsizes = 0;
+}
--- a/x265/source/encoder/bitcost.h
+++ b/x265/source/encoder/bitcost.h
@ -0,0 +1,93 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_BITCOST_H
+#define X265_BITCOST_H
+
+#include "common.h"
+#include "threading.h"
+#include "mv.h"
+
+namespace X265_NS {
+// private x265 namespace
+
+/* Motion-vector bit-cost estimator.
+ *
+ * Typical sequence, as far as this header shows: setQP() selects the shared
+ * cost table for a QP, setMVP() fixes the motion-vector predictor, and
+ * mvcost()/bitcost() then price candidate vectors relative to that predictor.
+ * The tables themselves are static and shared by all instances. */
+class BitCost
+{
+public:
+
+    BitCost() : m_cost_mvx(0), m_cost_mvy(0), m_cost(0), m_mvp(0) {}
+
+    void setQP(unsigned int qp);
+
+    /* Pre-offset the table pointer by the predictor so that indexing with an
+     * absolute mv component reads the entry for (mv - mvp). Uses m_cost, so
+     * setQP() must have been called first. */
+    void setMVP(const MV& mvp)                      { m_mvp = mvp; m_cost_mvx = m_cost - mvp.x; m_cost_mvy = m_cost - mvp.y; }
+
+    // return bit cost of motion vector difference, multiplied by lambda
+    inline uint16_t mvcost(const MV& mv) const      { return m_cost_mvx[mv.x] + m_cost_mvy[mv.y]; }
+
+    // return bit cost of motion vector difference, without lambda
+    // (sum of the two per-component table entries, rounded to nearest)
+    inline uint32_t bitcost(const MV& mv) const
+    {
+        return (uint32_t)(s_bitsizes[abs(mv.x - m_mvp.x)] +
+                          s_bitsizes[abs(mv.y - m_mvp.y)] + 0.5f);
+    }
+
+    // same as above, but against an explicit predictor instead of m_mvp;
+    // static, so it reads s_bitsizes without any instance state
+    static inline uint32_t bitcost(const MV& mv, const MV& mvp)
+    {
+        return (uint32_t)(s_bitsizes[abs(mv.x - mvp.x)] +
+                          s_bitsizes[abs(mv.y - mvp.y)] + 0.5f);
+    }
+
+    /* free all shared tables (see bitcost.cpp) */
+    static void destroy();
+
+protected:
+
+    uint16_t *m_cost_mvx;   // m_cost offset by -mvp.x (set in setMVP)
+
+    uint16_t *m_cost_mvy;   // m_cost offset by -mvp.y (set in setMVP)
+
+    uint16_t *m_cost;       // cost table for the current QP (set in setQP)
+
+    MV        m_mvp;        // predictor last passed to setMVP
+
+    // declared but protected: instances are not meant to be assigned from outside
+    BitCost& operator =(const BitCost&);
+
+private:
+
+    /* default log2_max_mv_length_horizontal and log2_max_mv_length_vertical
+     * are 15, specified in quarter-pel luma sample units. making the maximum
+     * signaled full-pel motion distance 4096, max qpel is 32768 */
+    enum { BC_MAX_MV = (1 << 15) };
+
+    enum { BC_MAX_QP = 82 };            // number of per-QP cost tables
+
+    static float *s_bitsizes;           // 2 * BC_MAX_MV + 1 entries, indexed by |mvd|
+
+    static uint16_t *s_costs[BC_MAX_QP];
+
+    static Lock s_costCalcLock;
+
+    static void CalculateLogs();
+};
+}
+
+#endif // ifndef X265_BITCOST_H
--- a/x265/source/encoder/dpb.cpp
+++ b/x265/source/encoder/dpb.cpp
@ -0,0 +1,303 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "frame.h"
+#include "framedata.h"
+#include "picyuv.h"
+#include "slice.h"
+
+#include "dpb.h"
+
+using namespace X265_NS;
+
+/* Release every Frame and FrameData object that the DPB still holds. */
+DPB::~DPB()
+{
+    // Both frame lists are drained the same way: the free list first, then
+    // the list of pictures that were still in the buffer.
+    PicList* const owned[2] = { &m_freeList, &m_picList };
+    for (int which = 0; which < 2; which++)
+    {
+        PicList& list = *owned[which];
+        while (!list.empty())
+        {
+            Frame* frame = list.popFront();
+            frame->destroy();
+            delete frame;
+        }
+    }
+
+    // Recycled FrameData objects are chained through m_freeListNext; the
+    // m_reconPic of each entry is destroyed and deleted along with it.
+    while (FrameData* data = m_frameDataFreeList)
+    {
+        m_frameDataFreeList = data->m_freeListNext;
+
+        data->destroy();
+        data->m_reconPic->destroy();
+        delete data->m_reconPic;
+        delete data;
+    }
+}
+
+// Move every picture that is no longer referenced from m_picList to
+// m_freeList so that it can be reused
+void DPB::recycleUnreferenced()
+{
+    Frame* cursor = m_picList.first();
+
+    while (cursor)
+    {
+        Frame* frame = cursor;
+        cursor = frame->m_next;
+
+        // keep pictures that still have references or a non-zero m_countRefEncoders
+        if (frame->m_encData->m_bHasReferences || frame->m_countRefEncoders)
+            continue;
+
+        frame->m_reconRowCount.set(0);
+        frame->m_bChromaExtended = false;
+
+        // remove() invalidates the iterator, so the scan restarts at the head
+        m_picList.remove(*frame);
+        cursor = m_picList.first();
+
+        m_freeList.pushBack(*frame);
+
+        // push the FrameData onto its own free list and detach it from the frame
+        FrameData* data = frame->m_encData;
+        data->m_freeListNext = m_frameDataFreeList;
+        m_frameDataFreeList = data;
+        frame->m_encData = NULL;
+        frame->m_reconPic = NULL;
+    }
+}
+/* Prepare newFrame for encoding: set POC, NAL unit type and slice type on its
+ * slice, insert the frame at the front of m_picList, update reference marking,
+ * build the RPS and the reference lists, and increment m_countRefEncoders on
+ * every picture the new slice will predict from. */
+void DPB::prepareEncode(Frame *newFrame)
+{
+    Slice* slice = newFrame->m_encData->m_slice;
+    slice->m_poc = newFrame->m_poc;
+
+    int pocCurr = slice->m_poc;
+    int type = newFrame->m_lowres.sliceType;
+    bool bIsKeyFrame = newFrame->m_lowres.bKeyframe;
+    // an IDR_W_RADL picture becomes the new m_lastIDR; the slice records the latest value either way
+    slice->m_nalUnitType = getNalUnitType(pocCurr, bIsKeyFrame);
+    if (slice->m_nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
+        m_lastIDR = pocCurr;
+    slice->m_lastIDR = m_lastIDR;
+    slice->m_sliceType = IS_X265_TYPE_B(type) ? B_SLICE : (type == X265_TYPE_P) ? P_SLICE : I_SLICE;
+
+    if (type == X265_TYPE_B)
+    {
+        newFrame->m_encData->m_bHasReferences = false;
+
+        // Adjust NAL type for unreferenced B frames (change from _R "referenced"
+        // to _N "non-referenced" NAL unit type)
+        switch (slice->m_nalUnitType)
+        {
+        case NAL_UNIT_CODED_SLICE_TRAIL_R:
+            slice->m_nalUnitType = m_bTemporalSublayer ? NAL_UNIT_CODED_SLICE_TSA_N : NAL_UNIT_CODED_SLICE_TRAIL_N;
+            break;
+        case NAL_UNIT_CODED_SLICE_RADL_R:
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RADL_N;
+            break;
+        case NAL_UNIT_CODED_SLICE_RASL_R:
+            slice->m_nalUnitType = NAL_UNIT_CODED_SLICE_RASL_N;
+            break;
+        default:
+            break;
+        }
+    }
+    else
+    {
+        /* m_bHasReferences starts out as true for non-B pictures, and is set to false
+         * once no more pictures reference it */
+        newFrame->m_encData->m_bHasReferences = true;
+    }
+    // the new frame joins the list before marking runs; the helpers below skip it by comparing POCs
+    m_picList.pushFront(*newFrame);
+
+    // Do decoding refresh marking if any
+    decodingRefreshMarking(pocCurr, slice->m_nalUnitType);
+
+    computeRPS(pocCurr, slice->isIRAP(), &slice->m_rps, slice->m_sps->maxDecPicBuffering);
+
+    // Mark pictures in m_piclist as unreferenced if they are not included in RPS
+    applyReferencePictureSet(&slice->m_rps, pocCurr);
+
+    slice->m_numRefIdx[0] = X265_MIN(m_maxRefL0, slice->m_rps.numberOfNegativePictures); // Ensuring L0 contains just the -ve POC
+    slice->m_numRefIdx[1] = X265_MIN(m_maxRefL1, slice->m_rps.numberOfPositivePictures);
+    slice->setRefPicList(m_picList);
+
+    X265_CHECK(slice->m_sliceType != B_SLICE || slice->m_numRefIdx[1], "B slice without L1 references (non-fatal)\n");
+
+    if (slice->m_sliceType == B_SLICE)
+    {
+        /* TODO: the lookahead should be able to tell which reference picture
+         * had the least motion residual.  We should be able to use that here to
+         * select a colocation reference list and index */
+        slice->m_colFromL0Flag = false;
+        slice->m_colRefIdx = 0;
+        slice->m_bCheckLDC = false;
+    }
+    else
+    {
+        slice->m_bCheckLDC = true;
+        slice->m_colFromL0Flag = true;
+        slice->m_colRefIdx = 0;
+    }
+    slice->m_sLFaseFlag = (SLFASE_CONSTANT & (1 << (pocCurr % 31))) > 0; // set when bit (pocCurr % 31) of SLFASE_CONSTANT is set
+
+    /* Increment reference count of all motion-referenced frames to prevent them
+     * from being recycled. These counts are decremented at the end of
+     * compressFrame() */
+    int numPredDir = slice->isInterP() ? 1 : slice->isInterB() ? 2 : 0;
+    for (int l = 0; l < numPredDir; l++)
+    {
+        for (int ref = 0; ref < slice->m_numRefIdx[l]; ref++)
+        {
+            Frame *refpic = slice->m_refFrameList[l][ref];
+            ATOMIC_INC(&refpic->m_countRefEncoders);
+        }
+    }
+}
+
+/* Build the reference picture set for the picture whose POC is curPoc.
+ *
+ * Walks m_picList and records every picture other than the current one that
+ * is still marked as having references, stopping once maxDecPicBuffer - 1
+ * entries have been collected.
+ *
+ *   curPoc          POC of the picture being encoded
+ *   isRAP           when true, every entry is stored with bUsed == false
+ *   rps             output; poc, deltaPOC, bUsed and the three counters are
+ *                   written, then sortDeltaPOC() is applied
+ *   maxDecPicBuffer buffer size limit; 0 or 1 yields an empty set */
+void DPB::computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer)
+{
+    unsigned int poci = 0, numNeg = 0, numPos = 0;
+
+    Frame* iterPic = m_picList.first();
+
+    /* Compare with 'poci + 1 < maxDecPicBuffer' instead of
+     * 'poci < maxDecPicBuffer - 1': the subtraction is unsigned, so a zero
+     * maxDecPicBuffer would wrap to UINT_MAX and remove the bound on the
+     * writes into the rps arrays below. */
+    while (iterPic && (poci + 1 < maxDecPicBuffer))
+    {
+        if ((iterPic->m_poc != curPoc) && iterPic->m_encData->m_bHasReferences)
+        {
+            rps->poc[poci] = iterPic->m_poc;
+            rps->deltaPOC[poci] = rps->poc[poci] - curPoc;
+
+            // a negative delta means the reference precedes the current picture in POC order
+            if (rps->deltaPOC[poci] < 0)
+                numNeg++;
+            else
+                numPos++;
+
+            rps->bUsed[poci] = !isRAP;
+            poci++;
+        }
+        iterPic = iterPic->m_next;
+    }
+
+    rps->numberOfPictures = poci;
+    rps->numberOfPositivePictures = numPos;
+    rps->numberOfNegativePictures = numNeg;
+
+    rps->sortDeltaPOC();
+}
+
+/* Update reference marking when an IDR or CRA picture is encountered.
+ *
+ * The current picture is already in m_picList when this runs and its own
+ * marking is never touched here. */
+void DPB::decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType)
+{
+    if (nalUnitType == NAL_UNIT_CODED_SLICE_IDR_W_RADL)
+    {
+        // IDR: every other picture in the list becomes "unused for reference"
+        for (Frame* frame = m_picList.first(); frame; frame = frame->m_next)
+        {
+            if (frame->m_poc != pocCurr)
+                frame->m_encData->m_bHasReferences = false;
+        }
+        return;
+    }
+
+    // From here on: CRA, or no decoding refresh at all
+
+    if (m_bRefreshPending && pocCurr > m_pocCRA)
+    {
+        /* A deferred refresh is pending and the current picture follows the
+         * latest CRA: drop every reference except that CRA picture, then
+         * clear the pending flag */
+        for (Frame* frame = m_picList.first(); frame; frame = frame->m_next)
+        {
+            const bool isCurrent = (frame->m_poc == pocCurr);
+            const bool isLatestCRA = (frame->m_poc == m_pocCRA);
+            if (!isCurrent && !isLatestCRA)
+                frame->m_encData->m_bHasReferences = false;
+        }
+
+        m_bRefreshPending = false;
+    }
+
+    if (nalUnitType == NAL_UNIT_CODED_SLICE_CRA)
+    {
+        // CRA: remember its POC and defer the refresh to a later picture
+        m_bRefreshPending = true;
+        m_pocCRA = pocCurr;
+    }
+}
+
+/** Clear the reference marking of every picture that the given Reference
+ *  Picture Set does not list. The current picture (curPoc) and pictures that
+ *  are already unreferenced are left alone. */
+void DPB::applyReferencePictureSet(RPS *rps, int curPoc)
+{
+    for (Frame* frame = m_picList.first(); frame; frame = frame->m_next)
+    {
+        if (frame->m_poc == curPoc || !frame->m_encData->m_bHasReferences)
+            continue;
+
+        // search the RPS entries for this picture's POC, stopping at the first hit
+        bool listed = false;
+        for (int idx = 0; !listed && idx < rps->numberOfPositivePictures + rps->numberOfNegativePictures; idx++)
+            listed = (frame->m_poc == curPoc + rps->deltaPOC[idx]);
+
+        if (!listed)
+            frame->m_encData->m_bHasReferences = false;
+    }
+}
+
+/* Select the nal_unit_type for the picture with POC curPOC. The result is
+ * always a "_R" (referenced) or IRAP type. */
+NalUnitType DPB::getNalUnitType(int curPOC, bool bIsKeyFrame)
+{
+    // POC 0 is always coded as IDR
+    if (curPOC == 0)
+        return NAL_UNIT_CODED_SLICE_IDR_W_RADL;
+
+    if (bIsKeyFrame)
+    {
+        if (m_bOpenGOP)
+            return NAL_UNIT_CODED_SLICE_CRA;
+        return NAL_UNIT_CODED_SLICE_IDR_W_RADL;
+    }
+
+    const bool precedesCRA = (m_pocCRA != 0) && (curPOC < m_pocCRA);
+    if (precedesCRA)
+    {
+        // Every leading picture is marked as a TFD picture here, because the
+        // encoder currently uses all reference pictures while encoding
+        // leading pictures. An encoder could keep a leading picture decodable
+        // after random access to a CRA/CRANT/BLA/BLANT picture by restricting
+        // the references used for it; such a picture would not need the TFD
+        // marking.
+        return NAL_UNIT_CODED_SLICE_RASL_R;
+    }
+
+    const bool precedesIDR = (m_lastIDR != 0) && (curPOC < m_lastIDR);
+    if (precedesIDR)
+        return NAL_UNIT_CODED_SLICE_RADL_R;
+
+    return NAL_UNIT_CODED_SLICE_TRAIL_R;
+}
--- a/x265/source/encoder/dpb.h
+++ b/x265/source/encoder/dpb.h
@ -0,0 +1,80 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_DPB_H
+#define X265_DPB_H
+
+#include "piclist.h"
+
+namespace X265_NS {
+// private namespace for x265
+
+class Frame;
+class FrameData;
+class Slice;
+
+/* Decoded picture buffer bookkeeping: owns the list of pictures currently
+ * held, the pool of recycled frames, and the state needed to pick NAL unit
+ * types and reference picture sets. */
+class DPB
+{
+public:
+
+    int                m_lastIDR;            // POC of the most recent IDR picture
+    int                m_pocCRA;             // POC of the most recent CRA picture
+    int                m_maxRefL0;           // cap applied to the list 0 reference count
+    int                m_maxRefL1;           // cap applied to the list 1 reference count
+    int                m_bOpenGOP;           // copy of param->bOpenGOP
+    bool               m_bRefreshPending;    // a refresh triggered by a CRA is still deferred
+    bool               m_bTemporalSublayer;  // param->bEnableTemporalSubLayers as a bool
+    PicList            m_picList;            // pictures currently in the buffer
+    PicList            m_freeList;           // recycled Frame objects
+    FrameData*         m_frameDataFreeList;  // recycled FrameData, chained via m_freeListNext
+
+    // Members are listed in declaration order, which is the order they are initialised in
+    DPB(x265_param *param)
+        : m_lastIDR(0)
+        , m_pocCRA(0)
+        , m_maxRefL0(param->maxNumReferences)
+        , m_maxRefL1(param->bBPyramid ? 2 : 1)
+        , m_bOpenGOP(param->bOpenGOP)
+        , m_bRefreshPending(false)
+        , m_bTemporalSublayer(!!param->bEnableTemporalSubLayers)
+        , m_frameDataFreeList(NULL)
+    {
+    }
+
+    ~DPB();
+
+    void prepareEncode(Frame*);
+
+    void recycleUnreferenced();
+
+protected:
+
+    void computeRPS(int curPoc, bool isRAP, RPS * rps, unsigned int maxDecPicBuffer);
+
+    void applyReferencePictureSet(RPS *rps, int curPoc);
+    void decodingRefreshMarking(int pocCurr, NalUnitType nalUnitType);
+
+    NalUnitType getNalUnitType(int curPoc, bool bIsKeyFrame);
+};
+}
+
+#endif // X265_DPB_H
--- a/x265/source/encoder/encoder.cpp
+++ b/x265/source/encoder/encoder.cpp
--- a/x265/source/encoder/encoder.h
+++ b/x265/source/encoder/encoder.h
@ -0,0 +1,179 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_ENCODER_H
+#define X265_ENCODER_H
+
+#include "common.h"
+#include "slice.h"
+#include "scalinglist.h"
+#include "x265.h"
+#include "nal.h"
+
+struct x265_encoder {};
+
+namespace X265_NS {
+// private namespace
+extern const char g_sliceTypeToChar[3];
+
+class Entropy;
+
+/* Running totals of encoder statistics. Every counter starts at zero; the
+ * add*() members are declared here and defined elsewhere. */
+struct EncStats
+{
+    double        m_psnrSumY;
+    double        m_psnrSumU;
+    double        m_psnrSumV;
+    double        m_globalSsim;
+    double        m_totalQp;
+    uint64_t      m_accBits;
+    uint32_t      m_numPics;
+    uint16_t      m_maxCLL;
+    double        m_maxFALL;
+
+    // initialisers follow the declaration order above
+    EncStats()
+        : m_psnrSumY(0)
+        , m_psnrSumU(0)
+        , m_psnrSumV(0)
+        , m_globalSsim(0)
+        , m_totalQp(0)
+        , m_accBits(0)
+        , m_numPics(0)
+        , m_maxCLL(0)
+        , m_maxFALL(0)
+    {
+    }
+
+    void addQP(double aveQp);
+
+    void addPsnr(double psnrY, double psnrU, double psnrV);
+
+    void addBits(uint64_t bits);
+
+    void addSsim(double ssim);
+};
+
+class FrameEncoder;
+class DPB;
+class Lookahead;
+class RateControl;
+class ThreadPool;
+/* Encoder: derives from the empty x265_encoder struct declared above, so an
+ * Encoder* converts implicitly to x265_encoder*. Groups the per-stream state
+ * visible below: frame encoders, DPB, lookahead and rate control pointers,
+ * the VPS/SPS/PPS, accumulated statistics and the active parameters. */
+class Encoder : public x265_encoder
+{
+public:
+
+    int                m_pocLast;         // time index (POC)
+    int                m_encodedFrameNum;
+    int                m_outputCount;
+
+    int                m_bframeDelay;
+    int64_t            m_firstPts;
+    int64_t            m_bframeDelayTime;
+    int64_t            m_prevReorderedPts[2];
+
+    ThreadPool*        m_threadPool;
+    FrameEncoder*      m_frameEncoder[X265_MAX_FRAME_THREADS];
+    DPB*               m_dpb;
+
+    Frame*             m_exportedPic;
+
+    int                m_numPools;
+    int                m_curEncoder;
+
+    /* Collect statistics globally */
+    EncStats           m_analyzeAll;
+    EncStats           m_analyzeI;
+    EncStats           m_analyzeP;
+    EncStats           m_analyzeB;
+    int64_t            m_encodeStartTime;
+
+    // weighted prediction
+    int                m_numLumaWPFrames;    // number of P frames with weighted luma reference
+    int                m_numChromaWPFrames;  // number of P frames with weighted chroma reference
+    int                m_numLumaWPBiFrames;  // number of B frames with weighted luma reference
+    int                m_numChromaWPBiFrames; // number of B frames with weighted chroma reference
+    FILE*              m_analysisFile;
+    int                m_conformanceMode;
+    VPS                m_vps;
+    SPS                m_sps;
+    PPS                m_pps;
+    NALList            m_nalList;
+    ScalingList        m_scalingList;      // quantization matrix information
+
+    bool               m_emitCLLSEI;
+    int                m_lastBPSEI;
+    uint32_t           m_numDelayedPic;
+
+    x265_param*        m_param;
+    x265_param*        m_latestParam;
+    RateControl*       m_rateControl;
+    Lookahead*         m_lookahead;
+    Window             m_conformanceWindow;
+
+    bool               m_bZeroLatency;     // x265_encoder_encode() returns NALs for the input picture, zero lag
+    bool               m_aborted;          // fatal error detected
+    bool               m_reconfigured;      // reconfigure of encoder detected
+
+    uint32_t           m_residualSumEmergency[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint16_t           (*m_offsetEmergency)[MAX_NUM_TR_CATEGORIES][MAX_NUM_TR_COEFFS];
+    uint32_t           m_countEmergency[MAX_NUM_TR_CATEGORIES];
+    // the destructor is empty and non-virtual; NOTE(review): resources are presumably released through destroy() - confirm
+    Encoder();
+    ~Encoder() {}
+
+    void create();
+    void stopJobs();
+    void destroy();
+
+    int encode(const x265_picture* pic, x265_picture *pic_out);
+
+    int reconfigureParam(x265_param* encParam, x265_param* param);
+
+    void getStreamHeaders(NALList& list, Entropy& sbacCoder, Bitstream& bs);
+
+    void fetchStats(x265_stats* stats, size_t statsSizeBytes);
+
+    void printSummary();
+
+    char* statsString(EncStats&, char*);
+
+    void configure(x265_param *param);
+
+    void updateVbvPlan(RateControl* rc);
+
+    void allocAnalysis(x265_analysis_data* analysis);
+
+    void freeAnalysis(x265_analysis_data* analysis);
+
+    void readAnalysisFile(x265_analysis_data* analysis, int poc);
+
+    void writeAnalysisFile(x265_analysis_data* pic);
+
+    void finishFrameStats(Frame* pic, FrameEncoder *curEncoder, uint64_t bits, x265_frame_stats* frameStats);
+
+protected:
+    // NOTE(review): presumably populate the parameter set passed in (m_vps / m_sps / m_pps) - confirm in encoder.cpp
+    void initVPS(VPS *vps);
+    void initSPS(SPS *sps);
+    void initPPS(PPS *pps);
+};
+}
+
+#endif // ifndef X265_ENCODER_H
--- a/x265/source/encoder/entropy.cpp
+++ b/x265/source/encoder/entropy.cpp
--- a/x265/source/encoder/entropy.h
+++ b/x265/source/encoder/entropy.h
@ -0,0 +1,255 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_ENTROPY_H
+#define X265_ENTROPY_H
+
+#include "common.h"
+#include "bitstream.h"
+#include "frame.h"
+#include "cudata.h"
+#include "contexts.h"
+#include "slice.h"
+
+namespace X265_NS {
+// private namespace
+
+struct SaoCtuParam;
+struct EstBitsSbac;
+class ScalingList;
+// SplitType: the numeric values matter, because TURecurse uses the mode as a shift count (1 << mode sections)
+enum SplitType
+{
+    DONT_SPLIT            = 0, // 1 << 0 = one section
+    VERTICAL_SPLIT        = 1, // 1 << 1 = two sections
+    QUAD_SPLIT            = 2, // 1 << 2 = four sections
+    NUMBER_OF_SPLIT_MODES = 3  // count of the modes above, used as an array size
+};
+
+/* Cursor over the sections produced by a SplitType: a mode m yields 1 << m
+ * sections, and each step advances absPartIdxTURelCU by absPartIdxStep. */
+struct TURecurse
+{
+    uint32_t section;            // index of the current section, starting at 0
+    uint32_t splitMode;          // the SplitType stored as an integer
+    uint32_t absPartIdxTURelCU;  // current partition index
+    uint32_t absPartIdxStep;     // increment applied per section
+
+    TURecurse(SplitType splitType, uint32_t _absPartIdxStep, uint32_t _absPartIdxTU)
+        : section(0)
+        , splitMode((uint32_t)splitType)
+        , absPartIdxTURelCU(_absPartIdxTU)
+    {
+        // the caller's step is divided by 1, 2 or 4 depending on the split mode
+        static const uint32_t partIdxStepShift[NUMBER_OF_SPLIT_MODES] = { 0, 1, 2 };
+        absPartIdxStep = _absPartIdxStep >> partIdxStepShift[splitMode];
+    }
+
+    /* Advance to the next section. Returns true while another section
+     * remains; an unsplit unit never has one and its index is not moved. */
+    bool isNextSection()
+    {
+        const bool isSplit = (splitMode != DONT_SPLIT);
+
+        if (isSplit)
+            absPartIdxTURelCU += absPartIdxStep;
+
+        section++;
+        return isSplit && section < (uint32_t)(1 << splitMode);
+    }
+
+    /* True when the current section is the final one for this split mode. */
+    bool isLastSection() const
+    {
+        const uint32_t numSections = (uint32_t)(1 << splitMode);
+        return (section + 1) >= numSections;
+    }
+};
+// EstBitsSbac: tables of integer bit estimates, passed by reference to the Entropy::est*() functions
+struct EstBitsSbac
+{
+    int significantCoeffGroupBits[NUM_SIG_CG_FLAG_CTX][2]; // NOTE(review): the [2] axis is presumably the bin value (0/1) - confirm
+    int significantBits[2][NUM_SIG_FLAG_CTX];              // note the axes are the other way round here: [2] comes first
+    int lastBits[2][10];                                   // NOTE(review): meaning of the 10 entries is not visible in this header
+    int greaterOneBits[NUM_ONE_FLAG_CTX][2];
+    int levelAbsBits[NUM_ABS_FLAG_CTX][2];
+    int blockCbpBits[NUM_QT_CBF_CTX][2];
+    int blockRootCbpBits[2];
+};
+/* Entropy: CABAC syntax element coder built on SyntaxElementWriter. Holds the
+ * context models (m_contextState), the arithmetic coder state and a bit
+ * counter (m_fracBits). getNumberOfWrittenBits() expects m_bitIf to be null
+ * ("bit counting mode"); finishSlice() expects it to be a Bitstream. */
+class Entropy : public SyntaxElementWriter
+{
+public:
+
+    uint64_t      m_pad;
+    uint8_t       m_contextState[160]; // MAX_OFF_CTX_MOD + padding
+
+    /* CABAC state */
+    uint32_t      m_low;
+    uint32_t      m_range;
+    uint32_t      m_bufferedByte;
+    int           m_numBufferedBytes;
+    int           m_bitsLeft;
+    uint64_t      m_fracBits;
+    EstBitsSbac   m_estBitsSbac;
+
+    Entropy();
+
+    void setBitstream(Bitstream* p)    { m_bitIf = p; }
+    // m_fracBits carries 15 fractional bits; the shift below returns the whole-bit part
+    uint32_t getNumberOfWrittenBits()
+    {
+        X265_CHECK(!m_bitIf, "bit counting mode expected\n");
+        return (uint32_t)(m_fracBits >> 15);
+    }
+    // m_valid and markInvalid() exist only in checked or debug builds; markValid() is callable in every build
+#if CHECKED_BUILD || _DEBUG
+    bool m_valid;
+    void markInvalid()                 { m_valid = false; }
+    void markValid()                   { m_valid = true; }
+#else
+    void markValid()                   { }
+#endif
+    void zeroFract()                   { m_fracBits = 0; }
+    void resetBits();
+    void resetEntropy(const Slice& slice);
+
+    // SBAC RD
+    void load(const Entropy& src)            { copyFrom(src); }
+    void store(Entropy& dest) const          { dest.copyFrom(*this); }
+    void loadContexts(const Entropy& src)    { copyContextsFrom(src); }
+    void loadIntraDirModeLuma(const Entropy& src);
+    void copyState(const Entropy& other);
+
+    void codeVPS(const VPS& vps);
+    void codeSPS(const SPS& sps, const ScalingList& scalingList, const ProfileTierLevel& ptl);
+    void codePPS(const PPS& pps);
+    void codeVUI(const VUI& vui, int maxSubTLayers);
+    void codeAUD(const Slice& slice);
+    void codeHrdParameters(const HRDInfo& hrd, int maxSubTLayers);
+
+    void codeSliceHeader(const Slice& slice, FrameData& encData);
+    void codeSliceHeaderWPPEntryPoints(const Slice& slice, const uint32_t *substreamSizes, uint32_t maxOffset);
+    void codeShortTermRefPicSet(const RPS& rps);
+    void finishSlice()                 { encodeBinTrm(1); finish(); dynamic_cast<Bitstream*>(m_bitIf)->writeByteAlignment(); }
+    // NOTE: finishSlice() above dereferences the dynamic_cast result unchecked, so m_bitIf must be a Bitstream when it is called
+    void encodeCTU(const CUData& cu, const CUGeom& cuGeom);
+
+    void codeIntraDirLumaAng(const CUData& cu, uint32_t absPartIdx, bool isMultiple);
+    void codeIntraDirChroma(const CUData& cu, uint32_t absPartIdx, uint32_t *chromaDirMode);
+
+    void codeMergeIndex(const CUData& cu, uint32_t absPartIdx);
+    void codeMvd(const CUData& cu, uint32_t absPartIdx, int list);
+
+    void codePartSize(const CUData& cu, uint32_t absPartIdx, uint32_t depth);
+    void codePredInfo(const CUData& cu, uint32_t absPartIdx);
+    inline void codeQtCbfLuma(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth) { codeQtCbfLuma(cu.getCbf(absPartIdx, TEXT_LUMA, tuDepth), tuDepth); }
+
+    void codeQtCbfChroma(const CUData& cu, uint32_t absPartIdx, TextType ttype, uint32_t tuDepth, bool lowestLevel);
+    void codeCoeff(const CUData& cu, uint32_t absPartIdx, bool& bCodeDQP, const uint32_t depthRange[2]);
+    void codeCoeffNxN(const CUData& cu, const coeff_t* coef, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype);
+    // single-bin helpers: each encodes one bin with the context model selected by its OFF_* offset
+    inline void codeSaoMerge(uint32_t code)                          { encodeBin(code, m_contextState[OFF_SAO_MERGE_FLAG_CTX]); }
+    inline void codeMVPIdx(uint32_t symbol)                          { encodeBin(symbol, m_contextState[OFF_MVP_IDX_CTX]); }
+    inline void codeMergeFlag(const CUData& cu, uint32_t absPartIdx) { encodeBin(cu.m_mergeFlag[absPartIdx], m_contextState[OFF_MERGE_FLAG_EXT_CTX]); }
+    inline void codeSkipFlag(const CUData& cu, uint32_t absPartIdx)  { encodeBin(cu.isSkipped(absPartIdx), m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]); }
+    inline void codeSplitFlag(const CUData& cu, uint32_t absPartIdx, uint32_t depth) { encodeBin(cu.m_cuDepth[absPartIdx] > depth, m_contextState[OFF_SPLIT_FLAG_CTX + cu.getCtxSplitFlag(absPartIdx, depth)]); }
+    inline void codeTransformSubdivFlag(uint32_t symbol, uint32_t ctx)    { encodeBin(symbol, m_contextState[OFF_TRANS_SUBDIV_FLAG_CTX + ctx]); }
+    inline void codePredMode(int predMode)                                { encodeBin(predMode == MODE_INTRA ? 1 : 0, m_contextState[OFF_PRED_MODE_CTX]); }
+    inline void codeCUTransquantBypassFlag(uint32_t symbol)               { encodeBin(symbol, m_contextState[OFF_TQUANT_BYPASS_FLAG_CTX]); }
+    inline void codeQtCbfLuma(uint32_t cbf, uint32_t tuDepth)             { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + !tuDepth]); }
+    inline void codeQtCbfChroma(uint32_t cbf, uint32_t tuDepth)           { encodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + 2 + tuDepth]); }
+    inline void codeQtRootCbf(uint32_t cbf)                               { encodeBin(cbf, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+    inline void codeTransformSkipFlags(uint32_t transformSkip, TextType ttype) { encodeBin(transformSkip, m_contextState[OFF_TRANSFORMSKIP_FLAG_CTX + (ttype ? NUM_TRANSFORMSKIP_FLAG_CTX : 0)]); }
+    void codeDeltaQP(const CUData& cu, uint32_t absPartIdx);
+    void codeSaoOffset(const SaoCtuParam& ctuParam, int plane);
+
+    /* RDO functions */
+    void estBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
+    void estCBFBit(EstBitsSbac& estBitsSbac) const;
+    void estSignificantCoeffGroupMapBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
+    void estSignificantMapBit(EstBitsSbac& estBitsSbac, uint32_t log2TrSize, bool bIsLuma) const;
+    void estSignificantCoefficientsBit(EstBitsSbac& estBitsSbac, bool bIsLuma) const;
+    // the bits*() helpers below are const: they estimate costs via bitsCodeBin() without touching the context models
+    inline uint32_t bitsIntraModeNonMPM() const { return bitsCodeBin(0, m_contextState[OFF_ADI_CTX]) + 5; }
+    inline uint32_t bitsIntraModeMPM(const uint32_t preds[3], uint32_t dir) const { return bitsCodeBin(1, m_contextState[OFF_ADI_CTX]) + (dir == preds[0] ? 1 : 2); }
+    inline uint32_t estimateCbfBits(uint32_t cbf, TextType ttype, uint32_t tuDepth) const { return bitsCodeBin(cbf, m_contextState[OFF_QT_CBF_CTX + ctxCbf[ttype][tuDepth]]); }
+    uint32_t bitsInterMode(const CUData& cu, uint32_t absPartIdx, uint32_t depth) const;
+    uint32_t bitsIntraMode(const CUData& cu, uint32_t absPartIdx) const
+    {
+        return bitsCodeBin(0, m_contextState[OFF_SKIP_FLAG_CTX + cu.getCtxSkipFlag(absPartIdx)]) + /* not skip */
+               bitsCodeBin(1, m_contextState[OFF_PRED_MODE_CTX]); /* intra */
+    }
+
+    /* these functions are only used to estimate the bits when cbf is 0 and will never be called when writing the bitstream. */
+    inline void codeQtRootCbfZero() { encodeBin(0, m_contextState[OFF_QT_ROOT_CBF_CTX]); }
+
+private:
+
+    /* CABAC private methods */
+    void start();
+    void finish();
+
+    void encodeBin(uint32_t binValue, uint8_t& ctxModel);
+    void encodeBinEP(uint32_t binValue);
+    void encodeBinsEP(uint32_t binValues, int numBins);
+    void encodeBinTrm(uint32_t binValue);
+
+    /* return the bits of encoding the context bin without updating;
+     * only the low 15 (fractional) bits of m_fracBits take part in the sum */
+    inline uint32_t bitsCodeBin(uint32_t binValue, uint32_t ctxModel) const
+    {
+        uint64_t fracBits = (m_fracBits & 32767) + sbacGetEntropyBits(ctxModel, binValue);
+        return (uint32_t)(fracBits >> 15);
+    }
+
+    void encodeCU(const CUData& ctu, const CUGeom &cuGeom, uint32_t absPartIdx, uint32_t depth, bool& bEncodeDQP);
+    void finishCU(const CUData& ctu, uint32_t absPartIdx, uint32_t depth, bool bEncodeDQP);
+
+    void writeOut();
+
+    /* SBac private methods */
+    void writeUnaryMaxSymbol(uint32_t symbol, uint8_t* scmModel, int offset, uint32_t maxSymbol);
+    void writeEpExGolomb(uint32_t symbol, uint32_t count);
+    void writeCoefRemainExGolomb(uint32_t symbol, const uint32_t absGoRice);
+
+    void codeProfileTier(const ProfileTierLevel& ptl, int maxTempSubLayers);
+    void codeScalingList(const ScalingList&);
+    void codeScalingList(const ScalingList& scalingList, uint32_t sizeId, uint32_t listId);
+
+    void codePredWeightTable(const Slice& slice);
+    void codeInterDir(const CUData& cu, uint32_t absPartIdx);
+    void codePUWise(const CUData& cu, uint32_t absPartIdx);
+    void codeRefFrmIdxPU(const CUData& cu, uint32_t absPartIdx, int list);
+    void codeRefFrmIdx(const CUData& cu, uint32_t absPartIdx, int list);
+
+    void codeSaoMaxUvlc(uint32_t code, uint32_t maxSymbol);
+
+    void codeLastSignificantXY(uint32_t posx, uint32_t posy, uint32_t log2TrSize, bool bIsLuma, uint32_t scanIdx);
+
+    void encodeTransform(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, uint32_t log2TrSize,
+                         bool& bCodeDQP, const uint32_t depthRange[2]);
+
+    void copyFrom(const Entropy& src);
+    void copyContextsFrom(const Entropy& src);
+};
+}
+
+#endif // ifndef X265_ENTROPY_H
--- a/x265/source/encoder/frameencoder.cpp
+++ b/x265/source/encoder/frameencoder.cpp
--- a/x265/source/encoder/frameencoder.h
+++ b/x265/source/encoder/frameencoder.h
@ -0,0 +1,234 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Shin Yee <shinyee@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_FRAMEENCODER_H
+#define X265_FRAMEENCODER_H
+
+#include "common.h"
+#include "wavefront.h"
+#include "bitstream.h"
+#include "frame.h"
+#include "picyuv.h"
+#include "md5.h"
+
+#include "analysis.h"
+#include "sao.h"
+
+#include "entropy.h"
+#include "framefilter.h"
+#include "ratecontrol.h"
+#include "reference.h"
+#include "nal.h"
+
+namespace X265_NS {
+// private x265 namespace
+
+class ThreadPool;
+class Encoder;
+
+#define ANGULAR_MODE_ID 2
+#define AMP_ID 3
+
+/* Bundle of CU counters that are all zero on construction.
+ * NOTE(review): the [4] dimension is presumably the CU depth and the second
+ * dimension of the distribution tables the prediction mode - confirm against
+ * the code that fills these in. */
+struct StatisticLog
+{
+    uint64_t cntInter[4];                           // inter CU count
+    uint64_t cntIntra[4];                           // intra CU count
+    uint64_t cuInterDistribution[4][INTER_MODES];   // inter CU count per mode
+    uint64_t cuIntraDistribution[4][INTRA_MODES];   // intra CU count per mode
+    uint64_t cntIntraNxN;                           // intra NxN CU count
+    uint64_t cntSkipCu[4];                          // skipped CU count
+    uint64_t cntTotalCu[4];                         // total CU count
+    uint64_t totalCu;                               // grand total
+
+    StatisticLog()
+    {
+        /* every member is a plain integer counter, so zero-filling the
+         * whole object is a valid way to reset it */
+        memset(this, 0, sizeof(*this));
+    }
+};
+
+/* Manages the state of encoding one row of CTU blocks.  When
+ * WPP is active, several rows will be simultaneously encoded.
+ * init() must be called at the start of each frame; it resets the scheduling
+ * flags, the progress counter and the row statistics, and reloads
+ * rowGoOnCoder from the supplied context.  bufferedEntropy is not touched by
+ * init(). */
+struct CTURow
+{
+    Entropy           bufferedEntropy;  /* store CTU2 context for next row CTU0 */
+    Entropy           rowGoOnCoder;     /* store context between CTUs, code bitstream if !SAO */
+
+    FrameStats        rowStats;         /* statistics for this row, zeroed by init() */
+
+    /* Threading variables */
+
+    /* This lock must be acquired when reading or writing active or busy */
+    Lock              lock;
+
+    /* row is ready to run, has no neighbor dependencies. The row may have
+     * external dependencies (reference frame pixels) that prevent it from being
+     * processed, so it may stay with active=true for some time before it is
+     * encoded by a worker thread. */
+    volatile bool     active;
+
+    /* row is being processed by a worker thread.  This flag is only true when a
+     * worker thread is within the context of FrameEncoder::processRow(). This
+     * flag is used to detect multiple possible wavefront problems. */
+    volatile bool     busy;
+
+    /* count of completed CUs in this row */
+    volatile uint32_t completed;
+
+    /* Called at the start of each frame to initialize state.
+     * initContext is the entropy context that rowGoOnCoder starts from. */
+    void init(Entropy& initContext)
+    {
+        active = false;
+        busy = false;
+        completed = 0;
+        memset(&rowStats, 0, sizeof(rowStats));
+        rowGoOnCoder.load(initContext);
+    }
+};
+
+// Manages the wave-front processing of a single encoding frame.
+// It is both a WaveFront (row job provider) and a Thread; each picture row is
+// exposed to WaveFront as two jobs, an encode job (index row * 2) and a filter
+// job (index row * 2 + 1), see the enqueueRow*/enableRow* helpers at the bottom.
+class FrameEncoder : public WaveFront, public Thread
+{
+public:
+
+    FrameEncoder();
+
+    virtual ~FrameEncoder() {}
+
+    /* one-time setup; numRows / numCols are the CTU row and column counts.
+     * NOTE(review): return value presumably false on failure - confirm in frameencoder.cpp */
+    virtual bool init(Encoder *top, int numRows, int numCols);
+
+    void destroy();
+
+    /* triggers encode of a new frame by the worker thread */
+    bool startCompressFrame(Frame* curFrame);
+
+    /* blocks until worker thread is done, returns access unit */
+    Frame *getEncodedPicture(NALList& list);
+
+    Event                    m_enable;
+    Event                    m_done;
+    Event                    m_completionEvent;   // triggered by FrameFilter::processRowPost() once m_completionCount reaches 2 * m_numRows
+    int                      m_localTldIdx;
+
+    volatile bool            m_threadActive;
+    volatile bool            m_bAllRowsStop;
+    volatile int             m_completionCount;   // atomically incremented at the end of FrameFilter::processRowPost()
+    volatile int             m_vbvResetTriggerRow;
+
+    uint32_t                 m_numRows;
+    uint32_t                 m_numCols;
+    uint32_t                 m_filterRowDelay;
+    uint32_t                 m_filterRowDelayCus;
+    uint32_t                 m_refLagRows;
+
+    CTURow*                  m_rows;
+    RateControlEntry         m_rce;
+    SEIDecodedPictureHash    m_seiReconPictureDigest;
+
+    /* per-frame quality accumulators, updated row by row in FrameFilter::processRowPost() */
+    uint64_t                 m_SSDY;
+    uint64_t                 m_SSDU;
+    uint64_t                 m_SSDV;
+    double                   m_ssim;
+    uint64_t                 m_accessUnitBits;
+    uint32_t                 m_ssimCnt;
+    /* running decoded-picture-hash state, one entry per plane (Y, Cb, Cr) */
+    MD5Context               m_state[3];
+    uint32_t                 m_crc[3];
+    uint32_t                 m_checksum[3];
+
+    volatile int             m_activeWorkerCount;        // count of workers currently encoding or filtering CTUs
+    volatile int             m_totalActiveWorkerCount;   // sum of m_activeWorkerCount sampled at end of each CTU
+    volatile int             m_activeWorkerCountSamples; // count of times m_activeWorkerCount was sampled (think vbv restarts)
+    volatile int             m_countRowBlocks;           // count of workers forced to abandon a row because of top dependency
+    int64_t                  m_startCompressTime;        // timestamp when frame encoder is given a frame
+    int64_t                  m_row0WaitTime;             // timestamp when row 0 is allowed to start
+    int64_t                  m_allRowsAvailableTime;     // timestamp when all reference dependencies are resolved
+    int64_t                  m_endCompressTime;          // timestamp after all CTUs are compressed
+    int64_t                  m_endFrameTime;             // timestamp after RCEnd, NR updates, etc
+    int64_t                  m_stallStartTime;           // timestamp when worker count becomes 0
+    int64_t                  m_prevOutputTime;           // timestamp when prev frame was retrieved by API thread
+    int64_t                  m_slicetypeWaitTime;        // total elapsed time waiting for decided frame
+    int64_t                  m_totalWorkerElapsedTime;   // total elapsed time spent by worker threads processing CTUs
+    int64_t                  m_totalNoWorkerTime;        // total elapsed time without any active worker threads
+#if DETAILED_CU_STATS
+    CUStats                  m_cuStats;
+#endif
+
+    Encoder*                 m_top;
+    x265_param*              m_param;
+    Frame*                   m_frame;
+    NoiseReduction*          m_nr;
+    ThreadLocalData*         m_tld; /* for --no-wpp */
+    Bitstream*               m_outStreams;
+    uint32_t*                m_substreamSizes;
+
+    /* CU geometry tables; FrameFilter looks up a CTU's geometry as
+     * m_cuGeoms[m_ctuGeomMap[cuAddr]] */
+    CUGeom*                  m_cuGeoms;
+    uint32_t*                m_ctuGeomMap;
+
+    Bitstream                m_bs;
+    MotionReference          m_mref[2][MAX_NUM_REF + 1];
+    Entropy                  m_entropyCoder;
+    Entropy                  m_initSliceContext;   // context loaded into the SAO coders at the start of each filtered row
+    FrameFilter              m_frameFilter;
+    NALList                  m_nalList;
+
+    /* task group bound to one FrameEncoder (held by reference in 'master') */
+    class WeightAnalysis : public BondedTaskGroup
+    {
+    public:
+
+        FrameEncoder& master;
+
+        WeightAnalysis(FrameEncoder& fe) : master(fe) {}
+
+        void processTasks(int workerThreadId);
+
+    protected:
+
+        /* declared but given no body here; the reference member makes a
+         * generated assignment operator impossible anyway */
+        WeightAnalysis operator=(const WeightAnalysis&);
+    };
+
+protected:
+
+    bool initializeGeoms();
+
+    /* analyze / compress frame, can be run in parallel within reference constraints */
+    void compressFrame();
+
+    /* called by compressFrame to generate final per-row bitstreams */
+    void encodeSlice();
+
+    void threadMain();
+    int  collectCTUStatistics(const CUData& ctu, FrameStats* frameLog);
+    void noiseReductionUpdate();
+
+    /* Called by WaveFront::findJob() */
+    virtual void processRow(int row, int threadId);
+    virtual void processRowEncoder(int row, ThreadLocalData& tld);
+
+    /* WaveFront job indices interleave the two stages of each picture row:
+     * even index = encode job, odd index = filter job */
+    void enqueueRowEncoder(int row) { WaveFront::enqueueRow(row * 2 + 0); }
+    void enqueueRowFilter(int row)  { WaveFront::enqueueRow(row * 2 + 1); }
+    void enableRowEncoder(int row)  { WaveFront::enableRow(row * 2 + 0); }
+    void enableRowFilter(int row)   { WaveFront::enableRow(row * 2 + 1); }
+};
+}
+
+#endif // ifndef X265_FRAMEENCODER_H
--- a/x265/source/encoder/framefilter.cpp
+++ b/x265/source/encoder/framefilter.cpp
@ -0,0 +1,494 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "frame.h"
+#include "framedata.h"
+#include "encoder.h"
+#include "framefilter.h"
+#include "frameencoder.h"
+#include "wavefront.h"
+
+using namespace X265_NS;
+
+static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height);
+static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt);
+
+/* Start with every pointer member cleared; init() and start() assign them later. */
+FrameFilter::FrameFilter()
+{
+    m_param = NULL;
+    m_frame = NULL;
+    m_frameEncoder = NULL;
+    m_ssimBuf = NULL;
+}
+
+/* Release what init() acquired: the SAO state (when SAO is enabled) and the
+ * SSIM scratch buffer.
+ *
+ * May be called on an object whose init() never ran: m_param is then still
+ * the NULL set by the constructor and there is no SAO state to tear down.
+ * The buffer pointer is cleared after the free so that a repeated call does
+ * not free it a second time. */
+void FrameFilter::destroy()
+{
+    if (m_param && m_param->bEnableSAO)
+        m_sao.destroy();
+
+    X265_FREE(m_ssimBuf);
+    m_ssimBuf = NULL;
+}
+
+/* One-time setup of the filter for a frame encoder.
+ *   top     - owning encoder; supplies the parameter set and the SPS conformance window
+ *   frame   - frame encoder this filter works for
+ *   numRows - number of CTU rows in a picture
+ * If the SAO object cannot be created, SAO is switched off in the shared
+ * parameter set.  The SSIM scratch buffer is only allocated when SSIM
+ * reporting is enabled. */
+void FrameFilter::init(Encoder *top, FrameEncoder *frame, int numRows)
+{
+    m_param = top->m_param;
+    m_frameEncoder = frame;
+    m_numRows = numRows;
+
+    m_hChromaShift = CHROMA_H_SHIFT(m_param->internalCsp);
+    m_vChromaShift = CHROMA_V_SHIFT(m_param->internalCsp);
+
+    m_pad[0] = top->m_sps.conformanceWindow.rightOffset;
+    m_pad[1] = top->m_sps.conformanceWindow.bottomOffset;
+
+    /* SAO trails by one row only when deblocking runs in front of it */
+    if (m_param->bEnableLoopFilter)
+        m_saoRowDelay = 1;
+    else
+        m_saoRowDelay = 0;
+
+    /* height of the bottom CTU row: the remainder of the picture height, or
+     * a full CTU when the height is an exact multiple of the CTU size */
+    const uint32_t partialHeight = m_param->sourceHeight % g_maxCUSize;
+    m_lastHeight = partialHeight ? partialHeight : g_maxCUSize;
+
+    if (m_param->bEnableSAO && !m_sao.create(m_param))
+        m_param->bEnableSAO = 0;
+
+    if (m_param->bEnableSsim)
+    {
+        m_ssimBuf = X265_MALLOC(int, 8 * (m_param->sourceWidth / 4 + 3));
+    }
+}
+
+/* Bind the filter to the picture about to be processed and, when SAO is
+ * enabled, let the SAO object begin a new slice with the given entropy state
+ * and QP. */
+void FrameFilter::start(Frame *frame, Entropy& initState, int qp)
+{
+    m_frame = frame;
+
+    if (!m_param->bEnableSAO)
+        return;
+
+    m_sao.startSlice(frame, initState, qp);
+}
+
+/* Run the in-loop filters for one CTU row.
+ *
+ * With both deblocking and SAO disabled the row is handed straight to
+ * processRowPost().  Otherwise the row is deblocked, SAO decisions are made
+ * for it, SAO is applied m_saoRowDelay rows behind, and processRowPost() is
+ * called for the previous row.  The last row additionally flushes the SAO
+ * rows still pending and post-processes itself. */
+void FrameFilter::processRow(int row)
+{
+    ProfileScopeEvent(filterCTURow);
+
+#if DETAILED_CU_STATS
+    ScopedElapsedTime filterPerfScope(m_frameEncoder->m_cuStats.loopFilterElapsedTime);
+    m_frameEncoder->m_cuStats.countLoopFilter++;
+#endif
+
+    /* nothing to filter: the row can be finished immediately */
+    if (!(m_param->bEnableLoopFilter || m_param->bEnableSAO))
+    {
+        processRowPost(row);
+        return;
+    }
+
+    FrameData& encData = *m_frame->m_encData;
+    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
+    const uint32_t firstAddr = row * numCols;
+
+    if (m_param->bEnableLoopFilter)
+    {
+        const CUGeom* geoms = m_frameEncoder->m_cuGeoms;
+        const uint32_t* geomMap = m_frameEncoder->m_ctuGeomMap;
+
+        /* vertical edges of each CTU first, then the horizontal edges of
+         * its left neighbour */
+        for (uint32_t col = 0; col < numCols; col++)
+        {
+            const uint32_t curAddr = firstAddr + col;
+            const CUData* curCtu = encData.getPicCTU(curAddr);
+            deblockCTU(curCtu, geoms[geomMap[curAddr]], Deblock::EDGE_VER);
+
+            if (!col)
+                continue;
+
+            const uint32_t leftAddr = curAddr - 1;
+            const CUData* leftCtu = encData.getPicCTU(leftAddr);
+            deblockCTU(leftCtu, geoms[geomMap[leftAddr]], Deblock::EDGE_HOR);
+        }
+
+        /* the right-most CTU has no right neighbour to trigger its horizontal pass */
+        const uint32_t lastAddr = firstAddr + numCols - 1;
+        const CUData* lastCtu = encData.getPicCTU(lastAddr);
+        deblockCTU(lastCtu, geoms[geomMap[lastAddr]], Deblock::EDGE_HOR);
+    }
+
+    // SAO
+    SAOParam* saoParam = encData.m_saoParam;
+    if (m_param->bEnableSAO)
+    {
+        Entropy& sliceContext = m_frameEncoder->m_initSliceContext;
+        m_sao.m_entropyCoder.load(sliceContext);
+        m_sao.m_rdContexts.next.load(sliceContext);
+        m_sao.m_rdContexts.cur.load(sliceContext);
+
+        m_sao.rdoSaoUnitRow(saoParam, row);
+
+        /* SAO is applied m_saoRowDelay rows behind the decision stage (the
+         * original authors note the decision needs pixels of the row above
+         * and wonder whether this is inherited from HM) */
+        if (row >= m_saoRowDelay)
+            processSao(row - m_saoRowDelay);
+    }
+
+    // the previous row of CTUs is now completely filtered
+    if (row > 0)
+        processRowPost(row - 1);
+
+    if (row != m_numRows - 1)
+        return;
+
+    /* last row: flush what is still pending */
+    if (m_param->bEnableSAO)
+    {
+        m_sao.rdoSaoUnitRowEnd(saoParam, encData.m_slice->m_sps->numCUsInFrame);
+
+        for (int i = m_numRows - m_saoRowDelay; i < m_numRows; i++)
+            processSao(i);
+    }
+
+    processRowPost(row);
+}
+
+/* Height in luma rows of CTU row rowNum: the bottom row uses m_lastHeight,
+ * every other row is a full CTU tall. */
+uint32_t FrameFilter::getCUHeight(int rowNum) const
+{
+    if (rowNum == m_numRows - 1)
+        return m_lastHeight;
+
+    return g_maxCUSize;
+}
+
+/* Finish one fully filtered CTU row of the reconstructed picture.
+ *
+ * In order:
+ *   1. extend the row's left/right borders (and the top/bottom picture
+ *      borders for the first/last row),
+ *   2. bump m_reconRowCount so other frame encoders may reference the row,
+ *   3. accumulate the enabled quality metrics (PSNR sums, SSIM),
+ *   4. update the decoded picture hash selected by decodedPictureHashSEI
+ *      (1 = MD5, 2 = CRC, 3 = checksum),
+ *   5. increment the frame encoder's completion counter and trigger its
+ *      completion event when the count reaches 2 * m_numRows.
+ *
+ * SSIM bands: the final band is the one of the last CTU row (row ==
+ * m_numRows - 1), and its lower bound is clamped to the picture height so
+ * that no rows below the picture are measured. */
+void FrameFilter::processRowPost(int row)
+{
+    PicYuv *reconPic = m_frame->m_reconPic;
+    const uint32_t numCols = m_frame->m_encData->m_slice->m_sps->numCuInWidth;
+    const uint32_t lineStartCUAddr = row * numCols;
+    const int realH = getCUHeight(row);
+
+    // Border extend Left and Right
+    primitives.extendRowBorder(reconPic->getLumaAddr(lineStartCUAddr), reconPic->m_stride, reconPic->m_picWidth, realH, reconPic->m_lumaMarginX);
+    if (reconPic->m_picCsp != X265_CSP_I400) {
+        primitives.extendRowBorder(reconPic->getCbAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
+        primitives.extendRowBorder(reconPic->getCrAddr(lineStartCUAddr), reconPic->m_strideC, reconPic->m_picWidth >> m_hChromaShift, realH >> m_vChromaShift, reconPic->m_chromaMarginX);
+    }
+
+    // Border extend Top: replicate the first picture line (margins included) upwards
+    if (!row)
+    {
+        const intptr_t stride = reconPic->m_stride;
+        pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX;
+
+        for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
+            memcpy(pixY - (y + 1) * stride, pixY, stride * sizeof(pixel));
+
+        if (reconPic->m_picCsp != X265_CSP_I400) {
+            const intptr_t strideC = reconPic->m_strideC;
+            pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
+            pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX;
+            for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
+            {
+                memcpy(pixU - (y + 1) * strideC, pixU, strideC * sizeof(pixel));
+                memcpy(pixV - (y + 1) * strideC, pixV, strideC * sizeof(pixel));
+            }
+        }
+    }
+
+    // Border extend Bottom: replicate the last line of this row downwards
+    if (row == m_numRows - 1)
+    {
+        const intptr_t stride = reconPic->m_stride;
+        pixel *pixY = reconPic->getLumaAddr(lineStartCUAddr) - reconPic->m_lumaMarginX + (realH - 1) * stride;
+        for (uint32_t y = 0; y < reconPic->m_lumaMarginY; y++)
+            memcpy(pixY + (y + 1) * stride, pixY, stride * sizeof(pixel));
+
+        if (reconPic->m_picCsp != X265_CSP_I400) {
+            const intptr_t strideC = reconPic->m_strideC;
+            pixel *pixU = reconPic->getCbAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
+            pixel *pixV = reconPic->getCrAddr(lineStartCUAddr) - reconPic->m_chromaMarginX + ((realH >> m_vChromaShift) - 1) * strideC;
+            for (uint32_t y = 0; y < reconPic->m_chromaMarginY; y++)
+            {
+                memcpy(pixU + (y + 1) * strideC, pixU, strideC * sizeof(pixel));
+                memcpy(pixV + (y + 1) * strideC, pixV, strideC * sizeof(pixel));
+            }
+        }
+    }
+
+    // Notify other FrameEncoders that this row of reconstructed pixels is available
+    m_frame->m_reconRowCount.incr();
+
+    uint32_t cuAddr = lineStartCUAddr;
+    if (m_param->bEnablePsnr)
+    {
+        PicYuv* fencPic = m_frame->m_fencPic;
+
+        intptr_t stride = reconPic->m_stride;
+        uint32_t width  = reconPic->m_picWidth - m_pad[0];
+        uint32_t height = getCUHeight(row);
+
+        uint64_t ssdY = computeSSD(fencPic->getLumaAddr(cuAddr), reconPic->getLumaAddr(cuAddr), stride, width, height);
+        m_frameEncoder->m_SSDY += ssdY;
+        if (reconPic->m_picCsp != X265_CSP_I400) {
+            height >>= m_vChromaShift;
+            width  >>= m_hChromaShift;
+            stride = reconPic->m_strideC;
+
+            uint64_t ssdU = computeSSD(fencPic->getCbAddr(cuAddr), reconPic->getCbAddr(cuAddr), stride, width, height);
+            uint64_t ssdV = computeSSD(fencPic->getCrAddr(cuAddr), reconPic->getCrAddr(cuAddr), stride, width, height);
+
+            m_frameEncoder->m_SSDU += ssdU;
+            m_frameEncoder->m_SSDV += ssdV;
+        }
+    }
+    if (m_param->bEnableSsim && m_ssimBuf)
+    {
+        pixel *rec = reconPic->m_picOrg[0];
+        pixel *fenc = m_frame->m_fencPic->m_picOrg[0];
+        intptr_t stride1 = reconPic->m_stride;
+        intptr_t stride2 = m_frame->m_fencPic->m_stride;
+        /* the last band belongs to the last CTU row; flagging the row before
+         * it instead would make the final two bands overlap by an extra
+         * block row */
+        uint32_t bEnd = (row == (this->m_numRows - 1));
+        uint32_t bStart = (row == 0);
+        uint32_t minPixY = row * g_maxCUSize - 4 * !bStart;
+        /* never measure below the picture: the bottom CTU row may be only
+         * partially covered by picture lines */
+        uint32_t maxPixY = X265_MIN((row + 1) * g_maxCUSize - 4 * !bEnd, (uint32_t)m_param->sourceHeight);
+        uint32_t ssim_cnt;
+        x265_emms();
+
+        /* SSIM is done for each row in blocks of 4x4 . The First blocks are offset by 2 pixels to the right
+        * to avoid alignment of ssim blocks with DCT blocks. */
+        minPixY += bStart ? 2 : -6;
+        m_frameEncoder->m_ssim += calculateSSIM(rec + 2 + minPixY * stride1, stride1, fenc + 2 + minPixY * stride2, stride2,
+                                                m_param->sourceWidth - 2, maxPixY - minPixY, m_ssimBuf, ssim_cnt);
+        m_frameEncoder->m_ssimCnt += ssim_cnt;
+    }
+    if (m_param->decodedPictureHashSEI == 1)
+    {
+        /* MD5: the per-plane states are (re)initialised on the first row */
+        uint32_t height = getCUHeight(row);
+        uint32_t width = reconPic->m_picWidth;
+        intptr_t stride = reconPic->m_stride;
+
+        if (!row)
+        {
+            for (int i = 0; i < 3; i++)
+                MD5Init(&m_frameEncoder->m_state[i]);
+        }
+
+        updateMD5Plane(m_frameEncoder->m_state[0], reconPic->getLumaAddr(cuAddr), width, height, stride);
+        if (reconPic->m_picCsp != X265_CSP_I400) {
+            width  >>= m_hChromaShift;
+            height >>= m_vChromaShift;
+            stride = reconPic->m_strideC;
+
+            updateMD5Plane(m_frameEncoder->m_state[1], reconPic->getCbAddr(cuAddr), width, height, stride);
+            updateMD5Plane(m_frameEncoder->m_state[2], reconPic->getCrAddr(cuAddr), width, height, stride);
+        }
+    }
+    else if (m_param->decodedPictureHashSEI == 2)
+    {
+        /* CRC: every plane starts from 0xffff on the first row */
+        uint32_t height = getCUHeight(row);
+        uint32_t width = reconPic->m_picWidth;
+        intptr_t stride = reconPic->m_stride;
+        if (!row)
+            m_frameEncoder->m_crc[0] = m_frameEncoder->m_crc[1] = m_frameEncoder->m_crc[2] = 0xffff;
+        updateCRC(reconPic->getLumaAddr(cuAddr), m_frameEncoder->m_crc[0], height, width, stride);
+        if (reconPic->m_picCsp != X265_CSP_I400) {
+            width  >>= m_hChromaShift;
+            height >>= m_vChromaShift;
+            stride = reconPic->m_strideC;
+
+            updateCRC(reconPic->getCbAddr(cuAddr), m_frameEncoder->m_crc[1], height, width, stride);
+            updateCRC(reconPic->getCrAddr(cuAddr), m_frameEncoder->m_crc[2], height, width, stride);
+        }
+    }
+    else if (m_param->decodedPictureHashSEI == 3)
+    {
+        /* checksum: unlike the other hashes, updateChecksum() is given the
+         * plane origin together with the row index and the CTU height */
+        uint32_t width = reconPic->m_picWidth;
+        uint32_t height = getCUHeight(row);
+        intptr_t stride = reconPic->m_stride;
+        uint32_t cuHeight = g_maxCUSize;
+        if (!row)
+            m_frameEncoder->m_checksum[0] = m_frameEncoder->m_checksum[1] = m_frameEncoder->m_checksum[2] = 0;
+        updateChecksum(reconPic->m_picOrg[0], m_frameEncoder->m_checksum[0], height, width, stride, row, cuHeight);
+        if (reconPic->m_picCsp != X265_CSP_I400) {
+            width  >>= m_hChromaShift;
+            height >>= m_vChromaShift;
+            stride = reconPic->m_strideC;
+            cuHeight >>= m_vChromaShift;
+
+            updateChecksum(reconPic->m_picOrg[1], m_frameEncoder->m_checksum[1], height, width, stride, row, cuHeight);
+            updateChecksum(reconPic->m_picOrg[2], m_frameEncoder->m_checksum[2], height, width, stride, row, cuHeight);
+        }
+    }
+
+    /* wake the frame encoder once the completion counter reaches twice the row count */
+    if (ATOMIC_INC(&m_frameEncoder->m_completionCount) == 2 * (int)m_frameEncoder->m_numRows)
+        m_frameEncoder->m_completionEvent.trigger();
+}
+
+/* Sum of squared differences between two width x height pixel areas that
+ * share the same stride.
+ *
+ * When either dimension is not a multiple of 4 the sum is computed pixel by
+ * pixel.  Otherwise the area is consumed in horizontal bands of decreasing
+ * height (64, 32, 16, 8, 4 rows) and each band is covered left to right with
+ * the largest square sse_pp primitives that still fit. */
+static uint64_t computeSSD(pixel *fenc, pixel *rec, intptr_t stride, uint32_t width, uint32_t height)
+{
+    uint64_t total = 0;
+    const bool bMultipleOf4 = !((width | height) & 3);
+
+    if (!bMultipleOf4)
+    {
+        /* Slow Path */
+        for (uint32_t row = 0; row < height; row++, fenc += stride, rec += stride)
+        {
+            for (uint32_t col = 0; col < width; col++)
+            {
+                const int delta = (int)(fenc[col] - rec[col]);
+                total += delta * delta;
+            }
+        }
+
+        return total;
+    }
+
+    uint32_t doneRows = 0;
+
+    /* Consume rows in ever narrower chunks of height */
+    for (int size = BLOCK_64x64; size >= BLOCK_4x4 && doneRows < height; size--)
+    {
+        const uint32_t bandHeight = 1 << (size + 2);
+
+        while (doneRows + bandHeight <= height)
+        {
+            uint32_t col = 0;
+            uint32_t sub;
+
+            /* Consume each row using the largest square blocks possible;
+             * the two largest sizes are skipped when the stride does not
+             * satisfy their alignment test */
+            if (size == BLOCK_64x64 && !(stride & 31))
+            {
+                for (; col + 64 <= width; col += 64)
+                    total += primitives.cu[BLOCK_64x64].sse_pp(fenc + col, stride, rec + col, stride);
+            }
+
+            if (size >= BLOCK_32x32 && !(stride & 15))
+            {
+                for (; col + 32 <= width; col += 32)
+                    for (sub = 0; sub + 32 <= bandHeight; sub += 32)
+                        total += primitives.cu[BLOCK_32x32].sse_pp(fenc + sub * stride + col, stride, rec + sub * stride + col, stride);
+            }
+
+            if (size >= BLOCK_16x16)
+            {
+                for (; col + 16 <= width; col += 16)
+                    for (sub = 0; sub + 16 <= bandHeight; sub += 16)
+                        total += primitives.cu[BLOCK_16x16].sse_pp(fenc + sub * stride + col, stride, rec + sub * stride + col, stride);
+            }
+
+            if (size >= BLOCK_8x8)
+            {
+                for (; col + 8 <= width; col += 8)
+                    for (sub = 0; sub + 8 <= bandHeight; sub += 8)
+                        total += primitives.cu[BLOCK_8x8].sse_pp(fenc + sub * stride + col, stride, rec + sub * stride + col, stride);
+            }
+
+            for (; col + 4 <= width; col += 4)
+                for (sub = 0; sub + 4 <= bandHeight; sub += 4)
+                    total += primitives.cu[BLOCK_4x4].sse_pp(fenc + sub * stride + col, stride, rec + sub * stride + col, stride);
+
+            /* step down to the next band */
+            fenc += stride * bandHeight;
+            rec += stride * bandHeight;
+            doneRows += bandHeight;
+        }
+    }
+
+    return total;
+}
+
+/* Accumulate SSIM over one band of the picture.
+ *
+ *   pix1/stride1, pix2/stride2 - the two luma planes, pointing at the top-left
+ *                                 pixel of the band
+ *   width, height              - band size in pixels
+ *   buf                        - scratch space holding two rows of int[4]
+ *                                 sums, (width / 4 + 3) entries each
+ *   cnt                        - out: (block rows - 1) * (block columns - 1),
+ *                                 the number of SSIM windows the caller adds
+ *                                 to its running count
+ *
+ * The band is processed as rows of 4x4 blocks.  Block row z starts at pixel
+ * row 4 * z, so both the column and the row index are scaled by 4 when the
+ * block address is formed; indexing with z * stride alone would step a
+ * single pixel row per block row and leave most of the band unmeasured.
+ * Returns the sum of the SSIM values of all windows in the band. */
+static float calculateSSIM(pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, uint32_t width, uint32_t height, void *buf, uint32_t& cnt)
+{
+    uint32_t z = 0;
+    float ssim = 0.0;
+
+    int(*sum0)[4] = (int(*)[4])buf;
+    int(*sum1)[4] = sum0 + (width >> 2) + 3;
+    width >>= 2;
+    height >>= 2;
+
+    for (uint32_t y = 1; y < height; y++)
+    {
+        /* make sure the sums of block rows y - 1 and y are available; the
+         * two sum rows are swapped so the older one gets overwritten */
+        for (; z <= y; z++)
+        {
+            std::swap(sum0, sum1);
+            for (uint32_t x = 0; x < width; x += 2)
+                primitives.ssim_4x4x2_core(&pix1[4 * (x + (z * stride1))], stride1, &pix2[4 * (x + (z * stride2))], stride2, &sum0[x]);
+        }
+
+        /* combine the two block rows, up to four windows per call */
+        for (uint32_t x = 0; x < width - 1; x += 4)
+            ssim += primitives.ssim_end_4(sum0 + x, sum1 + x, X265_MIN(4, width - x - 1));
+    }
+
+    cnt = (height - 1) * (width - 1);
+    return ssim;
+}
+
+/* Copy the source samples of one CU back over its reconstruction (used for
+ * lossless CUs after SAO).  Luma is always copied; the two chroma planes are
+ * copied unless the source picture is 4:0:0. */
+static void restoreOrigLosslessYuv(const CUData* cu, Frame& frame, uint32_t absPartIdx)
+{
+    PicYuv* fencPic  = frame.m_fencPic;
+    PicYuv* reconPic = frame.m_reconPic;
+    const uint32_t ctuAddr = cu->m_cuAddr;
+    /* index into the square-block primitive tables: log2 size minus 2 */
+    const int sizeIdx = cu->m_log2CUSize[absPartIdx] - 2;
+
+    pixel* lumaDst = reconPic->getLumaAddr(ctuAddr, absPartIdx);
+    pixel* lumaSrc = fencPic->getLumaAddr(ctuAddr, absPartIdx);
+    primitives.cu[sizeIdx].copy_pp(lumaDst, reconPic->m_stride, lumaSrc, fencPic->m_stride);
+
+    const int csp = fencPic->m_picCsp;
+    if (csp == X265_CSP_I400)
+        return;
+
+    pixel* cbDst = reconPic->getCbAddr(ctuAddr, absPartIdx);
+    pixel* cbSrc = fencPic->getCbAddr(ctuAddr, absPartIdx);
+    pixel* crDst = reconPic->getCrAddr(ctuAddr, absPartIdx);
+    pixel* crSrc = fencPic->getCrAddr(ctuAddr, absPartIdx);
+
+    primitives.chroma[csp].cu[sizeIdx].copy_pp(cbDst, reconPic->m_strideC, cbSrc, fencPic->m_strideC);
+    primitives.chroma[csp].cu[sizeIdx].copy_pp(crDst, reconPic->m_strideC, crSrc, fencPic->m_strideC);
+}
+
+/* Walk the CU quad-tree described by cuGeom and restore the original samples
+ * of every leaf CU that was coded with transquant bypass. */
+static void origCUSampleRestoration(const CUData* cu, const CUGeom& cuGeom, Frame& frame)
+{
+    const uint32_t absPartIdx = cuGeom.absPartIdx;
+    const bool bLeaf = !(cu->m_cuDepth[absPartIdx] > cuGeom.depth);
+
+    if (bLeaf)
+    {
+        // restore original YUV samples
+        if (cu->m_tqBypass[absPartIdx])
+            restoreOrigLosslessYuv(cu, frame, absPartIdx);
+        return;
+    }
+
+    /* the CU is split further: descend into each child that is present */
+    const CUGeom* firstChild = &cuGeom + cuGeom.childOffset;
+    for (int quadrant = 0; quadrant < 4; quadrant++)
+    {
+        const CUGeom& childGeom = *(firstChild + quadrant);
+        if (childGeom.flags & CUGeom::PRESENT)
+            origCUSampleRestoration(cu, childGeom, frame);
+    }
+}
+
+/* Apply SAO to one CTU row: luma when bSaoFlag[0] is set, both chroma planes
+ * when bSaoFlag[1] is set.  If the PPS allows transquant bypass, the CUs of
+ * the row that used it afterwards get their original samples back. */
+void FrameFilter::processSao(int row)
+{
+    FrameData& encData = *m_frame->m_encData;
+    SAOParam* saoParam = encData.m_saoParam;
+
+    if (saoParam->bSaoFlag[0])
+        m_sao.processSaoUnitRow(saoParam->ctuParam[0], row, 0);
+
+    if (saoParam->bSaoFlag[1])
+    {
+        for (int plane = 1; plane <= 2; plane++)
+            m_sao.processSaoUnitRow(saoParam->ctuParam[plane], row, plane);
+    }
+
+    if (!encData.m_slice->m_pps->bTransquantBypassEnabled)
+        return;
+
+    const CUGeom* geoms = m_frameEncoder->m_cuGeoms;
+    const uint32_t* geomMap = m_frameEncoder->m_ctuGeomMap;
+    const uint32_t numCols = encData.m_slice->m_sps->numCuInWidth;
+    const uint32_t firstAddr = row * numCols;
+
+    for (uint32_t col = 0; col < numCols; col++)
+    {
+        const uint32_t ctuAddr = firstAddr + col;
+        const CUData* ctu = encData.getPicCTU(ctuAddr);
+        origCUSampleRestoration(ctu, geoms[geomMap[ctuAddr]], *m_frame);
+    }
+}
--- a/x265/source/encoder/framefilter.h
+++ b/x265/source/encoder/framefilter.h
@ -0,0 +1,74 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Chung Shin Yee <shinyee@multicorewareinc.com>
+ *          Min Chen <chenm003@163.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_FRAMEFILTER_H
+#define X265_FRAMEFILTER_H
+
+#include "common.h"
+#include "frame.h"
+#include "deblock.h"
+#include "sao.h"
+
+namespace X265_NS {
+// private x265 namespace
+
+class Encoder;
+class Entropy;
+class FrameEncoder;
+struct ThreadLocalData;
+
+// Manages the processing of a single frame loopfilter.
+// Derives from Deblock for the deblocking pass and owns the SAO object; one
+// instance lives inside each FrameEncoder (see FrameEncoder::m_frameFilter).
+class FrameFilter : public Deblock
+{
+public:
+
+    x265_param*   m_param;          /* encoder parameters, assigned in init() */
+    Frame*        m_frame;          /* picture currently being filtered, assigned in start() */
+    FrameEncoder* m_frameEncoder;   /* owning frame encoder, assigned in init() */
+    int           m_hChromaShift;   /* chroma subsampling shifts for the internal colour space */
+    int           m_vChromaShift;
+    int           m_pad[2];         /* conformance window right [0] and bottom [1] offsets */
+
+    SAO           m_sao;
+    int           m_numRows;        /* number of CTU rows per picture */
+    int           m_saoRowDelay;    /* 1 when deblocking is enabled, else 0 */
+    int           m_lastHeight;     /* height in luma rows of the bottom CTU row */
+    
+    void*         m_ssimBuf; /* Temp storage for ssim computation */
+
+    FrameFilter();
+
+    /* one-time setup / teardown */
+    void init(Encoder *top, FrameEncoder *frame, int numRows);
+    void destroy();
+
+    /* called per picture before any row is processed */
+    void start(Frame *pic, Entropy& initState, int qp);
+
+    /* deblock + SAO decision for a row; drives processSao() and processRowPost() */
+    void processRow(int row);
+    /* border extension, quality metrics and picture hash for a finished row */
+    void processRowPost(int row);
+    /* apply SAO to a row and restore lossless CUs */
+    void processSao(int row);
+    /* m_lastHeight for the bottom row, g_maxCUSize otherwise */
+    uint32_t getCUHeight(int rowNum) const;
+};
+}
+
+#endif // ifndef X265_FRAMEFILTER_H
--- a/x265/source/encoder/level.cpp
+++ b/x265/source/encoder/level.cpp
@ -0,0 +1,496 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "slice.h"
+#include "level.h"
+
+namespace X265_NS {
+/* One row of the decoder level limits table (see levels[]). The field order is
+ * significant because the table rows are written as positional aggregate
+ * initializers. A High tier limit of MAX_UINT marks a level without High tier. */
+struct LevelSpec
+{
+    uint32_t maxLumaSamples;          /* largest picture size, in luma samples */
+    uint32_t maxLumaSamplesPerSecond; /* largest luma sample rate */
+    uint32_t maxBitrateMain;          /* Main tier bitrate limit, compared against rc.bitrate / rc.vbvMaxBitrate */
+    uint32_t maxBitrateHigh;          /* High tier bitrate limit, or MAX_UINT */
+    uint32_t maxCpbSizeMain;          /* Main tier limit, compared against rc.vbvBufferSize */
+    uint32_t maxCpbSizeHigh;          /* High tier limit, or MAX_UINT */
+    uint32_t minCompressionRatio;     /* copied to vps.ptl.minCrForLevel */
+    Level::Name levelEnum;            /* value stored in vps.ptl.levelIdc */
+    const char* name;                 /* printable level name, e.g. "5.1" */
+    int levelIdc;                     /* numeric form matched against param.levelIdc, e.g. 51 */
+};
+
+/* Level limits table, ordered from the lowest to the highest level.
+ * Columns follow the LevelSpec field order:
+ *   maxLumaSamples, maxLumaSamplesPerSecond, maxBitrateMain, maxBitrateHigh,
+ *   maxCpbSizeMain, maxCpbSizeHigh, minCompressionRatio, levelEnum, name, levelIdc
+ * MAX_UINT in the High tier columns means the level has no High tier.
+ * The last row (8.5) is unconstrained in every column; determineLevel() selects
+ * it for lossless encodes. */
+LevelSpec levels[] =
+{
+    { 36864,    552960,     128,      MAX_UINT, 350,    MAX_UINT, 2, Level::LEVEL1,   "1",   10 },
+    { 122880,   3686400,    1500,     MAX_UINT, 1500,   MAX_UINT, 2, Level::LEVEL2,   "2",   20 },
+    { 245760,   7372800,    3000,     MAX_UINT, 3000,   MAX_UINT, 2, Level::LEVEL2_1, "2.1", 21 },
+    { 552960,   16588800,   6000,     MAX_UINT, 6000,   MAX_UINT, 2, Level::LEVEL3,   "3",   30 },
+    { 983040,   33177600,   10000,    MAX_UINT, 10000,  MAX_UINT, 2, Level::LEVEL3_1, "3.1", 31 },
+    { 2228224,  66846720,   12000,    30000,    12000,  30000,    4, Level::LEVEL4,   "4",   40 },
+    { 2228224,  133693440,  20000,    50000,    20000,  50000,    4, Level::LEVEL4_1, "4.1", 41 },
+    { 8912896,  267386880,  25000,    100000,   25000,  100000,   6, Level::LEVEL5,   "5",   50 },
+    { 8912896,  534773760,  40000,    160000,   40000,  160000,   8, Level::LEVEL5_1, "5.1", 51 },
+    { 8912896,  1069547520, 60000,    240000,   60000,  240000,   8, Level::LEVEL5_2, "5.2", 52 },
+    { 35651584, 1069547520, 60000,    240000,   60000,  240000,   8, Level::LEVEL6,   "6",   60 },
+    { 35651584, 2139095040, 120000,   480000,   120000, 480000,   8, Level::LEVEL6_1, "6.1", 61 },
+    { 35651584, 4278190080U, 240000,  800000,   240000, 800000,   6, Level::LEVEL6_2, "6.2", 62 },
+    { MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, MAX_UINT, 1, Level::LEVEL8_5, "8.5", 85 },
+};
+
+/* determine minimum decoder level required to decode the described video
+ *
+ * Fills in vps.ptl (constraint flags, profileIdc, profile compatibility flags,
+ * levelIdc, tierFlag, minCrForLevel, maxLumaSrForLevel) and
+ * vps.maxTempSubLayers from the encoder parameters, then logs the detected
+ * profile, level and tier.
+ *
+ * Reads vps.maxDecPicBuffering and vps.numReorderPics, so both must have been
+ * set by the caller beforehand (enforceLevel() computes them).
+ *
+ * When the first level that fits requires a CTU size or a total reference
+ * count that the parameters violate, profile and level are reset to NONE, a
+ * warning is logged and the function returns early. */
+void determineLevel(const x265_param &param, VPS& vps)
+{
+    vps.ptl.onePictureOnlyConstraintFlag = param.totalFrames == 1;
+    vps.ptl.intraConstraintFlag = param.keyframeMax <= 1 || vps.ptl.onePictureOnlyConstraintFlag;
+    vps.ptl.bitDepthConstraint = param.internalBitDepth;
+    vps.ptl.chromaFormatConstraint = param.internalCsp;
+
+    /* TODO: figure out HighThroughput signaling, aka: HbrFactor in section A.4.2, only available
+     * for intra-only profiles (vps.ptl.intraConstraintFlag) */
+    vps.ptl.lowerBitRateConstraintFlag = true;
+
+    vps.maxTempSubLayers = param.bEnableTemporalSubLayers ? 2 : 1;
+
+    if (param.internalCsp == X265_CSP_I420 && param.internalBitDepth <= 10)
+    {
+        /* Probably an HEVC v1 profile, but must check to be sure */
+        if (param.internalBitDepth <= 8)
+        {
+            if (vps.ptl.onePictureOnlyConstraintFlag)
+                vps.ptl.profileIdc = Profile::MAINSTILLPICTURE;
+            else if (vps.ptl.intraConstraintFlag)
+                vps.ptl.profileIdc = Profile::MAINREXT; /* Main Intra */
+            else
+                vps.ptl.profileIdc = Profile::MAIN;
+        }
+        else if (param.internalBitDepth <= 10)
+        {
+            /* note there is no 10bit still picture profile */
+            if (vps.ptl.intraConstraintFlag)
+                vps.ptl.profileIdc = Profile::MAINREXT; /* Main10 Intra */
+            else
+                vps.ptl.profileIdc = Profile::MAIN10;
+        }
+    }
+    else
+        vps.ptl.profileIdc = Profile::MAINREXT;
+
+    /* determine which profiles are compatible with this stream */
+
+    memset(vps.ptl.profileCompatibilityFlag, 0, sizeof(vps.ptl.profileCompatibilityFlag));
+    vps.ptl.profileCompatibilityFlag[vps.ptl.profileIdc] = true;
+    if (vps.ptl.profileIdc == Profile::MAIN10 && param.internalBitDepth == 8)
+        vps.ptl.profileCompatibilityFlag[Profile::MAIN] = true;
+    else if (vps.ptl.profileIdc == Profile::MAIN)
+        vps.ptl.profileCompatibilityFlag[Profile::MAIN10] = true;
+    else if (vps.ptl.profileIdc == Profile::MAINSTILLPICTURE)
+    {
+        vps.ptl.profileCompatibilityFlag[Profile::MAIN] = true;
+        vps.ptl.profileCompatibilityFlag[Profile::MAIN10] = true;
+    }
+    else if (vps.ptl.profileIdc == Profile::MAINREXT)
+        vps.ptl.profileCompatibilityFlag[Profile::MAINREXT] = true;
+
+    uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
+    uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
+    uint32_t bitrate = param.rc.vbvMaxBitrate ? param.rc.vbvMaxBitrate : param.rc.bitrate;
+
+    const uint32_t MaxDpbPicBuf = 6;
+    vps.ptl.levelIdc = Level::NONE;
+    vps.ptl.tierFlag = Level::MAIN;
+
+    const size_t NumLevels = sizeof(levels) / sizeof(levels[0]);
+    uint32_t i;
+    if (param.bLossless)
+    {
+        /* lossless encodes use the unconstrained level 8.5 entry, which is the
+         * last row of the table; derive the index instead of hard-coding it */
+        i = (uint32_t)(NumLevels - 1);
+        vps.ptl.minCrForLevel = 1;
+        vps.ptl.maxLumaSrForLevel = MAX_UINT;
+        vps.ptl.levelIdc = Level::LEVEL8_5;
+        vps.ptl.tierFlag = Level::MAIN;
+    }
+    else for (i = 0; i < NumLevels; i++)
+    {
+        /* skip every level whose limits are exceeded by the stream */
+        if (lumaSamples > levels[i].maxLumaSamples)
+            continue;
+        else if (samplesPerSec > levels[i].maxLumaSamplesPerSecond)
+            continue;
+        else if (bitrate > levels[i].maxBitrateMain && levels[i].maxBitrateHigh == MAX_UINT)
+            continue;
+        else if (bitrate > levels[i].maxBitrateHigh)
+            continue;
+        else if (param.sourceWidth > sqrt(levels[i].maxLumaSamples * 8.0f))
+            continue;
+        else if (param.sourceHeight > sqrt(levels[i].maxLumaSamples * 8.0f))
+            continue;
+
+        /* the DPB capacity grows as the picture gets smaller relative to the
+         * level's maximum picture size */
+        uint32_t maxDpbSize = MaxDpbPicBuf;
+        if (lumaSamples <= (levels[i].maxLumaSamples >> 2))
+            maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
+        else if (lumaSamples <= (levels[i].maxLumaSamples >> 1))
+            maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
+        else if (lumaSamples <= ((3 * levels[i].maxLumaSamples) >> 2))
+            maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
+
+        /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than
+         * or equal to MaxDpbSize */
+        if (vps.maxDecPicBuffering > maxDpbSize)
+            continue;
+
+        /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
+        if (levels[i].levelEnum >= Level::LEVEL5 && param.maxCUSize < 32)
+        {
+            x265_log(&param, X265_LOG_WARNING, "level %s detected, but CTU size 16 is non-compliant\n", levels[i].name);
+            vps.ptl.profileIdc = Profile::NONE;
+            vps.ptl.levelIdc = Level::NONE;
+            vps.ptl.tierFlag = Level::MAIN;
+            x265_log(&param, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
+            return;
+        }
+
+        /* The value of NumPocTotalCurr shall be less than or equal to 8 */
+        int numPocTotalCurr = param.maxNumReferences + vps.numReorderPics;
+        if (numPocTotalCurr > 8)
+        {
+            x265_log(&param, X265_LOG_WARNING, "level %s detected, but NumPocTotalCurr (total references) is non-compliant\n", levels[i].name);
+            vps.ptl.profileIdc = Profile::NONE;
+            vps.ptl.levelIdc = Level::NONE;
+            vps.ptl.tierFlag = Level::MAIN;
+            x265_log(&param, X265_LOG_INFO, "NONE profile, Level-NONE (Main tier)\n");
+            return;
+        }
+
+#define CHECK_RANGE(value, main, high) (high != MAX_UINT && value > main && value <= high)
+
+        if (CHECK_RANGE(bitrate, levels[i].maxBitrateMain, levels[i].maxBitrateHigh) ||
+            CHECK_RANGE((uint32_t)param.rc.vbvBufferSize, levels[i].maxCpbSizeMain, levels[i].maxCpbSizeHigh))
+        {
+            /* The bitrate or buffer size are out of range for Main tier, but in
+             * range for High tier. If the user requested High tier then give
+             * them High tier at this level.  Otherwise allow the loop to
+             * progress to the Main tier of the next level */
+            if (param.bHighTier)
+                vps.ptl.tierFlag = Level::HIGH;
+            else
+                continue;
+        }
+        else
+            vps.ptl.tierFlag = Level::MAIN;
+#undef CHECK_RANGE
+
+        vps.ptl.levelIdc = levels[i].levelEnum;
+        vps.ptl.minCrForLevel = levels[i].minCompressionRatio;
+        vps.ptl.maxLumaSrForLevel = levels[i].maxLumaSamplesPerSecond;
+        break;
+    }
+
+    static const char *profiles[] = { "None", "Main", "Main 10", "Main Still Picture", "RExt" };
+    static const char *tiers[]    = { "Main", "High" };
+
+    /* start from the generic profile name, then refine it for range extensions */
+    char profbuf[64];
+    strcpy(profbuf, profiles[vps.ptl.profileIdc]);
+
+    bool bStillPicture = false;
+    if (vps.ptl.profileIdc == Profile::MAINREXT)
+    {
+        if (vps.ptl.bitDepthConstraint > 12 && vps.ptl.intraConstraintFlag)
+        {
+            if (vps.ptl.onePictureOnlyConstraintFlag)
+            {
+                strcpy(profbuf, "Main 4:4:4 16 Still Picture");
+                bStillPicture = true;
+            }
+            else
+                strcpy(profbuf, "Main 4:4:4 16");
+        }
+        else if (param.internalCsp == X265_CSP_I420)
+        {
+            X265_CHECK(vps.ptl.intraConstraintFlag || vps.ptl.bitDepthConstraint > 10, "rext fail\n");
+            if (vps.ptl.bitDepthConstraint <= 8)
+                strcpy(profbuf, "Main");
+            else if (vps.ptl.bitDepthConstraint <= 10)
+                strcpy(profbuf, "Main 10");
+            else if (vps.ptl.bitDepthConstraint <= 12)
+                strcpy(profbuf, "Main 12");
+        }
+        else if (param.internalCsp == X265_CSP_I422)
+        {
+            /* there is no Main 4:2:2 profile, so it must be signaled as Main10 4:2:2 */
+            if (param.internalBitDepth <= 10)
+                strcpy(profbuf, "Main 4:2:2 10");
+            else if (vps.ptl.bitDepthConstraint <= 12)
+                strcpy(profbuf, "Main 4:2:2 12");
+        }
+        else if (param.internalCsp == X265_CSP_I444)
+        {
+            if (vps.ptl.bitDepthConstraint <= 8)
+            {
+                if (vps.ptl.onePictureOnlyConstraintFlag)
+                {
+                    strcpy(profbuf, "Main 4:4:4 Still Picture");
+                    bStillPicture = true;
+                }
+                else
+                    strcpy(profbuf, "Main 4:4:4");
+            }
+            else if (vps.ptl.bitDepthConstraint <= 10)
+                strcpy(profbuf, "Main 4:4:4 10");
+            else if (vps.ptl.bitDepthConstraint <= 12)
+                strcpy(profbuf, "Main 4:4:4 12");
+        }
+        else
+            strcpy(profbuf, "Unknown");
+
+        /* still picture names already imply intra coding */
+        if (vps.ptl.intraConstraintFlag && !bStillPicture)
+            strcat(profbuf, " Intra");
+    }
+
+    /* if no table row was accepted the loop leaves i == NumLevels and levelIdc
+     * is still Level::NONE; never index past the end of the table */
+    const char *levelName = i < NumLevels ? levels[i].name : "NONE";
+    x265_log(&param, X265_LOG_INFO, "%s profile, Level-%s (%s tier)\n",
+             profbuf, levelName, tiers[vps.ptl.tierFlag]);
+}
+
+/* enforce a maximum decoder level requirement, in other words assure that a
+ * decoder of the specified level may decode the video about to be created.
+ * Lower parameters where necessary to ensure the video will be decodable by a
+ * decoder meeting this level of requirement.  Some parameters (resolution and
+ * frame rate) are non-negotiable and thus this function may fail. In those
+ * circumstances it will be quite noisy
+ *
+ * vps.numReorderPics and vps.maxDecPicBuffering are always (re)computed.
+ * Returns true when no level was requested (param.levelIdc <= 0) or when the
+ * parameters fit, possibly after being lowered. Returns false when the level
+ * does not exist, the picture size or frame rate exceed it, or the rate
+ * control mode is constant QP or unknown. */
+bool enforceLevel(x265_param& param, VPS& vps)
+{
+    vps.numReorderPics = (param.bBPyramid && param.bframes > 1) ? 2 : !!param.bframes;
+    vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 2, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
+
+    /* no level specified by user, just auto-detect from the configuration */
+    if (param.levelIdc <= 0)
+        return true;
+
+    /* linear search of the table; stops on the last row if nothing matches */
+    uint32_t level = 0;
+    while (levels[level].levelIdc != param.levelIdc && level + 1 < sizeof(levels) / sizeof(levels[0]))
+        level++;
+    if (levels[level].levelIdc != param.levelIdc)
+    {
+        x265_log(&param, X265_LOG_WARNING, "specified level %d does not exist\n", param.levelIdc);
+        return false;
+    }
+
+    LevelSpec& l = levels[level];
+    bool highTier = !!param.bHighTier;
+    if (highTier && l.maxBitrateHigh == MAX_UINT)
+    {
+        highTier = false;
+        x265_log(&param, X265_LOG_WARNING, "Level %s has no High tier, using Main tier\n", l.name);
+    }
+
+    /* limits of the tier that is actually in effect */
+    const uint32_t maxBitrate = highTier ? l.maxBitrateHigh : l.maxBitrateMain;
+    const uint32_t maxCpbSize = highTier ? l.maxCpbSizeHigh : l.maxCpbSizeMain;
+
+    uint32_t lumaSamples = param.sourceWidth * param.sourceHeight;
+    uint32_t samplesPerSec = (uint32_t)(lumaSamples * ((double)param.fpsNum / param.fpsDenom));
+    bool ok = true;
+    if (lumaSamples > l.maxLumaSamples)
+        ok = false;
+    else if (param.sourceWidth > sqrt(l.maxLumaSamples * 8.0f))
+        ok = false;
+    else if (param.sourceHeight > sqrt(l.maxLumaSamples * 8.0f))
+        ok = false;
+    if (!ok)
+    {
+        x265_log(&param, X265_LOG_WARNING, "picture dimensions are out of range for specified level\n");
+        return false;
+    }
+    else if (samplesPerSec > l.maxLumaSamplesPerSecond)
+    {
+        x265_log(&param, X265_LOG_WARNING, "frame rate is out of range for specified level\n");
+        return false;
+    }
+
+    if ((uint32_t)param.rc.vbvMaxBitrate > maxBitrate)
+    {
+        param.rc.vbvMaxBitrate = maxBitrate;
+        x265_log(&param, X265_LOG_INFO, "lowering VBV max bitrate to %dKbps\n", param.rc.vbvMaxBitrate);
+    }
+    if ((uint32_t)param.rc.vbvBufferSize > maxCpbSize)
+    {
+        param.rc.vbvBufferSize = maxCpbSize;
+        x265_log(&param, X265_LOG_INFO, "lowering VBV buffer size to %dKb\n", param.rc.vbvBufferSize);
+    }
+
+    switch (param.rc.rateControlMode)
+    {
+    case X265_RC_ABR:
+        if ((uint32_t)param.rc.bitrate > maxBitrate)
+        {
+            /* clamp to the limit of the tier in effect; always using the High
+             * tier limit here could raise the bitrate, or store MAX_UINT for a
+             * level that has no High tier */
+            param.rc.bitrate = maxBitrate;
+            x265_log(&param, X265_LOG_INFO, "lowering target bitrate to %s tier limit of %dKbps\n",
+                     highTier ? "High" : "Main", param.rc.bitrate);
+        }
+        break;
+
+    case X265_RC_CQP:
+        x265_log(&param, X265_LOG_WARNING, "Constant QP is inconsistent with specifying a decoder level, no bitrate guarantee is possible.\n");
+        return false;
+
+    case X265_RC_CRF:
+        /* CRF alone gives no bitrate bound, so VBV is switched on at the level limits */
+        if (!param.rc.vbvBufferSize || !param.rc.vbvMaxBitrate)
+        {
+            if (!param.rc.vbvMaxBitrate)
+                param.rc.vbvMaxBitrate = maxBitrate;
+            if (!param.rc.vbvBufferSize)
+                param.rc.vbvBufferSize = maxCpbSize;
+            x265_log(&param, X265_LOG_WARNING, "Specifying a decoder level with constant rate factor rate-control requires\n");
+            x265_log(&param, X265_LOG_WARNING, "enabling VBV with vbv-bufsize=%dkb vbv-maxrate=%dkbps. VBV outputs are non-deterministic!\n",
+                     param.rc.vbvBufferSize, param.rc.vbvMaxBitrate);
+        }
+        break;
+
+    default:
+        x265_log(&param, X265_LOG_ERROR, "Unknown rate control mode is inconsistent with specifying a decoder level\n");
+        return false;
+    }
+
+    /* The value of sps_max_dec_pic_buffering_minus1[ HighestTid ] + 1 shall be less than or equal to MaxDpbSize */
+    const uint32_t MaxDpbPicBuf = 6;
+    uint32_t maxDpbSize = MaxDpbPicBuf;
+    if (lumaSamples <= (l.maxLumaSamples >> 2))
+        maxDpbSize = X265_MIN(4 * MaxDpbPicBuf, 16);
+    else if (lumaSamples <= (l.maxLumaSamples >> 1))
+        maxDpbSize = X265_MIN(2 * MaxDpbPicBuf, 16);
+    else if (lumaSamples <= ((3 * l.maxLumaSamples) >> 2))
+        maxDpbSize = X265_MIN((4 * MaxDpbPicBuf) / 3, 16);
+
+    /* drop references one at a time until the DPB requirement fits */
+    int savedRefCount = param.maxNumReferences;
+    while (vps.maxDecPicBuffering > maxDpbSize && param.maxNumReferences > 1)
+    {
+        param.maxNumReferences--;
+        /* NOTE(review): the initial computation at the top of this function uses
+         * numReorderPics + 2 while this recomputation uses + 1 - confirm which is intended */
+        vps.maxDecPicBuffering = X265_MIN(MAX_NUM_REF, X265_MAX(vps.numReorderPics + 1, (uint32_t)param.maxNumReferences) + vps.numReorderPics);
+    }
+    if (param.maxNumReferences != savedRefCount)
+        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet level requirement\n", param.maxNumReferences);
+
+    /* For level 5 and higher levels, the value of CtbSizeY shall be equal to 32 or 64 */
+    if (param.levelIdc >= 50 && param.maxCUSize < 32)
+    {
+        param.maxCUSize = 32;
+        x265_log(&param, X265_LOG_INFO, "Levels 5.0 and above require a maximum CTU size of at least 32, using --ctu 32\n");
+    }
+
+    /* The value of NumPocTotalCurr shall be less than or equal to 8 */
+    int numPocTotalCurr = param.maxNumReferences + !!param.bframes;
+    if (numPocTotalCurr > 8)
+    {
+        param.maxNumReferences = 8 - !!param.bframes;
+        x265_log(&param, X265_LOG_INFO, "Lowering max references to %d to meet numPocTotalCurr requirement\n", param.maxNumReferences);
+    }
+
+    return true;
+}
+}
+
+#if EXPORT_C_API
+
+/* these functions are exported as C functions (default) */
+using namespace X265_NS;
+extern "C" {
+
+#else
+
+/* these functions exist within private namespace (multilib) */
+namespace X265_NS {
+
+#endif
+
+/* Check that 'profile' names a known profile that is compatible with the
+ * compiled bit depth (X265_DEPTH) and with param->internalCsp, and force
+ * intra-only coding (keyframeMax = 1) for profiles whose name ends in "-intra"
+ * and for "mainstillpicture" / "msp".
+ *
+ * Returns 0 on success, and also when param or profile is NULL (nothing is
+ * done). Returns -1 after logging an error when the profile is unknown or
+ * incompatible; in that case param is left unmodified. */
+int x265_param_apply_profile(x265_param *param, const char *profile)
+{
+    if (!param || !profile)
+        return 0;
+
+    /* Check if profile bit-depth requirement is exceeded by internal bit depth */
+    bool bInvalidDepth = false;
+#if X265_DEPTH > 8
+    if (!strcmp(profile, "main") || !strcmp(profile, "mainstillpicture") || !strcmp(profile, "msp") ||
+        !strcmp(profile, "main444-8") || !strcmp(profile, "main-intra") ||
+        !strcmp(profile, "main444-intra") || !strcmp(profile, "main444-stillpicture"))
+        bInvalidDepth = true;
+#endif
+#if X265_DEPTH > 10
+    if (!strcmp(profile, "main10") || !strcmp(profile, "main422-10") || !strcmp(profile, "main444-10") ||
+        !strcmp(profile, "main10-intra") || !strcmp(profile, "main422-10-intra") || !strcmp(profile, "main444-10-intra"))
+        bInvalidDepth = true;
+#endif
+#if X265_DEPTH > 12
+    if (!strcmp(profile, "main12") || !strcmp(profile, "main422-12") || !strcmp(profile, "main444-12") ||
+        !strcmp(profile, "main12-intra") || !strcmp(profile, "main422-12-intra") || !strcmp(profile, "main444-12-intra"))
+        bInvalidDepth = true;
+#endif
+
+    if (bInvalidDepth)
+    {
+        x265_log(param, X265_LOG_ERROR, "%s profile not supported, internal bit depth %d.\n", profile, X265_DEPTH);
+        return -1;
+    }
+
+    /* check that input color space is supported by profile */
+    if (!strcmp(profile, "main") || !strcmp(profile, "main-intra") ||
+        !strcmp(profile, "main10") || !strcmp(profile, "main10-intra") ||
+        !strcmp(profile, "main12") || !strcmp(profile, "main12-intra") ||
+        !strcmp(profile, "mainstillpicture") || !strcmp(profile, "msp"))
+    {
+        if (param->internalCsp != X265_CSP_I420)
+        {
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+                     profile, x265_source_csp_names[param->internalCsp]);
+            return -1;
+        }
+    }
+    else if (!strcmp(profile, "main422-10") || !strcmp(profile, "main422-10-intra") ||
+             !strcmp(profile, "main422-12") || !strcmp(profile, "main422-12-intra"))
+    {
+        if (param->internalCsp != X265_CSP_I420 && param->internalCsp != X265_CSP_I422)
+        {
+            x265_log(param, X265_LOG_ERROR, "%s profile not compatible with %s input color space.\n",
+                     profile, x265_source_csp_names[param->internalCsp]);
+            return -1;
+        }
+    }
+    else if (!strcmp(profile, "main444-8") ||
+             !strcmp(profile, "main444-intra") || !strcmp(profile, "main444-stillpicture") ||
+             !strcmp(profile, "main444-10") || !strcmp(profile, "main444-10-intra") ||
+             !strcmp(profile, "main444-12") || !strcmp(profile, "main444-12-intra") ||
+             !strcmp(profile, "main444-16-intra") || !strcmp(profile, "main444-16-stillpicture"))
+    {
+        /* any color space allowed */
+    }
+    else
+    {
+        x265_log(param, X265_LOG_ERROR, "unknown profile <%s>\n", profile);
+        return -1;
+    }
+
+    /* Only touch param once the profile is known to be valid, so that a
+     * rejected profile leaves the caller's parameters unchanged */
+    size_t l = strlen(profile);
+    bool bBoolIntra = (l > 6 && !strcmp(profile + l - 6, "-intra")) ||
+                      !strcmp(profile, "mainstillpicture") || !strcmp(profile, "msp");
+    if (bBoolIntra)
+    {
+        /* The profile may be detected as still picture if param->totalFrames is 1 */
+        param->keyframeMax = 1;
+    }
+
+    return 0;
+}
+}
--- a/x265/source/encoder/level.h
+++ b/x265/source/encoder/level.h
@ -0,0 +1,39 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_LEVEL_H
+#define X265_LEVEL_H 1
+
+#include "common.h"
+#include "x265.h"
+
+namespace X265_NS {
+// encoder private namespace
+
+struct VPS;
+/* Determine the minimum decoder level required to decode the described video
+ * and store the detected profile, tier and level in vps.ptl. */
+void determineLevel(const x265_param &param, VPS& vps);
+/* Enforce the decoder level requested in param.levelIdc, lowering parameters
+ * where necessary. Returns false when the requirement cannot be met, true
+ * otherwise (including when no level was requested). */
+bool enforceLevel(x265_param& param, VPS& vps);
+
+}
+
+#endif // ifndef X265_LEVEL_H
--- a/x265/source/encoder/motion.cpp
+++ b/x265/source/encoder/motion.cpp
--- a/x265/source/encoder/motion.h
+++ b/x265/source/encoder/motion.h
@ -0,0 +1,110 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_MOTIONESTIMATE_H
+#define X265_MOTIONESTIMATE_H
+
+#include "primitives.h"
+#include "reference.h"
+#include "mv.h"
+#include "bitcost.h"
+#include "yuv.h"
+
+namespace X265_NS {
+// private x265 namespace
+
+/* Motion estimation context, derived from BitCost. It keeps a cached copy of
+ * the source (fenc) PU pixels in fencPUYuv, which the buf*() helpers and
+ * motionEstimate() compare against reference pixels. Only the inline helpers
+ * are defined in this header. */
+class MotionEstimate : public BitCost
+{
+protected:
+
+    intptr_t blockOffset;
+    
+    int ctuAddr;
+    int absPartIdx;  // part index of PU, including CU offset within CTU
+
+    int searchMethod;
+    int subpelRefine;
+
+    int blockwidth;
+    int blockheight;
+
+    /* pixel comparison primitives used by the buf*() helpers below */
+    pixelcmp_t sad;
+    pixelcmp_x3_t sad_x3;
+    pixelcmp_x4_t sad_x4;
+    pixelcmp_t satd;
+    pixelcmp_t chromaSatd;
+
+    // NOTE(review): declared protected with no definition visible here,
+    // presumably to forbid assignment - confirm
+    MotionEstimate& operator =(const MotionEstimate&);
+
+public:
+
+    static const int COST_MAX = 1 << 28;
+
+    Yuv fencPUYuv;      // cached source PU pixels read by the buf*() helpers
+    int partEnum;
+    bool bChromaSATD;
+
+    MotionEstimate();
+    ~MotionEstimate();
+
+    static void initScales();
+    static int hpelIterationCount(int subme);
+    void init(int method, int refine, int csp);
+
+    /* Methods called at slice setup */
+
+    void setSourcePU(pixel *fencY, intptr_t stride, intptr_t offset, int pwidth, int pheight);
+    void setSourcePU(const Yuv& srcFencYuv, int ctuAddr, int cuPartIdx, int puPartIdx, int pwidth, int pheight);
+
+    /* buf*() and motionEstimate() methods all use cached fenc pixels and thus
+     * require setSourcePU() to be called prior. */
+
+    /* sad() of plane 0 of the cached fenc PU (stride FENC_STRIDE) against fref */
+    inline int bufSAD(const pixel* fref, intptr_t stride)  { return sad(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
+
+    /* satd() of plane 0 of the cached fenc PU (stride FENC_STRIDE) against fref */
+    inline int bufSATD(const pixel* fref, intptr_t stride) { return satd(fencPUYuv.m_buf[0], FENC_STRIDE, fref, stride); }
+
+    /* sum of chromaSatd() over the Cb and Cr blocks of refYuv at puPartIdx,
+     * each compared against planes 1 and 2 of the cached fenc PU */
+    inline int bufChromaSATD(const Yuv& refYuv, int puPartIdx)
+    {
+        return chromaSatd(refYuv.getCbAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[1], fencPUYuv.m_csize) +
+               chromaSatd(refYuv.getCrAddr(puPartIdx), refYuv.m_csize, fencPUYuv.m_buf[2], fencPUYuv.m_csize);
+    }
+
+    // NOTE(review): presumably returns the cost of the best vector written to outQMv - confirm
+    int motionEstimate(ReferencePlanes* ref, const MV & mvmin, const MV & mvmax, const MV & qmvp, int numCandidates, const MV * mvc, int merange, MV & outQMv);
+
+    int subpelCompare(ReferencePlanes* ref, const MV &qmv, pixelcmp_t);
+
+protected:
+
+    /* bmv, bcost, bPointNr and bDistance are passed by non-const reference */
+    inline void StarPatternSearch(ReferencePlanes *ref,
+                                  const MV &       mvmin,
+                                  const MV &       mvmax,
+                                  MV &             bmv,
+                                  int &            bcost,
+                                  int &            bPointNr,
+                                  int &            bDistance,
+                                  int              earlyExitIters,
+                                  int              merange);
+};
+}
+
+#endif // ifndef X265_MOTIONESTIMATE_H
--- a/x265/source/encoder/nal.cpp
+++ b/x265/source/encoder/nal.cpp
@ -0,0 +1,232 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "common.h"
+#include "bitstream.h"
+#include "nal.h"
+
+using namespace X265_NS;
+
+/* Start with an empty NAL list and no buffers allocated; m_annexB defaults to
+ * true, which makes serialize() emit start code prefixes */
+NALList::NALList() :
+    m_numNal(0),
+    m_buffer(NULL),
+    m_occupancy(0),
+    m_allocSize(0),
+    m_extraBuffer(NULL),
+    m_extraOccupancy(0),
+    m_extraAllocSize(0),
+    m_annexB(true)
+{
+}
+
+/* Move the serialized NAL units of 'other' into this list.
+ *
+ * This list's previous buffer is freed and replaced by other's buffer, and the
+ * NAL descriptors are copied. 'other' is left empty with a freshly allocated
+ * buffer of the same size. If that allocation fails, other's recorded
+ * allocation size is reset to zero so its next serialize() call allocates a
+ * buffer again instead of writing through a NULL pointer. */
+void NALList::takeContents(NALList& other)
+{
+    /* take other NAL buffer, discard our old one */
+    X265_FREE(m_buffer);
+    m_buffer = other.m_buffer;
+    m_allocSize = other.m_allocSize;
+    m_occupancy = other.m_occupancy;
+
+    /* copy packet data */
+    m_numNal = other.m_numNal;
+    memcpy(m_nal, other.m_nal, sizeof(x265_nal) * m_numNal);
+
+    /* reset other list, re-allocate their buffer with same size */
+    other.m_numNal = 0;
+    other.m_occupancy = 0;
+    other.m_buffer = X265_MALLOC(uint8_t, m_allocSize);
+    if (!other.m_buffer)
+    {
+        /* keep the size consistent with the missing buffer */
+        other.m_allocSize = 0;
+    }
+}
+
+/* Append one NAL unit of type nalUnitType, built from the bytes written to
+ * 'bs', to the access unit buffer and record it in m_nal[].
+ *
+ * The unit is prefixed with a start code when m_annexB is set, otherwise with
+ * a 4 byte big-endian length. Emulation prevention bytes are inserted into the
+ * payload, and any bytes pending in m_extraBuffer (already escaped by
+ * serializeSubstreams) are appended after it and then marked consumed.
+ *
+ * Nothing is written when the bitstream has no buffer, when the NAL list is
+ * full, or when the output buffer cannot be grown. */
+void NALList::serialize(NalUnitType nalUnitType, const Bitstream& bs)
+{
+    static const char startCodePrefix[] = { 0, 0, 0, 1 };
+
+    uint32_t payloadSize = bs.getNumberOfWrittenBytes();
+    const uint8_t* bpayload = bs.getFIFO();
+    if (!bpayload)
+        return;
+
+    /* m_nal[] has a fixed capacity; refuse the unit up front rather than
+     * writing past the end of the array in builds where X265_CHECK is inactive */
+    X265_CHECK(m_numNal < (uint32_t)MAX_NAL_UNITS, "NAL count overflow\n");
+    if (m_numNal >= (uint32_t)MAX_NAL_UNITS)
+    {
+        x265_log(NULL, X265_LOG_ERROR, "NAL count overflow\n");
+        return;
+    }
+
+    /* worst case: prefix, 2 header bytes, payload plus 50% for escape bytes,
+     * pending substream bytes, and 1 byte for the optional trailing 0x03 */
+    uint32_t nextSize = m_occupancy + sizeof(startCodePrefix) + 2 + payloadSize + (payloadSize >> 1) + m_extraOccupancy + 1;
+    if (nextSize > m_allocSize)
+    {
+        uint8_t *temp = X265_MALLOC(uint8_t, nextSize);
+        if (temp)
+        {
+            memcpy(temp, m_buffer, m_occupancy);
+
+            /* fixup existing payload pointers */
+            for (uint32_t i = 0; i < m_numNal; i++)
+                m_nal[i].payload = temp + (m_nal[i].payload - m_buffer);
+
+            X265_FREE(m_buffer);
+            m_buffer = temp;
+            m_allocSize = nextSize;
+        }
+        else
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to realloc access unit buffer\n");
+            return;
+        }
+    }
+
+    uint8_t *out = m_buffer + m_occupancy;
+    uint32_t bytes = 0;
+
+    if (!m_annexB)
+    {
+        /* Will write size later */
+        bytes += 4;
+    }
+    else if (!m_numNal || nalUnitType == NAL_UNIT_VPS || nalUnitType == NAL_UNIT_SPS || nalUnitType == NAL_UNIT_PPS)
+    {
+        /* first NAL of the list and parameter sets get the 4 byte start code */
+        memcpy(out, startCodePrefix, 4);
+        bytes += 4;
+    }
+    else
+    {
+        memcpy(out, startCodePrefix + 1, 3);
+        bytes += 3;
+    }
+
+    /* 16 bit NAL header:
+     * forbidden_zero_bit       1-bit
+     * nal_unit_type            6-bits
+     * nuh_reserved_zero_6bits  6-bits
+     * nuh_temporal_id_plus1    3-bits */
+    out[bytes++] = (uint8_t)nalUnitType << 1;
+    out[bytes++] = 1 + (nalUnitType == NAL_UNIT_CODED_SLICE_TSA_N);
+
+    /* 7.4.1 ...
+     * Within the NAL unit, the following three-byte sequences shall not occur at
+     * any byte-aligned position:
+     *  - 0x000000
+     *  - 0x000001
+     *  - 0x000002 */
+    for (uint32_t i = 0; i < payloadSize; i++)
+    {
+        /* the test looks at the three bytes already written, so the escape is
+         * inserted in front of the most recent one */
+        if (i > 2 && !out[bytes - 2] && !out[bytes - 3] && out[bytes - 1] <= 0x03)
+        {
+            /* inject 0x03 to prevent emulating a start code */
+            out[bytes] = out[bytes - 1];
+            out[bytes - 1] = 0x03;
+            bytes++;
+        }
+
+        out[bytes++] = bpayload[i];
+    }
+
+    X265_CHECK(bytes <= 4 + 2 + payloadSize + (payloadSize >> 1), "NAL buffer overflow\n");
+
+    if (m_extraOccupancy)
+    {
+        /* these bytes were escaped by serializeSubstreams */
+        memcpy(out + bytes, m_extraBuffer, m_extraOccupancy);
+        bytes += m_extraOccupancy;
+        m_extraOccupancy = 0;
+    }
+
+    /* 7.4.1.1
+     * ... when the last byte of the RBSP data is equal to 0x00 (which can
+     * only occur when the RBSP ends in a cabac_zero_word), a final byte equal
+     * to 0x03 is appended to the end of the data.  */
+    if (!out[bytes - 1])
+        out[bytes++] = 0x03;
+
+    if (!m_annexB)
+    {
+        /* big-endian size of everything after the 4 byte length field */
+        uint32_t dataSize = bytes - 4;
+        out[0] = (uint8_t)(dataSize >> 24);
+        out[1] = (uint8_t)(dataSize >> 16);
+        out[2] = (uint8_t)(dataSize >> 8);
+        out[3] = (uint8_t)dataSize;
+    }
+
+    m_occupancy += bytes;
+
+    x265_nal& nal = m_nal[m_numNal++];
+    nal.type = nalUnitType;
+    nal.sizeBytes = bytes;
+    nal.payload = out;
+}
+
+/* Concatenate the WPP sub-streams into m_extraBuffer, inserting emulation
+ * prevention bytes on the way. The escaped length of every stream except the
+ * last is stored in streamSizeBytes[], and the largest of those lengths is
+ * returned (0 when the buffer could not be allocated). The buffered bytes are
+ * appended to the next NAL passed to serialize(). */
+uint32_t NALList::serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams)
+{
+    /* total input size plus 50% headroom for injected escape bytes */
+    uint32_t needed = 0;
+    for (uint32_t idx = 0; idx < streamCount; idx++)
+        needed += streams[idx].getNumberOfWrittenBytes();
+    needed += needed >> 1;
+
+    if (needed > m_extraAllocSize)
+    {
+        uint8_t *grown = X265_MALLOC(uint8_t, needed);
+        if (!grown)
+        {
+            x265_log(NULL, X265_LOG_ERROR, "Unable to realloc WPP substream concatenation buffer\n");
+            return 0;
+        }
+
+        /* old contents are not preserved, the buffer is rewritten from the start */
+        X265_FREE(m_extraBuffer);
+        m_extraBuffer = grown;
+        m_extraAllocSize = needed;
+    }
+
+    uint8_t *dst = m_extraBuffer;
+    uint32_t written = 0;
+    uint32_t largest = 0;
+
+    for (uint32_t idx = 0; idx < streamCount; idx++)
+    {
+        const uint32_t startPos = written;
+        const uint32_t srcSize = streams[idx].getNumberOfWrittenBytes();
+        const uint8_t *src = streams[idx].getFIFO();
+
+        /* a stream without a buffer contributes no bytes */
+        for (uint32_t k = 0; src && k < srcSize; k++)
+        {
+            const uint8_t cur = src[k];
+            const bool afterTwoZeros = written >= 2 && !dst[written - 2] && !dst[written - 1];
+
+            /* inject 0x03 to prevent emulating a start code */
+            if (afterTwoZeros && cur <= 0x03)
+                dst[written++] = 3;
+
+            dst[written++] = cur;
+        }
+
+        /* the length of the final stream is not reported */
+        if (idx + 1 < streamCount)
+        {
+            const uint32_t escapedLen = written - startPos;
+            streamSizeBytes[idx] = escapedLen;
+            if (escapedLen > largest)
+                largest = escapedLen;
+        }
+    }
+
+    m_extraOccupancy = written;
+    return largest;
+}
--- a/x265/source/encoder/nal.h
+++ b/x265/source/encoder/nal.h
@ -0,0 +1,65 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_NAL_H
+#define X265_NAL_H
+
+#include "common.h"
+#include "x265.h"
+
+namespace X265_NS {
+// private namespace
+
+class Bitstream;
+
+/* Collection of serialized NAL units plus the byte buffers backing them.
+ * Both buffers are released with X265_FREE() in the destructor */
+class NALList
+{
+    /* capacity of the m_nal[] array */
+    static const int MAX_NAL_UNITS = 16;
+
+public:
+
+    x265_nal    m_nal[MAX_NAL_UNITS];
+    uint32_t    m_numNal;           /* NOTE(review): presumably the number of valid m_nal[] entries - confirm in nal.cpp */
+
+    /* NOTE(review): presumably the main payload buffer with its used and
+     * allocated byte counts - confirm against NALList::serialize() */
+    uint8_t*    m_buffer;
+    uint32_t    m_occupancy;
+    uint32_t    m_allocSize;
+
+    /* staging buffer filled by serializeSubstreams(): concatenated, escaped
+     * WPP sub-streams, the number of bytes in use and the allocated size */
+    uint8_t*    m_extraBuffer;
+    uint32_t    m_extraOccupancy;
+    uint32_t    m_extraAllocSize;
+    bool        m_annexB;           /* NOTE(review): looks like it selects Annex B framing - confirm in serialize() */
+
+    NALList();
+    ~NALList() { X265_FREE(m_buffer); X265_FREE(m_extraBuffer); }
+
+    void takeContents(NALList& other);
+
+    void serialize(NalUnitType nalUnitType, const Bitstream& bs);
+
+    /* concatenates and escapes the given sub-streams into m_extraBuffer,
+     * stores per-stream escaped sizes in streamSizeBytes[] */
+    uint32_t serializeSubstreams(uint32_t* streamSizeBytes, uint32_t streamCount, const Bitstream* streams);
+};
+
+}
+
+#endif // ifndef X265_NAL_H
--- a/x265/source/encoder/ratecontrol.cpp
+++ b/x265/source/encoder/ratecontrol.cpp
--- a/x265/source/encoder/ratecontrol.h
+++ b/x265/source/encoder/ratecontrol.h
@ -0,0 +1,267 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Sumalatha Polureddy <sumalatha@multicorewareinc.com>
+ *          Aarthi Priya Thirumalai <aarthi@multicorewareinc.com>
+ *          Xun Xu, PPLive Corporation <xunxu@pptv.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_RATECONTROL_H
+#define X265_RATECONTROL_H
+
+#include "common.h"
+#include "sei.h"
+
+namespace X265_NS {
+// encoder namespace
+
+class Encoder;
+class Frame;
+class SEIBufferingPeriod;
+struct SPS;
+/* NOTE(review): presumably a reference frame duration in seconds (0.04 s = 25 fps) - confirm in ratecontrol.cpp */
+#define BASE_FRAME_DURATION 0.04
+
+/* Arbitrary limitations as a sanity check. */
+#define MAX_FRAME_DURATION 1.00
+#define MIN_FRAME_DURATION 0.01
+
+/* NOTE(review): presumably lower bounds for the amortization frame count and fraction - confirm usage */
+#define MIN_AMORTIZE_FRAME 10
+#define MIN_AMORTIZE_FRACTION 0.2
+/* pass f through x265_clip3() with MIN_FRAME_DURATION and MAX_FRAME_DURATION as the bounds */
+#define CLIP_DURATION(f) x265_clip3(MIN_FRAME_DURATION, MAX_FRAME_DURATION, f)
+
+/* State of one size predictor, as passed to RateControl::updatePredictor()
+ * and RateControl::predictSize(). The meaning of each field is defined by
+ * those routines, which are not part of this header */
+struct Predictor
+{
+    double coeff;
+    double count;
+    double decay;
+    double offset;
+};
+
+/* HRD timing values, referenced through RateControlEntry::hrdTiming.
+ * NOTE(review): names suggest CPB initial/final arrival times, DPB output
+ * time and CPB removal time - confirm units against the HRD code */
+struct HRDTiming
+{
+    double cpbInitialAT;
+    double cpbFinalAT;
+    double dpbOutputTime;
+    double cpbRemovalTime;
+};
+
+/* Per-frame rate control record. A pointer to one of these is handed to the
+ * RateControl entry points (rateControlStart, rateControlUpdateStats,
+ * rateControlEnd); RateControl::m_rce2Pass points at entries of this type */
+struct RateControlEntry
+{
+    Predictor  rowPreds[3][2];
+    Predictor* rowPred[2];
+
+    int64_t lastSatd;      /* Contains the picture cost of the previous frame, required for resetAbr and VBV */
+    int64_t leadingNoBSatd;
+    int64_t rowTotalBits;  /* update cplxrsum and totalbits at the end of 2 rows */
+    double  blurredComplexity;
+    double  qpaRc;
+    double  qpAq;
+    double  qRceq;
+    double  frameSizePlanned;  /* frame size decided by RateControl before encoding the frame */
+    double  bufferRate;
+    double  movingAvgSum;
+    double  rowCplxrSum;
+    double  qpNoVbv;
+    double  bufferFill;
+    double  frameDuration;
+    double  clippedDuration;
+    double  frameSizeEstimated; /* hold frameSize, updated from cu level vbv rc */
+    double  frameSizeMaximum;   /* max frame Size according to minCR restrictions and level of the video */
+    int     sliceType;
+    int     bframes;
+    int     poc;
+    int     encodeOrder;
+    bool    bLastMiniGopBFrame;
+    bool    isActive;
+    double  amortizeFrames;
+    double  amortizeFraction;
+    /* Required in 2-pass rate control */
+    uint64_t expectedBits; /* total expected bits up to the current frame (current one excluded) */
+    double   iCuCount;
+    double   pCuCount;
+    double   skipCuCount;
+    double   expectedVbv;
+    double   qScale;
+    double   newQScale;
+    double   newQp;
+    int      mvBits;
+    int      miscBits;
+    int      coeffBits;
+    bool     keptAsRef;
+
+    SEIPictureTiming *picTimingSEI;
+    HRDTiming        *hrdTiming;
+};
+
+/* Rate control state and interface. The public section carries the ABR/VBV
+ * bookkeeping, HRD and 2-pass members together with the per-frame entry
+ * points; the protected section holds amortization state and the helper
+ * routines used by the implementation */
+class RateControl
+{
+public:
+
+    x265_param* m_param;
+    Slice*      m_curSlice;      /* all info about the current frame */
+    SliceType   m_sliceType;     /* Current frame type */
+    int         m_ncu;           /* number of CUs in a frame */
+    int         m_qp;            /* updated qp for current frame */
+
+    bool   m_isAbr;
+    bool   m_isVbv;
+    bool   m_isCbr;
+    bool   m_singleFrameVbv;
+
+    bool   m_isAbrReset;
+    int    m_lastAbrResetPoc;
+
+    double m_rateTolerance;
+    double m_frameDuration;     /* current frame duration in seconds */
+    double m_bitrate;
+    double m_rateFactorConstant;
+    double m_bufferSize;
+    double m_bufferFillFinal;  /* real buffer as of the last finished frame */
+    double m_bufferFill;       /* planned buffer, if all in-progress frames hit their bit budget */
+    double m_bufferRate;       /* # of bits added to buffer_fill after each frame */
+    double m_vbvMaxRate;       /* in kbps */
+    double m_rateFactorMaxIncrement; /* Don't allow RF above (CRF + this value). */
+    double m_rateFactorMaxDecrement; /* don't allow RF below (this value). */
+
+    Predictor m_pred[4];       /* Slice predictors to predict bits for each Slice type - I,P,Bref and B */
+    int64_t m_leadingNoBSatd;
+    int     m_predType;       /* Type of slice predictors to be used - depends on the slice type */
+    double  m_ipOffset;
+    double  m_pbOffset;
+    int64_t m_bframeBits;
+    int64_t m_currentSatd;
+    int     m_qpConstant[3];
+    int     m_lastNonBPictType;
+    int     m_framesDone;        /* # of frames passed through RateControl already */
+
+    double  m_cplxrSum;          /* sum of bits*qscale/rceq */
+    double  m_wantedBitsWindow;  /* target bitrate * window */
+    double  m_accumPQp;          /* for determining I-frame quant */
+    double  m_accumPNorm;
+    double  m_lastQScaleFor[3];  /* last qscale for a specific pict type, used for max_diff & ipb factor stuff */
+    double  m_lstep;
+    double  m_shortTermCplxSum;
+    double  m_shortTermCplxCount;
+    double  m_lastRceq;
+    double  m_qCompress;
+    int64_t m_totalBits;        /* total bits used for already encoded frames (after amortization) */
+    int64_t m_encodedBits;      /* bits used for encoded frames (without amortization) */
+    double  m_fps;
+    int64_t m_satdCostWindow[50];
+    int64_t m_encodedBitsWindow[50];
+    int     m_sliderPos;
+
+    /* To detect a pattern of low detailed static frames in single pass ABR using satdcosts */
+    int64_t m_lastBsliceSatdCost;
+    int     m_numBframesInPattern;
+    bool    m_isPatternPresent;
+    bool    m_isSceneTransition;
+
+    /* a common variable on which rateControlStart, rateControlEnd and rateControlUpdateStats wait to
+     * sync the calls to these functions. For example
+     * -F2:
+     * rceStart  10
+     * rceUpdate 10
+     * rceEnd    9
+     * rceStart  11
+     * rceUpdate 11
+     * rceEnd    10
+     * rceStart  12
+     * rceUpdate 12
+     * rceEnd    11 */
+    ThreadSafeInteger m_startEndOrder;
+    int     m_finalFrameCount;   /* set when encoder begins flushing */
+    bool    m_bTerminated;       /* set true when encoder is closing */
+
+    /* hrd stuff */
+    SEIBufferingPeriod m_bufPeriodSEI;
+    double  m_nominalRemovalTime;
+    double  m_prevCpbFinalAT;
+
+    /* 2 pass */
+    bool    m_2pass;
+    int     m_numEntries;
+    FILE*   m_statFileOut;
+    FILE*   m_cutreeStatFileOut;
+    FILE*   m_cutreeStatFileIn;
+    double  m_lastAccumPNorm;
+    double  m_expectedBitsSum;   /* sum of qscale2bits after rceq, ratefactor, and overflow, only includes finished frames */
+    int64_t m_predictedBits;
+    RateControlEntry* m_rce2Pass;
+
+    struct
+    {
+        uint16_t *qpBuffer[2]; /* Global buffers for converting MB-tree quantizer data. */
+        int qpBufPos;          /* In order to handle pyramid reordering, QP buffer acts as a stack.
+                                * This value is the current position (0 or 1). */
+    } m_cuTreeStats;
+
+    RateControl(x265_param& p);
+    bool init(const SPS& sps);
+    void initHRD(SPS& sps);
+
+    void setFinalFrameCount(int count);
+    void terminate();          /* un-block all waiting functions so encoder may close */
+    void destroy();
+
+    // to be called for each curFrame to process RateControl and set QP
+    int  rateControlStart(Frame* curFrame, RateControlEntry* rce, Encoder* enc);
+    void rateControlUpdateStats(RateControlEntry* rce);
+    int  rateControlEnd(Frame* curFrame, int64_t bits, RateControlEntry* rce);
+    int  rowDiagonalVbvRateControl(Frame* curFrame, uint32_t row, RateControlEntry* rce, double& qpVbv);
+    int  rateControlSliceType(int frameNum);
+    bool cuTreeReadFor2Pass(Frame* curFrame);
+    void hrdFullness(SEIBufferingPeriod* sei);
+    int writeRateControlFrameStats(Frame* curFrame, RateControlEntry* rce);
+protected:
+
+    static const int   s_slidingWindowFrames;
+    static const char* s_defaultStatFileName;
+
+    /* amortization state; NOTE(review): exact semantics live in ratecontrol.cpp */
+    double m_amortizeFraction;
+    int    m_amortizeFrames;
+    int    m_residualFrames;
+    int    m_partialResidualFrames;
+    int    m_residualCost;
+    int    m_partialResidualCost;
+
+    x265_zone* getZone();
+    double getQScale(RateControlEntry *rce, double rateFactor);
+    double rateEstimateQscale(Frame* pic, RateControlEntry *rce); // main logic for calculating QP based on ABR
+    double tuneAbrQScaleFromFeedback(double qScale);
+    void   accumPQpUpdate();
+
+    int    getPredictorType(int lowresSliceType, int sliceType);
+    void   updateVbv(int64_t bits, RateControlEntry* rce);
+    void   updatePredictor(Predictor *p, double q, double var, double bits);
+    double clipQscale(Frame* pic, RateControlEntry* rce, double q);
+    void   updateVbvPlan(Encoder* enc);
+    double predictSize(Predictor *p, double q, double var);
+    void   checkAndResetABR(RateControlEntry* rce, bool isFrameDone);
+    double predictRowsSizeSum(Frame* pic, RateControlEntry* rce, double qpm, int32_t& encodedBits);
+    bool   initPass2();
+    double getDiffLimitedQScale(RateControlEntry *rce, double q);
+    double countExpectedBits();
+    bool   vbv2Pass(uint64_t allAvailableBits);
+    bool   findUnderflow(double *fills, int *t0, int *t1, int over);
+    bool   fixUnderflow(int t0, int t1, double adjustment, double qscaleMin, double qscaleMax);
+};
+}
+#endif // ifndef X265_RATECONTROL_H
--- a/x265/source/encoder/rdcost.h
+++ b/x265/source/encoder/rdcost.h
@ -0,0 +1,147 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_RDCOST_H
+#define X265_RDCOST_H
+
+#include "common.h"
+#include "slice.h"
+
+namespace X265_NS {
+// private namespace
+
+/* Converts distortion and bit counts into rate-distortion costs using the
+ * lambda and psy-rd factors configured through setQP(), setLambda() and
+ * setPsyRdScale() */
+class RDCost
+{
+public:
+
+    /* all weights and factors stored as FIX8 */
+    uint64_t  m_lambda2;
+    uint64_t  m_lambda;
+    uint32_t  m_chromaDistWeight[2];
+    uint32_t  m_psyRdBase;
+    uint32_t  m_psyRd;
+    int       m_qp; /* QP used to configure lambda, may be higher than QP_MAX_SPEC but <= QP_MAX_MAX */
+
+    /* store the base psy-rd factor as floor(65536 * scale * 0.33) */
+    void setPsyRdScale(double scale)                { m_psyRdBase = (uint32_t)floor(65536.0 * scale * 0.33); }
+
+    /* Configure lambda, the effective psy-rd factor and both chroma distortion
+     * weights for the given slice and QP */
+    void setQP(const Slice& slice, int qp)
+    {
+        x265_emms(); /* TODO: if the lambda tables were ints, this would not be necessary */
+        m_qp = qp;
+        setLambda(x265_lambda2_tab[qp], x265_lambda_tab[qp]);
+
+        /* Scale PSY RD factor by a slice type factor */
+        static const uint32_t psyScaleFix8[3] = { 300, 256, 96 }; /* B, P, I */
+        m_psyRd = (m_psyRdBase * psyScaleFix8[slice.m_sliceType]) >> 8;
+
+        /* Scale PSY RD factor by QP, at high QP psy-rd can cause artifacts */
+        if (qp >= 40)
+        {
+            int scale = qp >= QP_MAX_SPEC ? 0 : (QP_MAX_SPEC - qp) * 23;
+            m_psyRd = (m_psyRd * scale) >> 8;
+        }
+
+        /* chroma QPs: table lookup for 4:2:0, plain clipping for other formats */
+        int qpCb, qpCr;
+        if (slice.m_sps->chromaFormatIdc == X265_CSP_I420)
+        {
+            qpCb = (int)g_chromaScale[x265_clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaQpOffset[0])];
+            qpCr = (int)g_chromaScale[x265_clip3(QP_MIN, QP_MAX_MAX, qp + slice.m_pps->chromaQpOffset[1])];
+        }
+        else
+        {
+            qpCb = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[0]);
+            qpCr = x265_clip3(QP_MIN, QP_MAX_SPEC, qp + slice.m_pps->chromaQpOffset[1]);
+        }
+
+        /* the chroma weight is taken from the offset table only while psy-rd
+         * is active, otherwise it is 256 */
+        int chroma_offset_idx = X265_MIN(qp - qpCb + 12, MAX_CHROMA_LAMBDA_OFFSET);
+        uint16_t lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
+        m_chromaDistWeight[0] = lambdaOffset;
+
+        chroma_offset_idx = X265_MIN(qp - qpCr + 12, MAX_CHROMA_LAMBDA_OFFSET);
+        lambdaOffset = m_psyRd ? x265_chroma_lambda2_offset_tab[chroma_offset_idx] : 256;
+        m_chromaDistWeight[1] = lambdaOffset;
+    }
+
+    /* store both lambdas as floor(256 * value) */
+    void setLambda(double lambda2, double lambda)
+    {
+        m_lambda2 = (uint64_t)floor(256.0 * lambda2);
+        m_lambda = (uint64_t)floor(256.0 * lambda);
+    }
+
+    /* return distortion plus the rounded, lambda2-weighted bit cost */
+    inline uint64_t calcRdCost(sse_ret_t distortion, uint32_t bits) const
+    {
+        /* The bit-depth dependent format string is chosen around, not inside,
+         * the X265_CHECK() invocation: preprocessor directives within macro
+         * arguments have undefined behaviour */
+#if X265_DEPTH <= 10
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
+                   "calcRdCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n",
+                   distortion, bits, m_lambda2);
+#else
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda2,
+                   "calcRdCost wrap detected dist: " X265_LL ", bits %u, lambda: " X265_LL "\n",
+                   distortion, bits, m_lambda2);
+#endif
+        return distortion + ((bits * m_lambda2 + 128) >> 8);
+    }
+
+    /* return the difference in energy between the source block and the recon block */
+    inline int psyCost(int size, const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride) const
+    {
+        return primitives.cu[size].psy_cost_pp(source, sstride, recon, rstride);
+    }
+
+    /* return the difference in energy between the source block and the recon block */
+    inline int psyCost(int size, const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride) const
+    {
+        return primitives.cu[size].psy_cost_ss(source, sstride, recon, rstride);
+    }
+
+    /* return the RD cost of this prediction, including the effect of psy-rd */
+    inline uint64_t calcPsyRdCost(sse_ret_t distortion, uint32_t bits, uint32_t psycost) const
+    {
+        return distortion + ((m_lambda * m_psyRd * psycost) >> 24) + ((bits * m_lambda2) >> 8);
+    }
+
+    /* return the SAD cost plus the rounded, lambda-weighted bit cost */
+    inline uint64_t calcRdSADCost(uint32_t sadCost, uint32_t bits) const
+    {
+        X265_CHECK(bits <= (UINT64_MAX - 128) / m_lambda,
+                   "calcRdSADCost wrap detected dist: %u, bits %u, lambda: " X265_LL "\n", sadCost, bits, m_lambda);
+        return sadCost + ((bits * m_lambda + 128) >> 8);
+    }
+
+    /* scale a chroma distortion by the weight of the given plane; the weight
+     * array is indexed with plane - 1 */
+    inline sse_ret_t scaleChromaDist(uint32_t plane, sse_ret_t dist) const
+    {
+        /* conditionals kept outside the X265_CHECK() argument list, see calcRdCost() */
+#if X265_DEPTH <= 10
+        X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
+                   "scaleChromaDist wrap detected dist: %u, lambda: %u\n",
+                   dist, m_chromaDistWeight[plane - 1]);
+#else
+        X265_CHECK(dist <= (UINT64_MAX - 128) / m_chromaDistWeight[plane - 1],
+                   "scaleChromaDist wrap detected dist: " X265_LL " lambda: %u\n",
+                   dist, m_chromaDistWeight[plane - 1]);
+#endif
+        return (sse_ret_t)((dist * (uint64_t)m_chromaDistWeight[plane - 1] + 128) >> 8);
+    }
+
+    /* return the rounded, lambda-weighted cost of the given number of bits */
+    inline uint32_t getCost(uint32_t bits) const
+    {
+        return (uint32_t)((bits * m_lambda + 128) >> 8);
+    }
+};
+}
+
+#endif // ifndef X265_RDCOST_H
--- a/x265/source/encoder/reference.cpp
+++ b/x265/source/encoder/reference.cpp
@ -0,0 +1,174 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Deepthi Devaki <deepthidevaki@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "primitives.h"
+#include "slice.h"
+#include "picyuv.h"
+
+#include "reference.h"
+
+using namespace X265_NS;
+
+/* Start with no weighted pixel buffers allocated */
+MotionReference::MotionReference()
+{
+    for (int plane = 0; plane < 3; plane++)
+        weightBuffer[plane] = NULL;
+}
+
+/* Release every weighted pixel buffer */
+MotionReference::~MotionReference()
+{
+    for (int plane = 0; plane < 3; plane++)
+        X265_FREE(weightBuffer[plane]);
+}
+
+/* Bind this reference to a reconstructed picture and, when weights are given,
+ * set up the per-plane weighted pixel buffers.
+ *   recPic - reconstructed picture supplying strides, margins and planes
+ *   wp     - per-plane weight parameters, or NULL for an unweighted reference
+ *   p      - encoder parameters; subpelRefine decides how many planes are used
+ * Returns 0 on success, -1 if a weight buffer could not be allocated */
+int MotionReference::init(PicYuv* recPic, WeightParam *wp, const x265_param& p)
+{
+    reconPic = recPic;
+    numWeightedRows = 0;
+    lumaStride = recPic->m_stride;
+    chromaStride = recPic->m_strideC;
+
+    /* is chroma satd possible? */
+    if (p.subpelRefine > 2)
+        numInterpPlanes = 3;
+    else
+        numInterpPlanes = 1;
+
+    /* directly reference the extended integer pel planes */
+    for (int plane = 0; plane < 3; plane++)
+        fpelPlane[plane] = recPic->m_picOrg[plane];
+    isWeighted = false;
+
+    if (!wp)
+        return 0;
+
+    const uint32_t numCUinHeight = (reconPic->m_picHeight + g_maxCUSize - 1) / g_maxCUSize;
+    const int lumaCuHeight = g_maxCUSize;
+
+    for (int c = 0; c < numInterpPlanes; c++)
+    {
+        if (!wp[c].bPresentFlag)
+            continue;
+
+        /* every plane after the first uses the chroma geometry */
+        const bool bChroma = c > 0;
+        const int marginX = bChroma ? reconPic->m_chromaMarginX : reconPic->m_lumaMarginX;
+        const int marginY = bChroma ? reconPic->m_chromaMarginY : reconPic->m_lumaMarginY;
+        const intptr_t stride = bChroma ? reconPic->m_strideC : reconPic->m_stride;
+        const int cuHeight = bChroma ? lumaCuHeight >> reconPic->m_vChromaShift : lumaCuHeight;
+
+        /* a buffer left over from an earlier init() is reused as is */
+        if (!weightBuffer[c])
+        {
+            size_t padheight = (numCUinHeight * cuHeight) + marginY * 2;
+            weightBuffer[c] = X265_MALLOC(pixel, stride * padheight);
+            if (!weightBuffer[c])
+                return -1;
+        }
+
+        /* use our buffer which will have weighted pixels written to it */
+        fpelPlane[c] = weightBuffer[c] + marginY * stride + marginX;
+        X265_CHECK(recPic->m_picOrg[c] - recPic->m_picBuf[c] == marginY * stride + marginX, "PicYuv pad calculation mismatch\n");
+
+        w[c].weight = wp[c].inputWeight;
+        w[c].offset = wp[c].inputOffset * (1 << (X265_DEPTH - 8));
+        w[c].shift = wp[c].log2WeightDenom;
+        w[c].round = w[c].shift ? 1 << (w[c].shift - 1) : 0;
+    }
+
+    isWeighted = true;
+    return 0;
+}
+
+/* Produce weighted reference pixels for the CU rows that became available
+ * since the previous call and extend the borders of the weighted planes.
+ *   finishedRows - rows available so far (clamped to maxNumRows)
+ *   maxNumRows   - total number of rows in the picture
+ * Planes whose fpelPlane[] still points at the reconstructed picture are skipped */
+void MotionReference::applyWeight(int finishedRows, int maxNumRows)
+{
+    finishedRows = X265_MIN(finishedRows, maxNumRows);
+    if (numWeightedRows >= finishedRows)
+        return;
+
+    const bool bLastRows = finishedRows == maxNumRows;
+    const int lumaWidth = reconPic->m_picWidth;
+    const int lumaCuHeight = g_maxCUSize;
+    int lumaHeight = (finishedRows - numWeightedRows) * g_maxCUSize;
+    if (bLastRows)
+    {
+        if (reconPic->m_picHeight % g_maxCUSize)
+        {
+            /* the last row may be partial height */
+            lumaHeight -= g_maxCUSize;
+            lumaHeight += reconPic->m_picHeight % g_maxCUSize;
+        }
+    }
+
+    for (int c = 0; c < numInterpPlanes; c++)
+    {
+        /* every plane after the first uses the chroma geometry */
+        const bool bChroma = c > 0;
+        const int hShift = bChroma ? reconPic->m_hChromaShift : 0;
+        const int vShift = bChroma ? reconPic->m_vChromaShift : 0;
+        const int marginX = bChroma ? reconPic->m_chromaMarginX : reconPic->m_lumaMarginX;
+        const int marginY = bChroma ? reconPic->m_chromaMarginY : reconPic->m_lumaMarginY;
+        const intptr_t stride = bChroma ? reconPic->m_strideC : reconPic->m_stride;
+        const int width = lumaWidth >> hShift;
+        const int height = lumaHeight >> vShift;
+        const int cuHeight = lumaCuHeight >> vShift;
+
+        /* Do not generate weighted predictions if using original picture */
+        if (fpelPlane[c] == reconPic->m_picOrg[c])
+            continue;
+
+        const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
+        pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
+
+        // Computing weighted CU rows
+        const int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
+        const int padwidth = (width + 15) & ~15;              // weightp assembly needs even 16 byte widths
+        primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
+
+        // Extending Left & Right
+        primitives.extendRowBorder(dst, stride, width, height, marginX);
+
+        // Extending Above: replicate the first picture row into the top margin
+        if (numWeightedRows == 0)
+        {
+            pixel *firstRow = fpelPlane[c] - marginX;
+            pixel *target = firstRow;
+            for (int y = 0; y < marginY; y++)
+            {
+                target -= stride;
+                memcpy(target, firstRow, stride * sizeof(pixel));
+            }
+        }
+
+        // Extending Bottom: replicate the last picture row into the bottom margin
+        if (bLastRows)
+        {
+            const int picHeight = reconPic->m_picHeight >> vShift;
+            pixel *lastRow = fpelPlane[c] - marginX + (picHeight - 1) * stride;
+            pixel *target = lastRow;
+            for (int y = 0; y < marginY; y++)
+            {
+                target += stride;
+                memcpy(target, lastRow, stride * sizeof(pixel));
+            }
+        }
+    }
+
+    numWeightedRows = finishedRows;
+}
--- a/x265/source/encoder/reference.h
+++ b/x265/source/encoder/reference.h
@ -0,0 +1,56 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_REFERENCE_H
+#define X265_REFERENCE_H
+
+#include "primitives.h"
+#include "picyuv.h"
+#include "lowres.h"
+#include "mv.h"
+
+namespace X265_NS {
+// private x265 namespace
+
+struct WeightParam;
+
+/* Reference picture planes used for motion search, optionally replaced by
+ * weighted copies that are generated row by row through applyWeight() */
+class MotionReference : public ReferencePlanes
+{
+public:
+
+    MotionReference();
+    ~MotionReference();
+    /* binds the reference to a picture; returns 0 on success, -1 when a weight buffer cannot be allocated */
+    int  init(PicYuv*, WeightParam* wp, const x265_param& p);
+    /* weights the rows finished since the last call; rows is clamped to numRows */
+    void applyWeight(int rows, int numRows);
+
+    pixel*  weightBuffer[3];    /* per-plane weighted pixel buffers, allocated in init(), freed in the destructor */
+    int     numInterpPlanes;    /* 3 when subpelRefine > 2, otherwise 1 (set in init()) */
+    int     numWeightedRows;    /* number of rows already processed by applyWeight() */
+
+protected:
+
+    /* NOTE(review): declared without a visible definition, presumably to forbid assignment - confirm */
+    MotionReference& operator =(const MotionReference&);
+};
+}
+
+#endif // ifndef X265_REFERENCE_H
--- a/x265/source/encoder/sao.cpp
+++ b/x265/source/encoder/sao.cpp
--- a/x265/source/encoder/sao.h
+++ b/x265/source/encoder/sao.h
@ -0,0 +1,154 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *          Min Chen <chenm003@163.com>
+ *          Praveen Kumar Tiwari <praveen@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SAO_H
+#define X265_SAO_H
+
+#include "common.h"
+#include "frame.h"
+#include "entropy.h"
+
+namespace X265_NS {
+// private namespace
+
+/* Size constants for SAO types.
+ * NOTE(review): EO/BO presumably stand for edge offset / band offset, and
+ * the *_LEN values for the number of offsets per type - confirm in sao.cpp */
+enum SAOTypeLen
+{
+    SAO_EO_LEN = 4,
+    SAO_BO_LEN = 4,
+    SAO_NUM_BO_CLASSES = 32
+};
+
+/* SAO type indices: four EO variants followed by BO. MAX_NUM_SAO_TYPE is the
+ * number of types and sizes the first dimension of SAO::PerClass */
+enum SAOType
+{
+    SAO_EO_0 = 0,
+    SAO_EO_1,
+    SAO_EO_2,
+    SAO_EO_3,
+    SAO_BO,
+    MAX_NUM_SAO_TYPE
+};
+
+/* Sample adaptive offset state and interface: statistics tables, temporary
+ * pixel buffers, entropy coder contexts and the per-CTU / per-row entry
+ * points declared below */
+class SAO
+{
+public:
+
+    enum { SAO_MAX_DEPTH = 4 };
+    enum { SAO_BO_BITS  = 5 };
+    enum { MAX_NUM_SAO_CLASS = 33 };
+    enum { SAO_BIT_INC = 0 }; /* in HM12.0 this was written as X265_MAX(X265_DEPTH - 10, 0) */
+    enum { OFFSET_THRESH = 1 << X265_MIN(X265_DEPTH - 5, 5) };
+    enum { NUM_EDGETYPE = 5 };
+    enum { NUM_PLANE = 3 };
+    enum { NUM_MERGE_MODE = 3 };
+
+    static const uint32_t s_eoTable[NUM_EDGETYPE];
+
+    /* statistics tables indexed [type][class] and [plane][type][class] */
+    typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
+    typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
+
+protected:
+
+    /* allocated per part */
+    PerClass*   m_count;
+    PerClass*   m_offset;
+    PerClass*   m_offsetOrg;
+
+    /* allocated per CTU */
+    PerPlane*   m_countPreDblk;
+    PerPlane*   m_offsetOrgPreDblk;
+
+    double      m_depthSaoRate[2][4];
+    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
+    int8_t      m_offsetEo[NUM_EDGETYPE];
+
+    int         m_numCuInWidth;
+    int         m_numCuInHeight;
+    int         m_numPlanes;
+    int         m_hChromaShift;
+    int         m_vChromaShift;
+
+    pixel*      m_clipTable;
+    pixel*      m_clipTableBase;
+
+    /* temporary pixel buffers; NOTE(review): presumably saved upper/left neighbour samples - confirm in sao.cpp */
+    pixel*      m_tmpU1[3];
+    pixel*      m_tmpU2[3];
+    pixel*      m_tmpL1;
+    pixel*      m_tmpL2;
+
+public:
+
+    /* set of entropy coder states kept for SAO RD decisions */
+    struct SAOContexts
+    {
+        Entropy cur;
+        Entropy next;
+        Entropy temp;
+    };
+
+    Frame*      m_frame;
+    Entropy     m_entropyCoder;
+    SAOContexts m_rdContexts;
+
+    x265_param* m_param;
+    int         m_refDepth;
+    int         m_numNoSao[2];
+
+    double      m_lumaLambda;
+    double      m_chromaLambda;
+    /* TODO: No doubles for distortion */
+
+    SAO();
+
+    bool create(x265_param* param);
+    void destroy();
+
+    void allocSaoParam(SAOParam* saoParam) const;
+
+    void startSlice(Frame* pic, Entropy& initState, int qp);
+    void resetStats();
+    void resetSaoUnit(SaoCtuParam* saoUnit);
+
+    // CTU-based SAO process without slice granularity
+    void processSaoCu(int addr, int typeIdx, int plane);
+    void processSaoUnitRow(SaoCtuParam* ctuParam, int idxY, int plane);
+
+    void copySaoUnit(SaoCtuParam* saoUnitDst, const SaoCtuParam* saoUnitSrc);
+
+    void calcSaoStatsCu(int addr, int plane);
+    void calcSaoStatsCu_BeforeDblk(Frame* pic, int idxX, int idxY);
+
+    void saoComponentParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[2], double* mergeDist);
+    void sao2ChromaParamDist(SAOParam* saoParam, int addr, int addrUp, int addrLeft, SaoCtuParam mergeSaoParam[][2], double* mergeDist);
+
+    inline int estIterOffset(int typeIdx, int classIdx, double lambda, int offset, int32_t count, int32_t offsetOrg,
+                             int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
+    inline int64_t estSaoTypeDist(int plane, int typeIdx, double lambda, int32_t* currentDistortionTableBo, double* currentRdCostTableBo);
+
+    void rdoSaoUnitRowEnd(const SAOParam* saoParam, int numctus);
+    void rdoSaoUnitRow(SAOParam* saoParam, int idxY);
+};
+
+}
+
+#endif // ifndef X265_SAO_H
--- a/x265/source/encoder/search.cpp
+++ b/x265/source/encoder/search.cpp
--- a/x265/source/encoder/search.h
+++ b/x265/source/encoder/search.h
@ -0,0 +1,468 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_SEARCH_H
+#define X265_SEARCH_H
+
+#include "common.h"
+#include "predict.h"
+#include "quant.h"
+#include "bitcost.h"
+#include "framedata.h"
+#include "yuv.h"
+#include "threadpool.h"
+
+#include "rdcost.h"
+#include "entropy.h"
+#include "motion.h"
+
+/* CU-level profiling helpers. With DETAILED_CU_STATS enabled they update the
+ * m_stats entry selected by cu.m_encData->m_frameEncoderID, so they can only
+ * be used where an m_stats array is in scope:
+ *   ProfileCUScopeNamed - increments the counter 'count' and declares a
+ *                         ScopedElapsedTime object called 'name' bound to the
+ *                         accumulator 'acc' (presumably it adds the elapsed
+ *                         time to 'acc' when it leaves scope). The expansion
+ *                         is two statements, one of them a declaration, so it
+ *                         must not be used as an unbraced if/for body
+ *   ProfileCUScope      - same, with the fixed object name 'timedScope'
+ *   ProfileCounter      - increments the counter 'count' only; the expansion
+ *                         already ends in a semicolon
+ * Without DETAILED_CU_STATS all three expand to nothing */
+#if DETAILED_CU_STATS
+#define ProfileCUScopeNamed(name, cu, acc, count) \
+    m_stats[cu.m_encData->m_frameEncoderID].count++; \
+    ScopedElapsedTime name(m_stats[cu.m_encData->m_frameEncoderID].acc)
+#define ProfileCUScope(cu, acc, count) ProfileCUScopeNamed(timedScope, cu, acc, count)
+#define ProfileCounter(cu, count) m_stats[cu.m_encData->m_frameEncoderID].count++;
+#else
+#define ProfileCUScopeNamed(name, cu, acc, count)
+#define ProfileCUScope(cu, acc, count)
+#define ProfileCounter(cu, count)
+#endif
+
+namespace X265_NS {
+// private namespace
+
+class Entropy;
+struct ThreadLocalData;
+
+/* All the CABAC contexts that Analysis needs to keep track of at each depth
+ * and temp buffers for residual, coeff, and recon for use during residual
+ * quad-tree depth recursion. Search holds an array of these in m_rqt, one
+ * entry per depth (NUM_FULL_DEPTH entries) */
+struct RQTData
+{
+    Entropy  cur;     /* starting context for current CU */
+
+    /* these are indexed by qtLayer (log2size - 2) so nominally 0=4x4, 1=8x8, 2=16x16, 3=32x32
+     * the coeffRQT and reconQtYuv are allocated to the max CU size at every depth. The parts
+     * which are reconstructed at each depth are valid. At the end, the transform depth table
+     * is walked and the coeff and recon at the final split depths are collected */
+    Entropy  rqtRoot;      /* residual quad-tree start context */
+    Entropy  rqtTemp;      /* residual quad-tree temp context */
+    Entropy  rqtTest;      /* residual quad-tree test context */
+    /* NOTE(review): the [3] dimension below does not match the four qtLayer values
+     * listed above; it looks like one pointer per colour plane -- confirm in search.cpp */
+    coeff_t* coeffRQT[3];  /* coeff storage for entire CTU for each RQT layer */
+    Yuv      reconQtYuv;   /* recon storage for entire CTU for each RQT layer (intra) */
+    ShortYuv resiQtYuv;    /* residual storage for entire CTU for each RQT layer (inter) */
+    
+    /* per-depth temp buffers for inter prediction */
+    ShortYuv tmpResiYuv;      /* temporary residual buffer */
+    Yuv      tmpPredYuv;      /* temporary prediction buffer */
+    Yuv      bidirPredYuv[2]; /* two prediction buffers -- presumably one per reference list for bi-prediction; confirm */
+};
+
+/* Result record of a motion search. Mode stores these in
+ * bestME[MAX_INTER_PARTS][2], i.e. one per inter partition and per slot of the
+ * second dimension (presumably the reference list -- confirm) */
+struct MotionData
+{
+    MV       mv;      /* motion vector */
+    MV       mvp;     /* motion vector predictor */
+    int      mvpIdx;  /* index of mvp -- presumably into the AMVP candidate list; confirm */
+    int      ref;     /* reference index -- presumably into the reference picture list; confirm */
+    uint32_t cost;    /* cost of this result (the metric is not visible from this header) */
+    int      bits;    /* bit estimate for this result -- confirm what it includes */
+};
+
+/* One candidate coding mode for a CU: the CU data, prediction and
+ * reconstruction buffers, entropy contexts, motion search results and the
+ * running cost / distortion / bit totals kept for it */
+struct Mode
+{
+    CUData     cu;
+    const Yuv* fencYuv;
+    Yuv        predYuv;
+    Yuv        reconYuv;
+    Entropy    contexts;
+
+    enum { MAX_INTER_PARTS = 2 };
+
+    MotionData bestME[MAX_INTER_PARTS][2];
+    MV         amvpCand[2][MAX_NUM_REF][AMVP_NUM_CANDS];
+
+    /* neighbour MVs of the current partition: five spatial candidates plus
+     * the temporal candidate */
+    InterNeighbourMV interNeighbours[6];
+
+    uint64_t   rdCost;           /* sum of partition (psy) RD costs        (sse(fenc, recon) + lambda2 * bits) */
+    uint64_t   sa8dCost;         /* sum of partition sa8d distortion costs (sa8d(fenc, pred) + lambda * bits) */
+    uint32_t   sa8dBits;         /* signal bits used in the sa8dCost calculation */
+    uint32_t   psyEnergy;        /* sum of partition psycho-visual energy difference */
+    sse_ret_t  resEnergy;        /* sum of partition residual energy after motion prediction */
+    sse_ret_t  lumaDistortion;
+    sse_ret_t  chromaDistortion;
+    sse_ret_t  distortion;       /* sum of partition SSE distortion */
+    uint32_t   totalBits;        /* sum of partition bits (mv + coeff) */
+    uint32_t   mvBits;           /* mv bits + ref + block type (or intra mode) */
+    uint32_t   coeffBits;        /* texture bits (DCT coeffs) */
+
+    /* reset every cost, distortion and bit total to zero */
+    void initCosts()
+    {
+        /* RD and sa8d costs */
+        rdCost = 0;
+        sa8dCost = 0;
+
+        /* energy and distortion sums */
+        psyEnergy = 0;
+        resEnergy = 0;
+        lumaDistortion = 0;
+        chromaDistortion = 0;
+        distortion = 0;
+
+        /* bit counters */
+        sa8dBits = 0;
+        totalBits = 0;
+        mvBits = 0;
+        coeffBits = 0;
+    }
+
+    /* poison every total with half of its type's maximum so that ok() can
+     * detect a mode which was re-used without being initialized */
+    void invalidate()
+    {
+        rdCost = sa8dCost = UINT64_MAX / 2;
+        sa8dBits = psyEnergy = totalBits = mvBits = coeffBits = MAX_UINT / 2;
+
+        /* the sse_ret_t members use a different sentinel above 10 bits */
+#if X265_DEPTH <= 10
+        resEnergy = lumaDistortion = chromaDistortion = distortion = MAX_UINT / 2;
+#else
+        resEnergy = lumaDistortion = chromaDistortion = distortion = UINT64_MAX / 2;
+#endif
+    }
+
+    /* true when no total has reached the sentinel written by invalidate() */
+    bool ok() const
+    {
+        if (rdCost >= UINT64_MAX / 2 || sa8dCost >= UINT64_MAX / 2)
+            return false;
+
+        if (sa8dBits >= MAX_UINT / 2 || psyEnergy >= MAX_UINT / 2 ||
+            totalBits >= MAX_UINT / 2 || mvBits >= MAX_UINT / 2 ||
+            coeffBits >= MAX_UINT / 2)
+            return false;
+
+#if X265_DEPTH <= 10
+        return resEnergy < MAX_UINT / 2 &&
+               lumaDistortion < MAX_UINT / 2 &&
+               chromaDistortion < MAX_UINT / 2 &&
+               distortion < MAX_UINT / 2;
+#else
+        return resEnergy < UINT64_MAX / 2 &&
+               lumaDistortion < UINT64_MAX / 2 &&
+               chromaDistortion < UINT64_MAX / 2 &&
+               distortion < UINT64_MAX / 2;
+#endif
+    }
+
+    /* add every total of subMode onto this mode; subMode must pass ok() */
+    void addSubCosts(const Mode& subMode)
+    {
+        X265_CHECK(subMode.ok(), "sub-mode not initialized");
+
+        /* RD and sa8d costs */
+        rdCost += subMode.rdCost;
+        sa8dCost += subMode.sa8dCost;
+
+        /* energy and distortion sums */
+        psyEnergy += subMode.psyEnergy;
+        resEnergy += subMode.resEnergy;
+        lumaDistortion += subMode.lumaDistortion;
+        chromaDistortion += subMode.chromaDistortion;
+        distortion += subMode.distortion;
+
+        /* bit counters */
+        sa8dBits += subMode.sa8dBits;
+        totalBits += subMode.totalBits;
+        mvBits += subMode.mvBits;
+        coeffBits += subMode.coeffBits;
+    }
+};
+
+#if DETAILED_CU_STATS
+/* Counters and elapsed-time totals for performance debugging. No attempt is
+ * made to handle dynamic range overflows, so avoid long encodes if the
+ * accuracy of these numbers matters. This profiling is orthogonal to
+ * PPA/VTune and can be enabled independently from either of them */
+struct CUStats
+{
+    int64_t  intraRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in intra RDO per CU depth
+    int64_t  interRDOElapsedTime[NUM_CU_DEPTH]; // elapsed worker time in inter RDO per CU depth
+    int64_t  intraAnalysisElapsedTime;          // elapsed worker time in intra sa8d analysis
+    int64_t  motionEstimationElapsedTime;       // elapsed worker time in predInterSearch()
+    int64_t  loopFilterElapsedTime;             // elapsed worker time in deblock and SAO and PSNR/SSIM
+    int64_t  pmeTime;                           // elapsed worker time processing ME slave jobs
+    int64_t  pmeBlockTime;                      // elapsed worker time blocked for pme batch completion
+    int64_t  pmodeTime;                         // elapsed worker time processing pmode slave jobs
+    int64_t  pmodeBlockTime;                    // elapsed worker time blocked for pmode batch completion
+    int64_t  weightAnalyzeTime;                 // elapsed worker time analyzing reference weights
+    int64_t  totalCTUTime;                      // elapsed worker time in compressCTU (includes pmode master)
+
+    uint32_t skippedMotionReferences[NUM_CU_DEPTH];
+    uint32_t totalMotionReferences[NUM_CU_DEPTH];
+    uint32_t skippedIntraCU[NUM_CU_DEPTH];
+    uint32_t totalIntraCU[NUM_CU_DEPTH];
+
+    uint64_t countIntraRDO[NUM_CU_DEPTH];
+    uint64_t countInterRDO[NUM_CU_DEPTH];
+    uint64_t countIntraAnalysis;
+    uint64_t countMotionEstimate;
+    uint64_t countLoopFilter;
+    uint64_t countPMETasks;
+    uint64_t countPMEMasters;
+    uint64_t countPModeTasks;
+    uint64_t countPModeMasters;
+    uint64_t countWeightAnalyze;
+    uint64_t totalCTUs;
+
+    /* a new instance starts with every field zeroed */
+    CUStats() { clear(); }
+
+    /* zero the whole structure in one go */
+    void clear()
+    {
+        memset(this, 0, sizeof(*this));
+    }
+
+    /* add every field of 'other' onto this instance, then clear 'other' so
+     * that the same samples are not accumulated a second time */
+    void accumulate(CUStats& other)
+    {
+        /* scalar elapsed times */
+        intraAnalysisElapsedTime += other.intraAnalysisElapsedTime;
+        motionEstimationElapsedTime += other.motionEstimationElapsedTime;
+        loopFilterElapsedTime += other.loopFilterElapsedTime;
+        pmeTime += other.pmeTime;
+        pmeBlockTime += other.pmeBlockTime;
+        pmodeTime += other.pmodeTime;
+        pmodeBlockTime += other.pmodeBlockTime;
+        weightAnalyzeTime += other.weightAnalyzeTime;
+        totalCTUTime += other.totalCTUTime;
+
+        /* scalar event counters */
+        countIntraAnalysis += other.countIntraAnalysis;
+        countMotionEstimate += other.countMotionEstimate;
+        countLoopFilter += other.countLoopFilter;
+        countPMETasks += other.countPMETasks;
+        countPMEMasters += other.countPMEMasters;
+        countPModeTasks += other.countPModeTasks;
+        countPModeMasters += other.countPModeMasters;
+        countWeightAnalyze += other.countWeightAnalyze;
+        totalCTUs += other.totalCTUs;
+
+        /* per-depth tables, for depths 0 .. g_maxCUDepth inclusive */
+        for (uint32_t depth = 0; depth <= g_maxCUDepth; depth++)
+        {
+            intraRDOElapsedTime[depth] += other.intraRDOElapsedTime[depth];
+            interRDOElapsedTime[depth] += other.interRDOElapsedTime[depth];
+            skippedMotionReferences[depth] += other.skippedMotionReferences[depth];
+            totalMotionReferences[depth] += other.totalMotionReferences[depth];
+            skippedIntraCU[depth] += other.skippedIntraCU[depth];
+            totalIntraCU[depth] += other.totalIntraCU[depth];
+            countIntraRDO[depth] += other.countIntraRDO[depth];
+            countInterRDO[depth] += other.countInterRDO[depth];
+        }
+
+        other.clear();
+    }
+};
+#endif
+
+/* Number of bits charged for index idx out of numIdx choices: idx bits, plus
+ * one more unless idx is the last choice (numIdx - 1). This matches the
+ * length of a truncated unary code */
+inline int getTUBits(int idx, int numIdx)
+{
+    const int lastIdx = numIdx - 1;
+
+    if (idx < lastIdx)
+        return idx + 1;
+    return idx;
+}
+
+/* Mode search engine built on top of Predict. It owns the motion estimator,
+ * quantizer, RD cost helper, entropy coder and the scratch buffers declared
+ * below, and declares the intra/inter search and residual coding routines
+ * that operate on a Mode */
+class Search : public Predict
+{
+public:
+
+    /* shared array of MAX_CU_SIZE int16_t values (all zero, judging by the name -- confirm in search.cpp) */
+    static const int16_t zeroShort[MAX_CU_SIZE];
+
+    MotionEstimate  m_me;
+    Quant           m_quant;
+    RDCost          m_rdCost;
+    const x265_param* m_param;
+    Frame*          m_frame;
+    const Slice*    m_slice;
+
+    Entropy         m_entropyCoder;
+    RQTData         m_rqt[NUM_FULL_DEPTH]; /* one RQTData per depth */
+
+    /* three pointers each -- presumably one per colour plane; confirm in search.cpp */
+    uint8_t*        m_qtTempCbf[3];
+    uint8_t*        m_qtTempTransformSkipFlag[3];
+
+    pixel*          m_fencScaled;     /* 32x32 buffer for down-scaled version of 64x64 CU fenc */
+    pixel*          m_fencTransposed; /* 32x32 buffer for transposed copy of fenc */
+    pixel*          m_intraPred;      /* 32x32 buffer for individual intra predictions */
+    pixel*          m_intraPredAngs;  /* allocation for 33 consecutive (all angular) 32x32 intra predictions */
+
+    coeff_t*        m_tsCoeff;        /* transform skip coeff 32x32 */
+    int16_t*        m_tsResidual;     /* transform skip residual 32x32 */
+    pixel*          m_tsRecon;        /* transform skip reconstructed pixels 32x32 */
+
+    bool            m_bFrameParallel;
+    bool            m_bEnableRDOQ;
+    uint32_t        m_numLayers;
+    uint32_t        m_refLagPixels;
+
+#if DETAILED_CU_STATS
+    /* Accumulate CU statistics separately for each frame encoder */
+    CUStats         m_stats[X265_MAX_FRAME_THREADS];
+#endif
+
+    Search();
+    ~Search();
+
+    bool     initSearch(const x265_param& param, ScalingList& scalingList);
+    int      setLambdaFromQP(const CUData& ctu, int qp); /* returns real quant QP in valid spec range */
+
+    // mark temp RD entropy contexts as uninitialized; useful for finding loads without stores
+    void     invalidateContexts(int fromDepth);
+
+    // full RD search of intra modes. if sharedModes is not NULL, it directly uses them
+    void     checkIntra(Mode& intraMode, const CUGeom& cuGeom, PartSize partSize, uint8_t* sharedModes, uint8_t* sharedChromaModes);
+
+    // select best intra mode using only sa8d costs, cannot measure NxN intra
+    void     checkIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+    // encode luma mode selected by checkIntraInInter, then pick and encode a chroma mode
+    void     encodeIntraInInter(Mode& intraMode, const CUGeom& cuGeom);
+
+    // estimation inter prediction (non-skip)
+    void     predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChromaMC, uint32_t masks[2]);
+
+    // encode residual and compute rd-cost for inter mode
+    void     encodeResAndCalcRdInterCU(Mode& interMode, const CUGeom& cuGeom);
+    void     encodeResAndCalcRdSkipCU(Mode& interMode);
+
+    // encode residual without rd-cost
+    void     residualTransformQuantInter(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     residualTransformQuantIntra(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     residualQTIntraChroma(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t tuDepth);
+
+    // pick the best chroma mode from those available using just sa8d costs
+    void     getBestIntraModeChroma(Mode& intraMode, const CUGeom& cuGeom);
+
+    /* update CBF flags and QP values to be internally consistent */
+    void checkDQP(Mode& mode, const CUGeom& cuGeom);
+    void checkDQPForSplitPred(Mode& mode, const CUGeom& cuGeom);
+
+    MV getLowresMV(const CUData& cu, const PredictionUnit& pu, int list, int ref);
+
+    /* Task group for distributing the motion estimation of one prediction
+     * unit. It only holds references to the master Search, the Mode, the CU
+     * geometry and the PU, so it must not outlive any of them */
+    class PME : public BondedTaskGroup
+    {
+    public:
+
+        Search&       master;
+        Mode&         mode;
+        const CUGeom& cuGeom;
+        const PredictionUnit& pu;
+        int           puIdx;
+
+        /* reference indices to search, stored per slot of the first
+         * dimension (presumably the reference list -- confirm), with the
+         * number of valid entries in refCnt */
+        struct {
+            int ref[2][MAX_NUM_REF];
+            int refCnt[2];
+        } m_jobs;
+
+        PME(Search& s, Mode& m, const CUGeom& g, const PredictionUnit& u, int p) : master(s), mode(m), cuGeom(g), pu(u), puIdx(p) {}
+
+        void processTasks(int workerThreadId);
+
+    protected:
+
+        /* declared without a body here: the reference members make the class
+         * non-assignable. NOTE(review): this returns PME by value where the
+         * conventional signature returns PME& -- harmless while it stays
+         * undefined, but worth aligning */
+        PME operator=(const PME&);
+    };
+
+    void     processPME(PME& pme, Search& slave);
+    void     singleMotionEstimation(Search& master, Mode& interMode, const PredictionUnit& pu, int part, int list, int ref);
+
+protected:
+
+    /* motion estimation distribution */
+    ThreadLocalData* m_tld;
+
+    uint32_t      m_listSelBits[3];
+    Lock          m_meLock;
+
+    void     saveResidualQTData(CUData& cu, ShortYuv& resiYuv, uint32_t absPartIdx, uint32_t tuDepth);
+
+    // RDO search of luma intra modes; result is fully encoded luma. luma distortion is returned
+    uint32_t estIntraPredQT(Mode &intraMode, const CUGeom& cuGeom, const uint32_t depthRange[2], uint8_t* sharedModes);
+
+    // RDO select best chroma mode from luma; result is fully encode chroma. chroma distortion is returned
+    uint32_t estIntraPredChromaQT(Mode &intraMode, const CUGeom& cuGeom, uint8_t* sharedChromaModes);
+
+    void     codeSubdivCbfQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx);
+    void     codeInterSubdivCbfQT(CUData& cu, uint32_t absPartIdx, const uint32_t tuDepth, const uint32_t depthRange[2]);
+    void     codeCoeffQTChroma(const CUData& cu, uint32_t tuDepth, uint32_t absPartIdx, TextType ttype);
+
+    /* cost accumulator passed through the residual quad-tree routines; every
+     * field starts at zero */
+    struct Cost
+    {
+        uint64_t rdcost;
+        uint32_t bits;
+        sse_ret_t distortion;
+        uint32_t energy;
+        Cost() { rdcost = 0; bits = 0; distortion = 0; energy = 0; }
+    };
+
+    /* NOTE(review): dist is uint32_t here while Cost::distortion and the Mode
+     * distortion totals are sse_ret_t -- confirm the narrower type is intended */
+    uint64_t estimateNullCbfCost(uint32_t &dist, uint32_t &psyEnergy, uint32_t tuDepth, TextType compId);
+    void     estimateResidualQT(Mode& mode, const CUGeom& cuGeom, uint32_t absPartIdx, uint32_t depth, ShortYuv& resiYuv, Cost& costs, const uint32_t depthRange[2]);
+
+    // generate prediction, generate residual and recon. if bAllowSplit, find optimal RQT splits
+    void     codeIntraLumaQT(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, bool bAllowSplit, Cost& costs, const uint32_t depthRange[2]);
+    void     codeIntraLumaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, Cost& costs);
+    void     extractIntraResultQT(CUData& cu, Yuv& reconYuv, uint32_t tuDepth, uint32_t absPartIdx);
+
+    // generate chroma prediction, generate residual and recon
+    uint32_t codeIntraChromaQt(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t absPartIdx, uint32_t& psyEnergy);
+    uint32_t codeIntraChromaTSkip(Mode& mode, const CUGeom& cuGeom, uint32_t tuDepth, uint32_t tuDepthC, uint32_t absPartIdx, uint32_t& psyEnergy);
+    void     extractIntraResultChromaQT(CUData& cu, Yuv& reconYuv, uint32_t absPartIdx, uint32_t tuDepth);
+
+    // reshuffle CBF flags after coding a pair of 4:2:2 chroma blocks
+    void     offsetSubTUCBFs(CUData& cu, TextType ttype, uint32_t tuDepth, uint32_t absPartIdx);
+
+    /* output of mergeEstimation, best merge candidate */
+    struct MergeData
+    {
+        MVField  mvField[2];
+        uint32_t dir;
+        uint32_t index;
+        uint32_t bits;
+    };
+
+    /* inter/ME helper functions */
+    int       selectMVP(const CUData& cu, const PredictionUnit& pu, const MV amvp[AMVP_NUM_CANDS], int list, int ref);
+    const MV& checkBestMVP(const MV amvpCand[2], const MV& mv, int& mvpIdx, uint32_t& outBits, uint32_t& outCost) const;
+    void     setSearchRange(const CUData& cu, const MV& mvp, int merange, MV& mvmin, MV& mvmax) const;
+    uint32_t mergeEstimation(CUData& cu, const CUGeom& cuGeom, const PredictionUnit& pu, int puIdx, MergeData& m);
+    static void getBlkBits(PartSize cuMode, bool bPSlice, int puIdx, uint32_t lastMode, uint32_t blockBit[3]);
+
+    /* intra helper functions */
+    enum { MAX_RD_INTRA_MODES = 16 };
+    static void updateCandList(uint32_t mode, uint64_t cost, int maxCandCount, uint32_t* candModeList, uint64_t* candCostList);
+
+    // get most probable luma modes for CU part, and bit cost of all non mpm modes
+    uint32_t getIntraRemModeBits(CUData & cu, uint32_t absPartIdx, uint32_t mpmModes[3], uint64_t& mpms) const;
+
+    /* recompute m.rdCost from m.distortion and m.totalBits, using the psy-RD
+     * formula (which also takes m.psyEnergy) when m_rdCost.m_psyRd is set */
+    void updateModeCost(Mode& m) const { m.rdCost = m_rdCost.m_psyRd ? m_rdCost.calcPsyRdCost(m.distortion, m.totalBits, m.psyEnergy) : m_rdCost.calcRdCost(m.distortion, m.totalBits); }
+};
+}
+
+#endif // ifndef X265_SEARCH_H
--- a/x265/source/encoder/sei.cpp
+++ b/x265/source/encoder/sei.cpp
@ -0,0 +1,74 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#include "common.h"
+#include "bitstream.h"
+#include "slice.h"
+#include "sei.h"
+
+using namespace X265_NS;
+
+/* x265's identifying GUID: the 16 bytes that SEIuserDataUnregistered::write()
+ * emits as uuid_iso_iec_11578 ahead of the user data */
+const uint8_t SEIuserDataUnregistered::m_uuid_iso_iec_11578[16] = {
+    0x2C, 0xA2, 0xDE, 0x09, 0xB5, 0x17, 0x47, 0xDB,
+    0xBB, 0x55, 0xA4, 0xFE, 0x7F, 0xC2, 0xFC, 0x4E
+};
+
+/* Marshal this SEI message into bitstream bs: payload_type, payload_size and
+ * then the payload produced by the virtual writeSEI(). writeSEI() runs twice,
+ * first against a BitCounter to learn the payload size, then against bs.
+ * Both header fields are coded as a run of 0xff bytes, one for every full
+ * 0xff in the value, followed by one byte holding the remainder */
+void SEI::write(Bitstream& bs, const SPS& sps)
+{
+    /* pass 1: measure the payload with a bit counter */
+    BitCounter sizer;
+    m_bitIf = &sizer;
+    writeSEI(sps);
+
+    /* pass 2: everything from here on goes to the real bitstream */
+    m_bitIf = &bs;
+
+    uint32_t typeLeft = payloadType();
+    while (typeLeft >= 0xff)
+    {
+        WRITE_CODE(0xff, 8, "payload_type");
+        typeLeft -= 0xff;
+    }
+    WRITE_CODE(typeLeft, 8, "payload_type");
+
+    X265_CHECK(0 == (sizer.getNumberOfWrittenBits() & 7), "payload unaligned\n");
+    uint32_t bytesLeft = sizer.getNumberOfWrittenBits() >> 3;
+    while (bytesLeft >= 0xff)
+    {
+        WRITE_CODE(0xff, 8, "payload_size");
+        bytesLeft -= 0xff;
+    }
+    WRITE_CODE(bytesLeft, 8, "payload_size");
+
+    /* virtual writeSEI method, this time writing to bs */
+    writeSEI(sps);
+}
+
+/* Pad the current writer to a byte boundary. Nothing is written when it is
+ * already aligned; otherwise a single one bit is written, followed by zero
+ * bits until the written bit count is a multiple of 8 */
+void SEI::writeByteAlign()
+{
+    // TODO: expose bs.writeByteAlignment() as virtual function
+    if (m_bitIf->getNumberOfWrittenBits() % 8 == 0)
+        return;
+
+    WRITE_FLAG(1, "bit_equal_to_one");
+    while (m_bitIf->getNumberOfWrittenBits() % 8 != 0)
+    {
+        WRITE_FLAG(0, "bit_equal_to_zero");
+    }
+}
--- a/x265/source/encoder/sei.h
+++ b/x265/source/encoder/sei.h
@ -0,0 +1,344 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Steve Borho <steve@borho.org>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at license @ x265.com.
+*****************************************************************************/
+
+#ifndef X265_SEI_H
+#define X265_SEI_H
+
+#include "common.h"
+#include "bitstream.h"
+#include "slice.h"
+
+namespace X265_NS {
+// private namespace
+
+/* Base class of all SEI messages. A subclass names its message through
+ * payloadType() and either implements writeSEI() (payload only, the default
+ * write() adds the header) or overrides write() and emits header and payload
+ * itself */
+class SEI : public SyntaxElementWriter
+{
+public:
+
+    /* SEI users call write() to marshal an SEI to a bitstream. SEI
+     * subclasses may implement write() or accept the default write()
+     * method which calls writeSEI() with a bitcounter to determine
+     * the size, then it encodes the header and calls writeSEI a
+     * second time for the real encode. */
+    virtual void write(Bitstream& bs, const SPS& sps);
+
+    virtual ~SEI() {}
+
+protected:
+
+    /* numeric codes written to the bitstream as payload_type */
+    enum PayloadType
+    {
+        BUFFERING_PERIOD                     = 0,
+        PICTURE_TIMING                       = 1,
+        PAN_SCAN_RECT                        = 2,
+        FILLER_PAYLOAD                       = 3,
+        USER_DATA_REGISTERED_ITU_T_T35       = 4,
+        USER_DATA_UNREGISTERED               = 5,
+        RECOVERY_POINT                       = 6,
+        SCENE_INFO                           = 9,
+        FULL_FRAME_SNAPSHOT                  = 15,
+        PROGRESSIVE_REFINEMENT_SEGMENT_START = 16,
+        PROGRESSIVE_REFINEMENT_SEGMENT_END   = 17,
+        FILM_GRAIN_CHARACTERISTICS           = 19,
+        POST_FILTER_HINT                     = 22,
+        TONE_MAPPING_INFO                    = 23,
+        FRAME_PACKING                        = 45,
+        DISPLAY_ORIENTATION                  = 47,
+        SOP_DESCRIPTION                      = 128,
+        ACTIVE_PARAMETER_SETS                = 129,
+        DECODING_UNIT_INFO                   = 130,
+        TEMPORAL_LEVEL0_INDEX                = 131,
+        DECODED_PICTURE_HASH                 = 132,
+        SCALABLE_NESTING                     = 133,
+        REGION_REFRESH_INFO                  = 134,
+        MASTERING_DISPLAY_INFO               = 137,
+        CONTENT_LIGHT_LEVEL_INFO             = 144,
+    };
+
+    /* the code identifying the concrete message; the default write() emits it as payload_type */
+    virtual PayloadType payloadType() const = 0;
+
+    /* writes the payload only. The default write() calls it twice, so an
+     * implementation must produce the same bits on both calls. This base
+     * version only trips a check: subclasses that rely on the default write()
+     * have to override it */
+    virtual void writeSEI(const SPS&) { X265_CHECK(0, "empty writeSEI method called\n");  }
+
+    /* pads the current writer to a byte boundary (see sei.cpp) */
+    void writeByteAlign();
+};
+
+/* user_data_unregistered SEI: the x265 UUID followed by m_userDataLength
+ * bytes read from m_userData. Header and payload are written directly by
+ * write(), so writeSEI() is not used */
+class SEIuserDataUnregistered : public SEI
+{
+public:
+
+    PayloadType payloadType() const { return USER_DATA_UNREGISTERED; }
+
+    /* start out empty: a zero length keeps write() from reading through the
+     * NULL m_userData when no user data has been attached yet */
+    SEIuserDataUnregistered() : m_userDataLength(0), m_userData(NULL) {}
+
+    static const uint8_t m_uuid_iso_iec_11578[16];
+    uint32_t m_userDataLength; /* number of bytes available at m_userData */
+    uint8_t *m_userData;       /* caller-provided bytes; this class never frees them */
+
+    /* Write the complete SEI message to bs. The SPS argument is unused */
+    void write(Bitstream& bs, const SPS&)
+    {
+        m_bitIf = &bs;
+
+        WRITE_CODE(USER_DATA_UNREGISTERED, 8, "payload_type");
+
+        /* payload = 16 byte UUID + user data; the size is coded as a run of
+         * 0xff bytes plus one byte holding the remainder */
+        uint32_t payloadSize = 16 + m_userDataLength;
+        for (; payloadSize >= 0xff; payloadSize -= 0xff)
+            WRITE_CODE(0xff, 8, "payload_size");
+        WRITE_CODE(payloadSize, 8, "payload_size");
+
+        for (uint32_t i = 0; i < 16; i++)
+            WRITE_CODE(m_uuid_iso_iec_11578[i], 8, "sei.uuid_iso_iec_11578[i]");
+
+        for (uint32_t i = 0; i < m_userDataLength; i++)
+            WRITE_CODE(m_userData[i], 8, "user_data");
+    }
+};
+
+/* mastering display colour volume SEI: three display primaries, the white
+ * point and the max/min mastering luminance. Header and payload are written
+ * directly by write() */
+class SEIMasteringDisplayColorVolume : public SEI
+{
+public:
+
+    uint16_t displayPrimaryX[3];
+    uint16_t displayPrimaryY[3];
+    uint16_t whitePointX, whitePointY;
+    uint32_t maxDisplayMasteringLuminance;
+    uint32_t minDisplayMasteringLuminance;
+
+    PayloadType payloadType() const { return MASTERING_DISPLAY_INFO; }
+
+    /* Fill every field from a string of the form
+     * "G(x,y)B(x,y)R(x,y)WP(x,y)L(max,min)". Returns true only when all ten
+     * numbers were converted; on failure some fields may already have been
+     * overwritten */
+    bool parse(const char* value)
+    {
+        const int converted = sscanf(value, "G(%hu,%hu)B(%hu,%hu)R(%hu,%hu)WP(%hu,%hu)L(%u,%u)",
+                                     &displayPrimaryX[0], &displayPrimaryY[0],
+                                     &displayPrimaryX[1], &displayPrimaryY[1],
+                                     &displayPrimaryX[2], &displayPrimaryY[2],
+                                     &whitePointX, &whitePointY,
+                                     &maxDisplayMasteringLuminance, &minDisplayMasteringLuminance);
+        return converted == 10;
+    }
+
+    /* Write the complete SEI message to bs. The SPS argument is unused */
+    void write(Bitstream& bs, const SPS&)
+    {
+        /* eight 16-bit values followed by two 32-bit values */
+        const uint32_t payloadBytes = 8 * 2 + 2 * 4;
+
+        m_bitIf = &bs;
+
+        WRITE_CODE(MASTERING_DISPLAY_INFO, 8, "payload_type");
+        WRITE_CODE(payloadBytes, 8, "payload_size");
+
+        for (uint32_t c = 0; c < 3; c++)
+        {
+            WRITE_CODE(displayPrimaryX[c], 16, "display_primaries_x[ c ]");
+            WRITE_CODE(displayPrimaryY[c], 16, "display_primaries_y[ c ]");
+        }
+
+        WRITE_CODE(whitePointX, 16, "white_point_x");
+        WRITE_CODE(whitePointY, 16, "white_point_y");
+
+        WRITE_CODE(maxDisplayMasteringLuminance, 32, "max_display_mastering_luminance");
+        WRITE_CODE(minDisplayMasteringLuminance, 32, "min_display_mastering_luminance");
+    }
+};
+
+/* content light level SEI: two 16-bit values. Header and payload are written
+ * directly by write() */
+class SEIContentLightLevel : public SEI
+{
+public:
+
+    uint16_t max_content_light_level;
+    uint16_t max_pic_average_light_level;
+
+    PayloadType payloadType() const { return CONTENT_LIGHT_LEVEL_INFO; }
+
+    /* Write the complete SEI message to bs. The SPS argument is unused */
+    void write(Bitstream& bs, const SPS&)
+    {
+        /* two 16-bit values */
+        const uint32_t payloadBytes = 4;
+
+        m_bitIf = &bs;
+
+        WRITE_CODE(CONTENT_LIGHT_LEVEL_INFO, 8, "payload_type");
+        WRITE_CODE(payloadBytes, 8, "payload_size");
+
+        WRITE_CODE(max_content_light_level, 16, "max_content_light_level");
+        WRITE_CODE(max_pic_average_light_level, 16, "max_pic_average_light_level");
+    }
+};
+
+/* decoded picture hash SEI: one digest per plane, in one of three formats.
+ * Header and payload are written directly by write() */
+class SEIDecodedPictureHash : public SEI
+{
+public:
+
+    PayloadType payloadType() const { return DECODED_PICTURE_HASH; }
+
+    /* the enumerator values (0, 1, 2) are written to the stream as hash_type */
+    enum Method
+    {
+        MD5,
+        CRC,
+        CHECKSUM,
+    } m_method;
+
+    /* one digest per plane, most significant byte first: MD5 uses all 16
+     * bytes, CRC the first 2 and CHECKSUM the first 4 */
+    uint8_t m_digest[3][16];
+
+    /* Write the complete SEI message to bs. The SPS argument is unused */
+    void write(Bitstream& bs, const SPS&)
+    {
+        m_bitIf = &bs;
+
+        WRITE_CODE(DECODED_PICTURE_HASH, 8, "payload_type");
+
+        /* payload = one hash_type byte + three digests of the selected width */
+        switch (m_method)
+        {
+        case MD5:
+            WRITE_CODE(1 + 16 * 3, 8, "payload_size");
+            WRITE_CODE(MD5, 8, "hash_type");
+            break;
+        case CRC:
+            WRITE_CODE(1 + 2 * 3, 8, "payload_size");
+            WRITE_CODE(CRC, 8, "hash_type");
+            break;
+        case CHECKSUM:
+            WRITE_CODE(1 + 4 * 3, 8, "payload_size");
+            WRITE_CODE(CHECKSUM, 8, "hash_type");
+            break;
+        }
+
+        for (int yuvIdx = 0; yuvIdx < 3; yuvIdx++)
+        {
+            const uint8_t* digest = m_digest[yuvIdx];
+
+            if (m_method == MD5)
+            {
+                for (uint32_t i = 0; i < 16; i++)
+                    WRITE_CODE(digest[i], 8, "picture_md5");
+            }
+            else if (m_method == CRC)
+            {
+                uint32_t val = ((uint32_t)digest[0] << 8) | (uint32_t)digest[1];
+                WRITE_CODE(val, 16, "picture_crc");
+            }
+            else if (m_method == CHECKSUM)
+            {
+                /* widen each byte before shifting: a uint8_t promotes to
+                 * signed int, and moving a byte >= 0x80 up by 24 would shift
+                 * into the sign bit */
+                uint32_t val = ((uint32_t)digest[0] << 24) |
+                               ((uint32_t)digest[1] << 16) |
+                               ((uint32_t)digest[2] << 8) |
+                               (uint32_t)digest[3];
+                WRITE_CODE(val, 32, "picture_checksum");
+            }
+        }
+    }
+};
+
+/* active parameter sets SEI. Only the payload is implemented here; the
+ * header comes from the default SEI::write() */
+class SEIActiveParameterSets : public SEI
+{
+public:
+
+    bool m_selfContainedCvsFlag;
+    bool m_noParamSetUpdateFlag;
+
+    PayloadType payloadType() const { return ACTIVE_PARAMETER_SETS; }
+
+    /* Write the payload: VPS id 0, the two flags, then a single SPS id of 0,
+     * padded to a byte boundary. The SPS argument is unused */
+    void writeSEI(const SPS&)
+    {
+        WRITE_CODE(0, 4, "active_vps_id");
+
+        WRITE_FLAG(m_selfContainedCvsFlag, "self_contained_cvs_flag");
+        WRITE_FLAG(m_noParamSetUpdateFlag, "no_param_set_update_flag");
+
+        WRITE_UVLC(0, "num_sps_ids_minus1");
+        WRITE_UVLC(0, "active_seq_param_set_id");
+
+        writeByteAlign();
+    }
+};
+
+/* buffering period SEI. Only the payload is implemented here; the header
+ * comes from the default SEI::write() */
+class SEIBufferingPeriod : public SEI
+{
+public:
+
+    /* NOTE(review): the two *Offset members are declared bool and are not
+     * written by writeSEI() -- confirm whether any caller still uses them */
+    bool     m_cpbDelayOffset;
+    bool     m_dpbDelayOffset;
+    uint32_t m_initialCpbRemovalDelay;
+    uint32_t m_initialCpbRemovalDelayOffset;
+    uint32_t m_auCpbRemovalDelayDelta;
+
+    /* the two initial-delay members are left unset here and must be assigned
+     * before writeSEI() is called */
+    SEIBufferingPeriod()
+        : m_cpbDelayOffset(0)
+        , m_dpbDelayOffset(0)
+        , m_auCpbRemovalDelayDelta(1)
+    {
+    }
+
+    PayloadType payloadType() const { return BUFFERING_PERIOD; }
+
+    /* Write the payload, taking the field widths from the HRD parameters in
+     * the VUI of sps, then pad to a byte boundary */
+    void writeSEI(const SPS& sps)
+    {
+        const VUI& vui = sps.vuiParameters;
+        const HRDInfo& hrd = vui.hrdParameters;
+
+        WRITE_UVLC(0, "bp_seq_parameter_set_id");
+        WRITE_FLAG(0, "rap_cpb_params_present_flag");
+        WRITE_FLAG(0, "concatenation_flag");
+
+        WRITE_CODE(m_auCpbRemovalDelayDelta - 1, hrd.cpbRemovalDelayLength, "au_cpb_removal_delay_delta_minus1");
+        WRITE_CODE(m_initialCpbRemovalDelay, hrd.initialCpbRemovalDelayLength, "initial_cpb_removal_delay");
+        WRITE_CODE(m_initialCpbRemovalDelayOffset, hrd.initialCpbRemovalDelayLength, "initial_cpb_removal_delay_offset");
+
+        writeByteAlign();
+    }
+};
+
+/* picture timing SEI. Only the payload is implemented here; the header comes
+ * from the default SEI::write() */
+class SEIPictureTiming : public SEI
+{
+public:
+
+    PayloadType payloadType() const { return PICTURE_TIMING; }
+
+    /* written only when the VUI sets frameFieldInfoPresentFlag */
+    uint32_t  m_picStruct;
+    uint32_t  m_sourceScanType;
+    bool      m_duplicateFlag;
+
+    /* written only when the VUI sets hrdParametersPresentFlag */
+    uint32_t  m_auCpbRemovalDelay;
+    uint32_t  m_picDpbOutputDelay;
+
+    /* Write the payload sections enabled by the VUI of sps, taking the field
+     * widths from its HRD parameters, then pad to a byte boundary */
+    void writeSEI(const SPS& sps)
+    {
+        const VUI& vui = sps.vuiParameters;
+        const HRDInfo& hrd = vui.hrdParameters;
+
+        if (vui.frameFieldInfoPresentFlag)
+        {
+            WRITE_CODE(m_picStruct, 4, "pic_struct");
+            WRITE_CODE(m_sourceScanType, 2, "source_scan_type");
+            WRITE_FLAG(m_duplicateFlag, "duplicate_flag");
+        }
+
+        if (vui.hrdParametersPresentFlag)
+        {
+            WRITE_CODE(m_auCpbRemovalDelay - 1, hrd.cpbRemovalDelayLength, "au_cpb_removal_delay_minus1");
+            WRITE_CODE(m_picDpbOutputDelay, hrd.dpbOutputDelayLength, "pic_dpb_output_delay");
+            /* Removed sub-pic signaling June 2014 */
+        }
+
+        writeByteAlign();
+    }
+};
+
+/* recovery point SEI. Only the payload is implemented here; the header comes
+ * from the default SEI::write() */
+class SEIRecoveryPoint : public SEI
+{
+public:
+
+    int  m_recoveryPocCnt;
+    bool m_exactMatchingFlag;
+    bool m_brokenLinkFlag;
+
+    PayloadType payloadType() const { return RECOVERY_POINT; }
+
+    /* Write the payload: the signed POC count and the two flags, padded to a
+     * byte boundary. The SPS argument is unused */
+    void writeSEI(const SPS&)
+    {
+        WRITE_SVLC(m_recoveryPocCnt, "recovery_poc_cnt");
+
+        WRITE_FLAG(m_exactMatchingFlag, "exact_matching_flag");
+        WRITE_FLAG(m_brokenLinkFlag, "broken_link_flag");
+
+        writeByteAlign();
+    }
+};
+}
+
+#endif // ifndef X265_SEI_H
--- a/x265/source/encoder/slicetype.cpp
+++ b/x265/source/encoder/slicetype.cpp
--- a/x265/source/encoder/slicetype.h
+++ b/x265/source/encoder/slicetype.h
@ -0,0 +1,243 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Steve Borho <steve@borho.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_SLICETYPE_H
+#define X265_SLICETYPE_H
+
+#include "common.h"
+#include "slice.h"
+#include "motion.h"
+#include "piclist.h"
+#include "threadpool.h"
+
+namespace X265_NS {
+// private namespace
+
+struct Lowres;
+class Frame;
+class Lookahead;
+
+#define LOWRES_COST_MASK  ((1 << 14) - 1)
+#define LOWRES_COST_SHIFT 14
+
+/* Per-thread working state for lookahead tasks: a motion estimator, weighted
+ * reference planes with their backing buffers, and the lowres CU geometry. */
+struct LookaheadTLD
+{
+    MotionEstimate  me;
+    ReferencePlanes weightedRef;
+    pixel*          wbuffer[4];
+    int             widthInCU;
+    int             heightInCU;
+    int             ncu;
+    int             paddedLines;
+
+#if DETAILED_CU_STATS
+    int64_t         batchElapsedTime;
+    int64_t         coopSliceElapsedTime;
+    uint64_t        countBatches;
+    uint64_t        countCoopSlices;
+#endif
+
+    /* Configure the motion estimator for lookahead use and clear every
+     * buffer pointer, dimension and statistic. */
+    LookaheadTLD()
+    {
+        me.setQP(X265_LOOKAHEAD_QP);
+        me.init(X265_HEX_SEARCH, 1, X265_CSP_I400);
+
+        wbuffer[0] = wbuffer[1] = wbuffer[2] = wbuffer[3] = NULL;
+
+        widthInCU = 0;
+        heightInCU = 0;
+        ncu = 0;
+        paddedLines = 0;
+
+#if DETAILED_CU_STATS
+        batchElapsedTime = coopSliceElapsedTime = 0;
+        countBatches = countCoopSlices = 0;
+#endif
+    }
+
+    /* Record the lowres frame geometry: width and height in CUs and the
+     * total CU count. paddedLines is not touched here. */
+    void init(int w, int h, int n)
+    {
+        ncu = n;
+        heightInCU = h;
+        widthInCU = w;
+    }
+
+    /* Only wbuffer[0] is released here; presumably the other entries point
+     * into the same allocation -- confirm in allocWeightedRef(). */
+    ~LookaheadTLD()
+    {
+        X265_FREE(wbuffer[0]);
+    }
+
+    void calcAdaptiveQuantFrame(Frame *curFrame, x265_param* param);
+    void lowresIntraEstimate(Lowres& fenc);
+
+    void weightsAnalyse(Lowres& fenc, Lowres& ref);
+
+protected:
+
+    uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp);
+    uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp);
+    bool     allocWeightedRef(Lowres& fenc);
+};
+
+/* Lookahead job provider. Holds the queue of received input pictures and the
+ * queue of pictures ready to be encoded, plus the state used by the slice
+ * type decision, scenecut, VBV and cuTree helpers declared below. All method
+ * bodies live outside this header. */
+class Lookahead : public JobProvider
+{
+public:
+
+    PicList       m_inputQueue;      // input pictures in order received
+    PicList       m_outputQueue;     // pictures to be encoded, in encode order
+    /* presumably guard m_inputQueue and m_outputQueue respectively --
+     * confirm in slicetype.cpp */
+    Lock          m_inputLock;
+    Lock          m_outputLock;
+
+    /* pre-lookahead */
+    int           m_fullQueueSize;
+    bool          m_isActive;
+    bool          m_sliceTypeBusy;
+    bool          m_bAdaptiveQuant;
+    bool          m_outputSignalRequired;
+    bool          m_bBatchMotionSearch;
+    bool          m_bBatchFrameCosts;
+    Event         m_outputSignal;
+
+    /* array of thread local data; ownership and length are not visible in
+     * this header */
+    LookaheadTLD* m_tld;
+    x265_param*   m_param;
+    Lowres*       m_lastNonB;
+    int*          m_scratch;         // temp buffer for cutree propagate
+    
+    int           m_histogram[X265_BFRAME_MAX + 1];
+    int           m_lastKeyframe;
+    /* lowres geometry in 8x8 blocks (names suggest width, height and total
+     * count -- set outside this header) */
+    int           m_8x8Width;
+    int           m_8x8Height;
+    int           m_8x8Blocks;
+    int           m_numCoopSlices;
+    int           m_numRowsPerSlice;
+    bool          m_filled;
+    bool          m_isSceneTransition;
+    Lookahead(x265_param *param, ThreadPool *pool);
+
+#if DETAILED_CU_STATS
+    /* timing and call counters, only compiled with DETAILED_CU_STATS */
+    int64_t       m_slicetypeDecideElapsedTime;
+    int64_t       m_preLookaheadElapsedTime;
+    uint64_t      m_countSlicetypeDecide;
+    uint64_t      m_countPreLookahead;
+    void          getWorkerStats(int64_t& batchElapsedTime, uint64_t& batchCount, int64_t& coopSliceElapsedTime, uint64_t& coopSliceCount);
+#endif
+
+    /* lifecycle */
+    bool    create();
+    void    destroy();
+    void    stopJobs();
+
+    /* picture flow: pictures are added with a slice type hint and later
+     * retrieved once decided */
+    void    addPicture(Frame&, int sliceType);
+    void    flush();
+    Frame*  getDecidedPicture();
+
+    void    getEstimatedPictureCost(Frame *pic);
+
+
+protected:
+
+    /* worker entry point, takes the id of the calling worker thread */
+    void    findJob(int workerThreadID);
+    void    slicetypeDecide();
+    void    slicetypeAnalyse(Lowres **frames, bool bKeyframe);
+
+    /* called by slicetypeAnalyse() to make slice decisions */
+    bool    scenecut(Lowres **frames, int p0, int p1, bool bRealScenecut, int numFrames);
+    bool    scenecutInternal(Lowres **frames, int p0, int p1, bool bRealScenecut);
+    void    slicetypePath(Lowres **frames, int length, char(*best_paths)[X265_LOOKAHEAD_MAX + 1]);
+    int64_t slicetypePathCost(Lowres **frames, char *path, int64_t threshold);
+    int64_t vbvFrameCost(Lowres **frames, int p0, int p1, int b);
+    void    vbvLookahead(Lowres **frames, int numFrames, int keyframes);
+
+    /* called by slicetypeAnalyse() to effect cuTree adjustments to adaptive
+     * quant offsets */
+    void    cuTree(Lowres **frames, int numframes, bool bintra);
+    void    estimateCUPropagate(Lowres **frames, double average_duration, int p0, int p1, int b, int referenced);
+    void    cuTreeFinish(Lowres *frame, double averageDuration, int ref0Distance);
+
+    /* called by getEstimatedPictureCost() to finalize cuTree costs */
+    int64_t frameCostRecalculate(Lowres **frames, int p0, int p1, int b);
+};
+
+/* Bonded task group bound to one Lookahead instance. Carries up to
+ * X265_LOOKAHEAD_MAX frame pointers for processTasks() to work on. */
+class PreLookaheadGroup : public BondedTaskGroup
+{
+public:
+
+    /* not initialized by the constructor; filled by the user of the group */
+    Frame* m_preframes[X265_LOOKAHEAD_MAX];
+    Lookahead& m_lookahead;
+
+    PreLookaheadGroup(Lookahead& l) : m_lookahead(l) {}
+
+    /* task entry point, takes the id of the calling worker thread */
+    void processTasks(int workerThreadID);
+
+protected:
+
+    /* declared but not defined in this header: the class holds a reference
+     * member, so assignment is made inaccessible to outside code */
+    PreLookaheadGroup& operator=(const PreLookaheadGroup&);
+};
+
+/* Bonded task group that estimates frame costs for a Lookahead, either as a
+ * single cooperative estimate split into slices or as a batch of independent
+ * estimates (see the two sections below). */
+class CostEstimateGroup : public BondedTaskGroup
+{
+public:
+
+    Lookahead& m_lookahead;
+    Lowres**   m_frames;
+    /* starts false; the constructor leaves m_coop, m_slice and m_estimates
+     * uninitialized */
+    bool       m_batchMode;
+
+    CostEstimateGroup(Lookahead& l, Lowres** f) : m_lookahead(l), m_frames(f), m_batchMode(false) {}
+
+    /* Cooperative cost estimate using multiple slices of downscaled frame */
+    struct Coop
+    {
+        int  p0, b, p1;
+        bool bDoSearch[2];
+    } m_coop;
+
+    /* capacity of m_slice; one accumulator set per cooperative slice */
+    enum { MAX_COOP_SLICES = 32 };
+    struct Slice
+    {
+        int  costEst;
+        int  costEstAq;
+        int  intraMbs;
+    } m_slice[MAX_COOP_SLICES];
+
+    int64_t singleCost(int p0, int p1, int b, bool intraPenalty = false);
+
+    /* Batch cost estimates, using one worker thread per estimateFrameCost() call */
+    /* capacity of m_estimates */
+    enum { MAX_BATCH_SIZE = 512 };
+    struct Estimate
+    {
+        int  p0, b, p1;
+    } m_estimates[MAX_BATCH_SIZE];
+
+    /* note the argument order (p0, p1, b) differs from the field order of
+     * Estimate (p0, b, p1) */
+    void add(int p0, int p1, int b);
+    void finishBatch();
+
+protected:
+
+    static const int s_merange = 16;
+
+    /* task entry point, takes the id of the calling worker thread */
+    void    processTasks(int workerThreadID);
+
+    int64_t estimateFrameCost(LookaheadTLD& tld, int p0, int p1, int b, bool intraPenalty);
+    void    estimateCUCost(LookaheadTLD& tld, int cux, int cuy, int p0, int p1, int b, bool bDoSearch[2], bool lastRow, int slice);
+
+    /* declared but not defined in this header: the class holds a reference
+     * member, so assignment is made inaccessible to outside code */
+    CostEstimateGroup& operator=(const CostEstimateGroup&);
+};
+
+}
+
+#endif // ifndef X265_SLICETYPE_H
--- a/x265/source/encoder/weightPrediction.cpp
+++ b/x265/source/encoder/weightPrediction.cpp
@ -0,0 +1,536 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Author: Shazeb Nawaz Khan <shazeb@multicorewareinc.com>
+ *         Steve Borho <steve@borho.org>
+ *         Kavitha Sampas <kavitha@multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#include "common.h"
+#include "frame.h"
+#include "picyuv.h"
+#include "lowres.h"
+#include "slice.h"
+#include "mv.h"
+#include "bitstream.h"
+
+using namespace X265_NS;
+namespace {
+/* Per-frame values shared by the weight analysis helpers in this file;
+ * filled once at the top of weightAnalyse(). */
+struct Cache
+{
+    const int * intraCost;        /* lowres intra cost per CU of the frame being encoded */
+    int         numPredDir;       /* 1 for inter-P slices, otherwise 2 */
+    int         csp;              /* color space of the source picture */
+    int         hshift;           /* CHROMA_H_SHIFT(csp) */
+    int         vshift;           /* CHROMA_V_SHIFT(csp) */
+    int         lowresWidthInCU;  /* lowres width  >> 3 */
+    int         lowresHeightInCU; /* lowres lines  >> 3 */
+};
+
+/* Lambda-weighted estimate of the bits needed to signal weight w[0].
+ * For chroma (bChroma == 1) lambda is scaled by 4 because chroma is analysed
+ * at full resolution, and the denominator is counted once instead of twice. */
+int sliceHeaderCost(WeightParam *w, int lambda, int bChroma)
+{
+    const int scaledLambda = bChroma ? lambda * 4 : lambda;
+    const int denomBits = bs_size_ue(w[0].log2WeightDenom) * (2 - bChroma);
+    const int paramBits = bs_size_se(w[0].inputWeight) + bs_size_se(w[0].inputOffset);
+
+    return scaledLambda * (10 + denomBits + 2 * paramBits);
+}
+
+/* Write a motion compensated copy of the lowres reference luma into mcout,
+ * one 8x8 block per entry of mvs taken in raster order. mcout uses the same
+ * stride as ref, and its borders are not extended. */
+void mcLuma(pixel* mcout, Lowres& ref, const MV * mvs)
+{
+    const int blockDim = 8;
+    const int mvScale = 1 << 2;   /* pixel distance -> MV units */
+    intptr_t planeStride = ref.lumaStride;
+    MV lowerBound, upperBound;
+    int blockIdx = 0;
+
+    for (int row = 0; row < ref.lines; row += blockDim)
+    {
+        /* vertical limits: the plane plus an 8 pixel margin */
+        lowerBound.y = (int16_t)((-row - 8) * mvScale);
+        upperBound.y = (int16_t)((ref.lines - row - 1 + 8) * mvScale);
+
+        intptr_t blockOffset = row * planeStride;
+        for (int col = 0; col < ref.width; col += blockDim)
+        {
+            ALIGN_VAR_16(pixel, scratch[8 * 8]);
+            intptr_t srcStride = 8;   /* stride of scratch, fresh for every block */
+
+            lowerBound.x = (int16_t)((-col - 8) * mvScale);
+            upperBound.x = (int16_t)((ref.width - col - 1 + 8) * mvScale);
+
+            /* restrict the vector to available pixels */
+            MV blockMv = mvs[blockIdx];
+            blockMv = blockMv.clipped(lowerBound, upperBound);
+
+            pixel *predicted = ref.lowresMC(blockOffset, blockMv, scratch, srcStride);
+            primitives.cu[BLOCK_8x8].copy_pp(mcout + blockOffset, planeStride, predicted, srcStride);
+
+            blockOffset += blockDim;
+            blockIdx++;
+        }
+    }
+}
+
+/* use lowres MVs from lookahead to generate a motion compensated chroma plane.
+ * if a block had cheaper lowres cost as intra, we treat it as MV 0
+ *
+ * mcout   destination plane, written with the same stride as src
+ * src     source chroma plane of the reference picture
+ * stride  stride of both src and mcout
+ * mvs     lowres motion vectors, one per lowres CU
+ * cache   supplies csp, the chroma shifts and the lowres CU dimensions
+ * height  number of chroma rows to process
+ * width   number of chroma columns to process */
+void mcChroma(pixel *      mcout,
+              pixel *      src,
+              intptr_t     stride,
+              const MV *   mvs,
+              const Cache& cache,
+              int          height,
+              int          width)
+{
+    /* the motion vectors correspond to 8x8 lowres luma blocks, or 16x16 fullres
+     * luma blocks. We have to adapt block size to chroma csp */
+    int csp = cache.csp;
+    int bw = 16 >> cache.hshift;
+    int bh = 16 >> cache.vshift;
+    const int mvshift = 1 << 2;
+    MV mvmin, mvmax;
+
+    for (int y = 0; y < height; y += bh)
+    {
+        /* note: lowres block count per row might be different from chroma block
+         * count per row because of rounding issues, so be very careful with indexing
+         * into the lowres structures */
+        /* NOTE(review): x and y advance in chroma pixels (steps of bw / bh),
+         * yet cu is seeded with y * lowresWidthInCU and the guard below
+         * compares x and y directly against CU counts. The units look mixed;
+         * confirm against the lowres MV layout before relying on this. */
+        int cu = y * cache.lowresWidthInCU;
+        intptr_t pixoff = y * stride;
+        mvmin.y = (int16_t)((-y - 8) * mvshift);
+        mvmax.y = (int16_t)((height - y - 1 + 8) * mvshift);
+
+        for (int x = 0; x < width; x += bw, cu++, pixoff += bw)
+        {
+            if (x < cache.lowresWidthInCU && y < cache.lowresHeightInCU)
+            {
+                MV mv = mvs[cu]; // lowres MV
+                mv <<= 1;        // fullres MV
+                /* scale the luma vector down by the chroma subsampling */
+                mv.x >>= cache.hshift;
+                mv.y >>= cache.vshift;
+
+                /* clip MV to available pixels */
+                mvmin.x = (int16_t)((-x - 8) * mvshift);
+                mvmax.x = (int16_t)((width - x - 1 + 8) * mvshift);
+                mv = mv.clipped(mvmin, mvmax);
+
+                /* integer part of the vector selects the source position */
+                intptr_t fpeloffset = (mv.y >> 2) * stride + (mv.x >> 2);
+                pixel *temp = src + pixoff + fpeloffset;
+
+                /* the low three bits pick the interpolation phase; choose the
+                 * cheapest primitive that covers the non-zero phases */
+                int xFrac = mv.x & 0x7;
+                int yFrac = mv.y & 0x7;
+                if ((yFrac | xFrac) == 0)
+                {
+                    primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, temp, stride);
+                }
+                else if (yFrac == 0)
+                {
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_hpp(temp, stride, mcout + pixoff, stride, xFrac);
+                }
+                else if (xFrac == 0)
+                {
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_vpp(temp, stride, mcout + pixoff, stride, yFrac);
+                }
+                else
+                {
+                    /* two pass filter: horizontal into a 16 bit intermediate
+                     * buffer of stride bw, then vertical into mcout */
+                    ALIGN_VAR_16(int16_t, imm[16 * (16 + NTAPS_CHROMA)]);
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_hps(temp, stride, imm, bw, xFrac, 1);
+                    primitives.chroma[csp].pu[LUMA_16x16].filter_vsp(imm + ((NTAPS_CHROMA >> 1) - 1) * bw, bw, mcout + pixoff, stride, yFrac);
+                }
+            }
+            else
+            {
+                /* no motion vector used for this block: plain copy */
+                primitives.chroma[csp].pu[LUMA_16x16].copy_pp(mcout + pixoff, stride, src + pixoff, stride);
+            }
+        }
+    }
+}
+
+/* Measure the sum of satd costs between a source plane and a reference
+ * plane (potentially weighted, potentially motion compensated). We
+ * always use source images for this analysis since reference recon
+ * pixels have unreliable availability.
+ *
+ * fenc        source plane
+ * ref         reference plane
+ * weightTemp  scratch plane that receives the weighted reference when w is
+ *             non-NULL
+ * stride      stride shared by fenc, ref and weightTemp
+ * cache       supplies intraCost (luma path) and csp (chroma block size)
+ * width       width of the measured area
+ * height      height of the measured area
+ * w           weight applied to ref before measuring, or NULL for none
+ * bLuma       true: 8x8 blocks, each capped by the intra cost of that block;
+ *             false: 16x16 blocks for 4:4:4, otherwise 8x8 blocks
+ *
+ * Returns the accumulated cost. */
+uint32_t weightCost(pixel *         fenc,
+                    pixel *         ref,
+                    pixel *         weightTemp,
+                    intptr_t        stride,
+                    const Cache &   cache,
+                    int             width,
+                    int             height,
+                    WeightParam *   w,
+                    bool            bLuma)
+{
+    if (w)
+    {
+        /* make a weighted copy of the reference plane */
+        /* scale by multiplication: inputOffset may be negative and
+         * left-shifting a negative signed value is undefined behaviour */
+        int offset = w->inputOffset * (1 << (X265_DEPTH - 8));
+        int weight = w->inputWeight;
+        int denom = w->log2WeightDenom;
+        int round = denom ? 1 << (denom - 1) : 0;
+        int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
+        int pwidth = ((width + 15) >> 4) << 4;          /* width rounded up to a multiple of 16 */
+
+        primitives.weight_pp(ref, weightTemp, stride, pwidth, height,
+                             weight, round << correction, denom + correction, offset);
+        ref = weightTemp;
+    }
+
+    uint32_t cost = 0;
+    pixel *f = fenc, *r = ref;
+
+    if (bLuma)
+    {
+        /* cu walks the intra cost table in raster order alongside the blocks */
+        int cu = 0;
+        for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
+        {
+            for (int x = 0; x < width; x += 8, cu++)
+            {
+                int cmp = primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
+                cost += X265_MIN(cmp, cache.intraCost[cu]);
+            }
+        }
+    }
+    else if (cache.csp == X265_CSP_I444)
+    {
+        for (int y = 0; y < height; y += 16, r += 16 * stride, f += 16 * stride)
+        {
+            for (int x = 0; x < width; x += 16)
+                cost += primitives.pu[LUMA_16x16].satd(r + x, stride, f + x, stride);
+        }
+    }
+    else
+    {
+        for (int y = 0; y < height; y += 8, r += 8 * stride, f += 8 * stride)
+        {
+            for (int x = 0; x < width; x += 8)
+                cost += primitives.pu[LUMA_8x8].satd(r + x, stride, f + x, stride);
+        }
+    }
+
+    return cost;
+}
+}
+
+namespace X265_NS {
+/* Search explicit weighted prediction parameters for reference 0 of each
+ * prediction list of the slice and copy the resulting table into
+ * slice.m_weightPredTable. Weights for references other than 0 are reset to
+ * "not present" using the denominators chosen for reference 0.
+ *
+ * slice  slice whose weight table is written; disableWeights() is called on
+ *        it if the scratch buffer cannot be allocated
+ * frame  frame being encoded; its source picture and lowres data are read
+ * param  encoder parameters (bframes and logLevel are read here)
+ *
+ * May set m_bChromaExtended on a reference frame after extending the borders
+ * of its source chroma planes. At X265_LOG_FULL the chosen weights are logged. */
+void weightAnalyse(Slice& slice, Frame& frame, x265_param& param)
+{
+    WeightParam wp[2][MAX_NUM_REF][3];
+    PicYuv *fencPic = frame.m_fencPic;
+    Lowres& fenc    = frame.m_lowres;
+
+    Cache cache;
+
+    memset(&cache, 0, sizeof(cache));
+    cache.intraCost = fenc.intraCost;
+    cache.numPredDir = slice.isInterP() ? 1 : 2;
+    cache.lowresWidthInCU = fenc.width >> 3;
+    cache.lowresHeightInCU = fenc.lines >> 3;
+    cache.csp = fencPic->m_picCsp;
+    cache.hshift = CHROMA_H_SHIFT(cache.csp);
+    cache.vshift = CHROMA_V_SHIFT(cache.csp);
+
+    /* Use single allocation for motion compensated ref and weight buffers */
+    pixel *mcbuf = X265_MALLOC(pixel, 2 * fencPic->m_stride * fencPic->m_picHeight);
+    if (!mcbuf)
+    {
+        slice.disableWeights();
+        return;
+    }
+    /* second half of the allocation holds the weighted copy */
+    pixel *weightTemp = mcbuf + fencPic->m_stride * fencPic->m_picHeight;
+
+    int lambda = (int)x265_lambda_tab[X265_LOOKAHEAD_QP];
+    int curPoc = slice.m_poc;
+    const float epsilon = 1.f / 128.f;
+
+    int chromaDenom, lumaDenom, denom;
+    chromaDenom = lumaDenom = 7;
+    /* pixel counts per plane, from the picture size rounded up to 16 */
+    int numpixels[3];
+    int w16 = ((fencPic->m_picWidth  + 15) >> 4) << 4;
+    int h16 = ((fencPic->m_picHeight + 15) >> 4) << 4;
+    numpixels[0] = w16 * h16;
+    numpixels[1] = numpixels[2] = numpixels[0] >> (cache.hshift + cache.vshift);
+
+    for (int list = 0; list < cache.numPredDir; list++)
+    {
+        /* only reference 0 of each list is analysed */
+        WeightParam *weights = wp[list][0];
+        Frame *refFrame = slice.m_refFrameList[list][0];
+        Lowres& refLowres = refFrame->m_lowres;
+        int diffPoc = abs(curPoc - refFrame->m_poc);
+
+        /* prepare estimates */
+        float guessScale[3], fencMean[3], refMean[3];
+        for (int plane = 0; plane < 3; plane++)
+        {
+            SET_WEIGHT(weights[plane], false, 1, 0, 0);
+            /* !refLowres.wp_ssd is 1 only when the reference ssd is zero; adding
+             * it to both terms keeps refVar non-zero for the division below */
+            uint64_t fencVar = fenc.wp_ssd[plane] + !refLowres.wp_ssd[plane];
+            uint64_t refVar  = refLowres.wp_ssd[plane] + !refLowres.wp_ssd[plane];
+            guessScale[plane] = sqrt((float)fencVar / refVar);
+            fencMean[plane] = (float)fenc.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
+            refMean[plane]  = (float)refLowres.wp_sum[plane] / (numpixels[plane]) / (1 << (X265_DEPTH - 8));
+        }
+
+        /* make sure both our scale factors fit */
+        /* chromaDenom is only lowered while processing list 0 */
+        while (!list && chromaDenom > 0)
+        {
+            float thresh = 127.f / (1 << chromaDenom);
+            if (guessScale[1] < thresh && guessScale[2] < thresh)
+                break;
+            chromaDenom--;
+        }
+
+        SET_WEIGHT(weights[1], false, 1 << chromaDenom, chromaDenom, 0);
+        SET_WEIGHT(weights[2], false, 1 << chromaDenom, chromaDenom, 0);
+
+        /* lowres motion vectors towards this reference, when available */
+        MV *mvs = NULL;
+
+        for (int plane = 0; plane < 3; plane++)
+        {
+            denom = plane ? chromaDenom : lumaDenom;
+            /* chroma weights are only searched when a luma weight was found */
+            if (plane && !weights[0].bPresentFlag)
+                break;
+
+            /* Early termination */
+            x265_emms();
+            if (fabsf(refMean[plane] - fencMean[plane]) < 0.5f && fabsf(1.f - guessScale[plane]) < epsilon)
+            {
+                SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
+                continue;
+            }
+
+            if (plane)
+            {
+                int scale = x265_clip3(0, 255, (int)(guessScale[plane] * (1 << denom) + 0.5f));
+                if (scale > 127)
+                    continue;
+                weights[plane].inputWeight = scale;
+            }
+            else
+            {
+                weights[plane].setFromWeightAndOffset((int)(guessScale[plane] * (1 << denom) + 0.5f), 0, denom, !list);
+            }
+
+            /* starting point of the search */
+            int mindenom = weights[plane].log2WeightDenom;
+            int minscale = weights[plane].inputWeight;
+            int minoff = 0;
+
+            /* mvs is looked up once, on the luma pass, and reused for chroma */
+            if (!plane && diffPoc <= param.bframes + 1)
+            {
+                mvs = fenc.lowresMvs[list][diffPoc - 1];
+
+                /* test whether this motion search was performed by lookahead */
+                if (mvs[0].x != 0x7FFF)
+                {
+                    /* reference chroma planes must be extended prior to being
+                     * used as motion compensation sources */
+                    if (!refFrame->m_bChromaExtended)
+                    {
+                        refFrame->m_bChromaExtended = true;
+                        PicYuv *refPic = refFrame->m_fencPic;
+                        int width = refPic->m_picWidth >> cache.hshift;
+                        int height = refPic->m_picHeight >> cache.vshift;
+                        extendPicBorder(refPic->m_picOrg[1], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
+                        extendPicBorder(refPic->m_picOrg[2], refPic->m_strideC, width, height, refPic->m_chromaMarginX, refPic->m_chromaMarginY);
+                    }
+                }
+                else
+                    mvs = 0;
+            }
+
+            /* prepare inputs to weight analysis */
+            pixel *orig;
+            pixel *fref;
+            intptr_t stride;
+            int    width, height;
+            switch (plane)
+            {
+            case 0:
+                /* luma is analysed on the lowres planes */
+                orig = fenc.lowresPlane[0];
+                stride = fenc.lumaStride;
+                width = fenc.width;
+                height = fenc.lines;
+                fref = refLowres.lowresPlane[0];
+                if (mvs)
+                {
+                    mcLuma(mcbuf, refLowres, mvs);
+                    fref = mcbuf;
+                }
+                break;
+
+            case 1:
+                /* chroma is analysed on the source picture planes */
+                orig = fencPic->m_picOrg[1];
+                stride = fencPic->m_strideC;
+                fref = refFrame->m_fencPic->m_picOrg[1];
+
+                /* Clamp the chroma dimensions to the nearest multiple of
+                 * 8x8 blocks (or 16x16 for 4:4:4) since mcChroma uses lowres
+                 * blocks and weightCost measures 8x8 blocks. This
+                 * potentially ignores some edge pixels, but simplifies the
+                 * logic and prevents reading uninitialized pixels. Lowres
+                 * planes are border extended and require no clamping. */
+                width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
+                height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
+                if (mvs)
+                {
+                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
+                    fref = mcbuf;
+                }
+                break;
+
+            case 2:
+                /* same setup as case 1, using the second chroma plane */
+                orig = fencPic->m_picOrg[2];
+                stride = fencPic->m_strideC;
+                fref = refFrame->m_fencPic->m_picOrg[2];
+                width =  ((fencPic->m_picWidth  >> 4) << 4) >> cache.hshift;
+                height = ((fencPic->m_picHeight >> 4) << 4) >> cache.vshift;
+                if (mvs)
+                {
+                    mcChroma(mcbuf, fref, stride, mvs, cache, height, width);
+                    fref = mcbuf;
+                }
+                break;
+
+            default:
+                /* not reachable with plane in 0..2; kept as a safe exit */
+                slice.disableWeights();
+                X265_FREE(mcbuf);
+                return;
+            }
+
+            /* unweighted cost is the baseline; zero means nothing to gain */
+            uint32_t origscore = weightCost(orig, fref, weightTemp, stride, cache, width, height, NULL, !plane);
+            if (!origscore)
+            {
+                SET_WEIGHT(weights[plane], 0, 1 << denom, denom, 0);
+                continue;
+            }
+
+            uint32_t minscore = origscore;
+            bool bFound = false;
+
+            /* x264 uses a table lookup here, selecting search range based on preset */
+            static const int scaleDist = 4;
+            static const int offsetDist = 2;
+
+            /* try scales around the initial guess, and for each scale a few
+             * offsets around the one implied by the plane means */
+            int startScale = x265_clip3(0, 127, minscale - scaleDist);
+            int endScale   = x265_clip3(0, 127, minscale + scaleDist);
+            for (int scale = startScale; scale <= endScale; scale++)
+            {
+                int deltaWeight = scale - (1 << mindenom);
+                if (deltaWeight > 127 || deltaWeight <= -128)
+                    continue;
+
+                x265_emms();
+                int curScale = scale;
+                int curOffset = (int)(fencMean[plane] - refMean[plane] * curScale / (1 << mindenom) + 0.5f);
+                if (curOffset < -128 || curOffset > 127)
+                {
+                    /* Rescale considering the constraints on curOffset. We do it in this order
+                     * because scale has a much wider range than offset (because of denom), so
+                     * it should almost never need to be clamped. */
+                    curOffset = x265_clip3(-128, 127, curOffset);
+                    curScale = (int)((1 << mindenom) * (fencMean[plane] - curOffset) / refMean[plane] + 0.5f);
+                    curScale = x265_clip3(0, 127, curScale);
+                }
+
+                int startOffset = x265_clip3(-128, 127, curOffset - offsetDist);
+                int endOffset   = x265_clip3(-128, 127, curOffset + offsetDist);
+                for (int off = startOffset; off <= endOffset; off++)
+                {
+                    /* candidate cost = weighted satd + signalling cost */
+                    WeightParam wsp;
+                    SET_WEIGHT(wsp, true, curScale, mindenom, off);
+                    uint32_t s = weightCost(orig, fref, weightTemp, stride, cache, width, height, &wsp, !plane) +
+                                 sliceHeaderCost(&wsp, lambda, !!plane);
+                    COPY4_IF_LT(minscore, s, minscale, curScale, minoff, off, bFound, true);
+
+                    /* Don't check any more offsets if the previous one had a lower cost than the current one */
+                    if (minoff == startOffset && off != startOffset)
+                        break;
+                }
+            }
+
+            /* Use a smaller luma denominator if possible */
+            /* only for luma of list 0: halve scale and denominator together
+             * while the scale stays an integer */
+            if (!(plane || list))
+            {
+                while (mindenom > 0 && !(minscale & 1))
+                {
+                    mindenom--;
+                    minscale >>= 1;
+                }
+            }
+
+            /* reject the result when nothing was found, when it equals the
+             * identity weight, or when it improves the cost by under 0.2% */
+            if (!bFound || (minscale == (1 << mindenom) && minoff == 0) || (float)minscore / origscore > 0.998f)
+            {
+                SET_WEIGHT(weights[plane], false, 1 << denom, denom, 0);
+            }
+            else
+            {
+                SET_WEIGHT(weights[plane], true, minscale, mindenom, minoff);
+            }
+        }
+
+        if (weights[0].bPresentFlag)
+        {
+            // Make sure both chroma channels match
+            if (weights[1].bPresentFlag != weights[2].bPresentFlag)
+            {
+                if (weights[1].bPresentFlag)
+                    weights[2] = weights[1];
+                else
+                    weights[1] = weights[2];
+            }
+        }
+
+        /* the denominators chosen here carry over to the next list */
+        lumaDenom = weights[0].log2WeightDenom;
+        chromaDenom = weights[1].log2WeightDenom;
+
+        /* reset weight states */
+        for (int ref = 1; ref < slice.m_numRefIdx[list]; ref++)
+        {
+            SET_WEIGHT(wp[list][ref][0], false, 1 << lumaDenom, lumaDenom, 0);
+            SET_WEIGHT(wp[list][ref][1], false, 1 << chromaDenom, chromaDenom, 0);
+            SET_WEIGHT(wp[list][ref][2], false, 1 << chromaDenom, chromaDenom, 0);
+        }
+    }
+
+    X265_FREE(mcbuf);
+
+    /* publish the whole local table to the slice */
+    memcpy(slice.m_weightPredTable, wp, sizeof(WeightParam) * 2 * MAX_NUM_REF * 3);
+
+    if (param.logLevel >= X265_LOG_FULL)
+    {
+        /* build one log line listing the present weights of reference 0 of
+         * each list; p tracks the write position in buf */
+        char buf[1024];
+        int p = 0;
+        bool bWeighted = false;
+
+        p = sprintf(buf, "poc: %d weights:", slice.m_poc);
+        int numPredDir = slice.isInterP() ? 1 : 2;
+        for (int list = 0; list < numPredDir; list++)
+        {
+            WeightParam* w = &wp[list][0][0];
+            if (w[0].bPresentFlag || w[1].bPresentFlag || w[2].bPresentFlag)
+            {
+                bWeighted = true;
+                p += sprintf(buf + p, " [L%d:R0 ", list);
+                if (w[0].bPresentFlag)
+                    p += sprintf(buf + p, "Y{%d/%d%+d}", w[0].inputWeight, 1 << w[0].log2WeightDenom, w[0].inputOffset);
+                if (w[1].bPresentFlag)
+                    p += sprintf(buf + p, "U{%d/%d%+d}", w[1].inputWeight, 1 << w[1].log2WeightDenom, w[1].inputOffset);
+                if (w[2].bPresentFlag)
+                    p += sprintf(buf + p, "V{%d/%d%+d}", w[2].inputWeight, 1 << w[2].log2WeightDenom, w[2].inputOffset);
+                p += sprintf(buf + p, "]");
+            }
+        }
+
+        /* nothing is logged when no weight is present */
+        if (bWeighted)
+        {
+            if (p < 80) // pad with spaces to ensure progress line overwritten
+                sprintf(buf + p, "%*s", 80 - p, " ");
+            x265_log(&param, X265_LOG_FULL, "%s\n", buf);
+        }
+    }
+}
+}