libbpg-0.9.4

2015-01-16 13:47:26 +01:00 · 2015-01-16 13:47:26 +01:00 · 6e56352f86
commit 6e56352f86
parent b21307932d
11 changed files with 811 additions and 388 deletions
--- a/9
+++ b/9
@ -1,3 +1,12 @@
+version 0.9.4:
+
+- Modified alpha plane encoding to allow progressive display and
+  streaming encoding. This change is incompatible, so images
+  containing alpha from the previous versions of the format cannot be
+  decoded.
+
+- Added 4:2:2 and 4:2:0 chroma formats with MPEG2 chroma sample position.
+
 version 0.9.3:

 - Fixed small palette PNG.
--- a/10
+++ b/10
@ -91,6 +91,10 @@ as input.
  with the '-keepmetadata' option. For JPEG input, EXIF, ICCP and XMP
  are copied. For PNG input, ICCP is copied.

+- Objective comparisons: the JCTVC encoder is tuned for PSNR only, not
+  for SSIM, so you should use PSNR when making objective comparisons
+  with other formats. x265 is tuned by default for SSIM.
+
 3) BPG decoder
 --------------

@ -130,7 +134,11 @@ into it. Stylesheets are supported (the 'id' and 'class' attributes
 are preserved). The 'width' and 'height' attributes are supported only
 with pixel units.

-asm.js gives an interesting speed boost, so we hope that more browser
+The image data is downloaded with the XMLHttpRequest object. So the
+BPG images and the BPG Javascript decoder must be in the same domain
+unless Cross-Origin Resource Sharing is used.
+
+asm.js gives an interesting speed boost, so we hope that more browsers
 will support this Javascript subset.

 6) FFmpeg modifications
--- a/2
+++ b/2
@ -1 +1 @@
-0.9.3
+0.9.4
--- a/bpgdec.c
+++ b/bpgdec.c
@ -174,11 +174,13 @@ static void bpg_show_info(const char *filename, int show_extensions)
    FILE *f;
    BPGImageInfo p_s, *p = &p_s;
    BPGExtensionData *first_md, *md;
-    static const char *format_str[4] = {
+    static const char *format_str[6] = {
        "Gray",
        "4:2:0",
        "4:2:2",
        "4:4:4",
+        "4:2:0_video",
+        "4:2:2_video",
    };
    static const char *color_space_str[BPG_CS_COUNT] = {
        "YCbCr",
--- a/bpgenc.c
+++ b/bpgenc.c
@ -433,35 +433,102 @@ static void gray_neg_c(ColorConvertState *s, PIXEL *y_ptr, int n)

 /* decimation */

-#define DTAPS2 5
-#define DTAPS (2 * DTAPS2)
-#define DC0 57
-#define DC1 17
-#define DC2 (-8)
-#define DC3 (-4)
-#define DC4 2
+/* phase = 0: odd length symmetric filter centered on an input sample.
+   DP0TAPS2 is the reach of the filter on each side of the center (the
+   outermost taps are at offsets -7 and +7) and DP0TAPS the full span.
+   The coefficients sum to 128 (64 + 2 * (40 - 11 + 4 - 1)), matching
+   the 7 bit normalization of the filters that use them. */
+#define DP0TAPS2 7
+#define DP0TAPS (2 * DP0TAPS2 + 1)
+#define DP0C0 64
+#define DP0C1 40
+#define DP0C3 (-11)
+#define DP0C5 4
+#define DP0C7 (-1)

-static void decimate2_simple(PIXEL *dst, PIXEL *src, int n, int bit_depth)
+/* phase = 0.5: even length symmetric filter centered half way between
+   two input samples (taps at offsets -4 to +5). The 10 coefficients
+   sum to 128 (2 * (57 + 17 - 8 - 4 + 2)). */
+#define DP1TAPS2 5
+#define DP1TAPS (2 * DP1TAPS2)
+#define DP1C0 57
+#define DP1C1 17
+#define DP1C2 (-8)
+#define DP1C3 (-4)
+#define DP1C4 2
+
+/* largest edge padding (in samples per side) needed by the filters
+   above; used to size the temporary line buffer */
+#define DTAPS_MAX 7
+
+/* chroma aligned with luma samples */
+static void decimate2p0_simple(PIXEL *dst, PIXEL *src, int n, int bit_depth)
 {
    int n2, i, pixel_max;
    pixel_max = (1 << bit_depth) - 1;
    n2 = (n + 1) / 2;
    for(i = 0; i < n2; i++) {
-        dst[i] = clamp_pix(((src[-4] + src[5]) * DC4 + 
-                            (src[-3] + src[4]) * DC3 + 
-                            (src[-2] + src[3]) * DC2 + 
-                            (src[-1] + src[2]) * DC1 + 
-                            (src[0] + src[1]) * DC0 + 64) >> 7, pixel_max);
+        dst[i] = clamp_pix(((src[-7] + src[7]) * DP0C7 + 
+                            (src[-5] + src[5]) * DP0C5 + 
+                            (src[-3] + src[3]) * DP0C3 + 
+                            (src[-1] + src[1]) * DP0C1 + 
+                            src[0] * DP0C0 + 64) >> 7, pixel_max);
        src += 2;
    }
 }

-static void decimate2_h(PIXEL *dst, PIXEL *src, int n, int bit_depth)
+/* Phase 0 decimation by two with more precision and no saturation:
+   the filtered sum is rounded and shifted by only (bit_depth - 7)
+   bits and stored unclamped as int16_t. Around each even input
+   position the filter reads src[-7] to src[7], so the caller must
+   provide that much padding. Writes (n + 1) / 2 samples to 'dst'. */
+static void decimate2p0_simple16(int16_t *dst, PIXEL *src, int n, int bit_depth)
+{
+    int out_count, k, shift, rnd;
+
+    shift = bit_depth - 7;
+    rnd = 1 << (shift - 1);
+    out_count = (n + 1) / 2;
+    for(k = 0; k < out_count; k++, src += 2) {
+        dst[k] = (src[0] * DP0C0 +
+                  (src[1] + src[-1]) * DP0C1 +
+                  (src[3] + src[-3]) * DP0C3 +
+                  (src[5] + src[-5]) * DP0C5 +
+                  (src[7] + src[-7]) * DP0C7 + rnd) >> shift;
+    }
+}
+
+
+/* Phase 0.5 decimation by two: each output sample lies half way
+   between two input samples (chroma half way between luma samples).
+   Around each even input position the filter reads src[-4] to
+   src[5], so the caller must provide that much padding. The sum is
+   rounded, shifted by 7 bits and passed to clamp_pix() together with
+   the largest value of a 'bit_depth' bit sample. Writes (n + 1) / 2
+   samples to 'dst'. */
+static void decimate2p1_simple(PIXEL *dst, PIXEL *src, int n, int bit_depth)
+{
+    int out_count, k, max_val;
+
+    max_val = (1 << bit_depth) - 1;
+    out_count = (n + 1) / 2;
+    for(k = 0; k < out_count; k++, src += 2) {
+        dst[k] = clamp_pix(((src[0] + src[1]) * DP1C0 +
+                            (src[2] + src[-1]) * DP1C1 +
+                            (src[3] + src[-2]) * DP1C2 +
+                            (src[4] + src[-3]) * DP1C3 +
+                            (src[5] + src[-4]) * DP1C4 + 64) >> 7, max_val);
+    }
+}
+
+/* Phase 0.5 decimation by two with more precision and no saturation:
+   the filtered sum is rounded and shifted by only (bit_depth - 7)
+   bits and stored unclamped as int16_t. Around each even input
+   position the filter reads src[-4] to src[5], so the caller must
+   provide that much padding. Writes (n + 1) / 2 samples to 'dst'. */
+static void decimate2p1_simple16(int16_t *dst, PIXEL *src, int n, int bit_depth)
+{
+    int out_count, k, shift, rnd;
+
+    shift = bit_depth - 7;
+    rnd = 1 << (shift - 1);
+    out_count = (n + 1) / 2;
+    for(k = 0; k < out_count; k++, src += 2) {
+        dst[k] = ((src[0] + src[1]) * DP1C0 +
+                  (src[2] + src[-1]) * DP1C1 +
+                  (src[3] + src[-2]) * DP1C2 +
+                  (src[4] + src[-3]) * DP1C3 +
+                  (src[5] + src[-4]) * DP1C4 + rnd) >> shift;
+    }
+}
+
+static void decimate2_h(PIXEL *dst, PIXEL *src, int n, int bit_depth, int phase)
 {
    PIXEL *src1, v;
    int d, i;

-    d = DTAPS2;
+    if (phase == 0) 
+        d = DP0TAPS2;
+    else
+        d = DP1TAPS2;
    /* add edge pixels */
    src1 = malloc(sizeof(PIXEL) * (n + 2 * d));
    v = src[0];
@ -471,35 +538,24 @@ static void decimate2_h(PIXEL *dst, PIXEL *src, int n, int bit_depth)
    v = src[n - 1];
    for(i = 0; i < d; i++)
        src1[d + n + i] = v;
-    decimate2_simple(dst, src1 + d, n, bit_depth);
+    if (phase == 0)
+        decimate2p0_simple(dst, src1 + d, n, bit_depth);
+    else
+        decimate2p1_simple(dst, src1 + d, n, bit_depth);
    free(src1);
 }

-/* same as decimate2_simple but with more precision and no saturation */
-static void decimate2_simple16(int16_t *dst, PIXEL *src, int n, int bit_depth)
-{
-    int n2, i, shift, rnd;
-    shift = bit_depth - 7;
-    rnd = 1 << (shift - 1);
-    n2 = (n + 1) / 2;
-    for(i = 0; i < n2; i++) {
-        dst[i] = ((src[-4] + src[5]) * DC4 + 
-                  (src[-3] + src[4]) * DC3 + 
-                  (src[-2] + src[3]) * DC2 + 
-                  (src[-1] + src[2]) * DC1 + 
-                  (src[0] + src[1]) * DC0 + rnd) >> shift;
-        src += 2;
-    }
-}
-
 /* src1 is a temporary buffer of length n + 2 * DTAPS */
 static void decimate2_h16(int16_t *dst, PIXEL *src, int n, PIXEL *src1,
-                          int bit_depth)
+                          int bit_depth, int phase)
 {
    PIXEL v;
    int d, i;

-    d = DTAPS2;
+    if (phase == 0) 
+        d = DP0TAPS2;
+    else
+        d = DP1TAPS2;
    /* add edge pixels */
    v = src[0];
    for(i = 0; i < d; i++)
@ -508,7 +564,11 @@ static void decimate2_h16(int16_t *dst, PIXEL *src, int n, PIXEL *src1,
    v = src[n - 1];
    for(i = 0; i < d; i++)
        src1[d + n + i] = v;
-    decimate2_simple16(dst, src1 + d, n, bit_depth);
+    if (phase == 0)
+        decimate2p0_simple16(dst, src1 + d, n, bit_depth);
+    else
+        decimate2p1_simple16(dst, src1 + d, n, bit_depth);
+        
 }

 static void decimate2_v(PIXEL *dst, int16_t **src, int pos, int n,
@ -517,57 +577,57 @@ static void decimate2_v(PIXEL *dst, int16_t **src, int pos, int n,
    int16_t *src0, *src1, *src2, *src3, *src4, *src5, *srcm1, *srcm2, *srcm3, *srcm4;
    int i, shift, offset, pixel_max;

-    pos = sub_mod_int(pos, 4, DTAPS);
+    pos = sub_mod_int(pos, 4, DP1TAPS);
    srcm4 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    srcm3 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    srcm2 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    srcm1 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    src0 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    src1 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    src2 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    src3 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    src4 = src[pos];
-    pos = add_mod_int(pos, 1, DTAPS);
+    pos = add_mod_int(pos, 1, DP1TAPS);
    src5 = src[pos];
    
    shift = 21 - bit_depth;
    offset = 1 << (shift - 1);
    pixel_max = (1 << bit_depth) - 1;
    for(i = 0; i < n; i++) {
-        dst[i] = clamp_pix(((srcm4[i] + src5[i]) * DC4 + 
-                            (srcm3[i] + src4[i]) * DC3 + 
-                            (srcm2[i] + src3[i]) * DC2 + 
-                            (srcm1[i] + src2[i]) * DC1 + 
-                            (src0[i] + src1[i]) * DC0 + offset) >> shift, pixel_max);
+        dst[i] = clamp_pix(((srcm4[i] + src5[i]) * DP1C4 + 
+                            (srcm3[i] + src4[i]) * DP1C3 + 
+                            (srcm2[i] + src3[i]) * DP1C2 + 
+                            (srcm1[i] + src2[i]) * DP1C1 + 
+                            (src0[i] + src1[i]) * DP1C0 + offset) >> shift, pixel_max);
    }
 }

 /* Note: we do the horizontal decimation first to use less CPU cache */
 static void decimate2_hv(uint8_t *dst, int dst_linesize,
                         uint8_t *src, int src_linesize, 
-                         int w, int h, int bit_depth)
+                         int w, int h, int bit_depth, int h_phase)
 {
    PIXEL *buf1;
-    int16_t *buf2[DTAPS];
+    int16_t *buf2[DP1TAPS];
    int w2, pos, i, y, y1, y2;
    
    w2 = (w + 1) / 2;

-    buf1 = malloc(sizeof(PIXEL) * (w + 2 * DTAPS));
+    buf1 = malloc(sizeof(PIXEL) * (w + 2 * DTAPS_MAX));
    /* init line buffer */
-    for(i = 0; i < DTAPS; i++) {
+    for(i = 0; i < DP1TAPS; i++) {
        buf2[i] = malloc(sizeof(int16_t) * w2);
        y = i;
-        if (y > DTAPS2)
-            y -= DTAPS;
+        if (y > DP1TAPS2)
+            y -= DP1TAPS;
        if (y < 0) {
            /* copy from first line */
            memcpy(buf2[i], buf2[0], sizeof(int16_t) * w2);
@ -576,12 +636,12 @@ static void decimate2_hv(uint8_t *dst, int dst_linesize,
            memcpy(buf2[i], buf2[h - 1], sizeof(int16_t) * w2);
        } else {
            decimate2_h16(buf2[i], (PIXEL *)(src + src_linesize * y), w,
-                          buf1, bit_depth);
+                          buf1, bit_depth, h_phase);
        }
    }

    for(y = 0; y < h; y++) {
-        pos = y % DTAPS;
+        pos = y % DP1TAPS;
        if ((y & 1) == 0) {
            /* filter one line */
            y2 = y >> 1;
@ -589,20 +649,20 @@ static void decimate2_hv(uint8_t *dst, int dst_linesize,
                        pos, w2, bit_depth);
        }
        /* add a new line in the buffer */
-        y1 = y + DTAPS2 + 1;
-        pos = add_mod_int(pos, DTAPS2 + 1, DTAPS);
+        y1 = y + DP1TAPS2 + 1;
+        pos = add_mod_int(pos, DP1TAPS2 + 1, DP1TAPS);
        if (y1 >= h) {
            /* copy last line */
-            memcpy(buf2[pos], buf2[sub_mod_int(pos, 1, DTAPS)],
+            memcpy(buf2[pos], buf2[sub_mod_int(pos, 1, DP1TAPS)],
                   sizeof(int16_t) * w2);
        } else {
            /* horizontally decimate new line */
            decimate2_h16(buf2[pos], (PIXEL *)(src + src_linesize * y1), w,
-                          buf1, bit_depth);
+                          buf1, bit_depth, h_phase);
        }
    }

-    for(i = 0; i < DTAPS; i++)
+    for(i = 0; i < DP1TAPS; i++)
        free(buf2[i]);
    free(buf1);
 }
@ -673,7 +733,7 @@ void image_free(Image *img)
    free(img);
 }

-int image_ycc444_to_ycc422(Image *img)
+int image_ycc444_to_ycc422(Image *img, int h_phase)
 {
    uint8_t *data1;
    int w1, h1, bpp, linesize1, i, y;
@ -690,7 +750,7 @@ int image_ycc444_to_ycc422(Image *img)
        for(y = 0; y < img->h; y++) {
            decimate2_h((PIXEL *)(data1 + y * linesize1),
                        (PIXEL *)(img->data[i] + y * img->linesize[i]),
-                        img->w, img->bit_depth);
+                        img->w, img->bit_depth, h_phase);
        }
        free(img->data[i]);
        img->data[i] = data1;
@ -700,7 +760,7 @@ int image_ycc444_to_ycc422(Image *img)
    return 0;
 }

-int image_ycc444_to_ycc420(Image *img)
+int image_ycc444_to_ycc420(Image *img, int h_phase)
 {
    uint8_t *data1;
    int w1, h1, bpp, linesize1, i;
@ -717,7 +777,7 @@ int image_ycc444_to_ycc420(Image *img)
        data1 = malloc(linesize1 * h1);
        decimate2_hv(data1, linesize1,
                     img->data[i], img->linesize[i],
-                     img->w, img->h, img->bit_depth);
+                     img->w, img->h, img->bit_depth, h_phase);
        free(img->data[i]);
        img->data[i] = data1;
        img->linesize[i] = linesize1;
@ -1330,22 +1390,29 @@ void save_yuv(Image *img, const char *filename)


 /* return the position of the end of the NAL or -1 if error */
-static int extract_nal(uint8_t **pnal_buf, int *pnal_len, 
-                       const uint8_t *buf, int buf_len)
+static int find_nal_end(const uint8_t *buf, int buf_len)
 {
-    int idx, start, end, len;
-    uint8_t *nal_buf;
-    int nal_len;
+    int idx;

    idx = 0;
-    if (buf_len < 6 || buf[0] != 0 || buf[1] != 0 || buf[2] != 0 || buf[3] != 1)
+    if (buf_len >= 4 &&
+        buf[0] == 0 && buf[1] == 0 && buf[2] == 0 && buf[3] == 1) {
+        idx = 4;
+    } else if (buf_len >= 3 &&
+               buf[0] == 0 && buf[1] == 0 && buf[2] == 1) {
+        idx = 3;
+    } else {
+        return -1;
+    }
+    /* NAL header */
+    if (idx + 2 > buf_len)
        return -1;
-    idx += 4;
-    start = idx;
    /* find the last byte */
    for(;;) {
-        if (idx + 2 >= buf_len)
+        if (idx + 2 >= buf_len) {
+            idx = buf_len;
            break;
+        }
        if (buf[idx] == 0 && buf[idx + 1] == 0 && buf[idx + 2] == 1)
            break;
        if (idx + 3 < buf_len &&
@ -1353,7 +1420,24 @@ static int extract_nal(uint8_t **pnal_buf, int *pnal_len,
            break;
        idx++;
    }
-    end = idx;
+    return idx;
+}
+
+/* return the position of the end of the NAL or -1 if error */
+static int extract_nal(uint8_t **pnal_buf, int *pnal_len, 
+                       const uint8_t *buf, int buf_len)
+{
+    int idx, start, end, len;
+    uint8_t *nal_buf;
+    int nal_len;
+
+    end = find_nal_end(buf, buf_len);
+    if (end < 0)
+        return -1;
+    if (buf[2] == 1)
+        start = 3;
+    else
+        start = 4;
    len = end - start;
    
    nal_buf = malloc(len);
@ -1509,10 +1593,41 @@ static void put_ue_golomb(PutBitState *s, uint32_t v)
    put_bits(s, n, v);
 }

+typedef struct { /* growable byte buffer */
+    uint8_t *buf; /* storage obtained with realloc(), NULL when empty */
+    int size;     /* number of bytes allocated for 'buf' */
+    int len;      /* number of bytes of 'buf' currently in use */
+} DynBuf;
+
+/* Put 's' in the empty state: no storage allocated, nothing stored. */
+static void dyn_buf_init(DynBuf *s)
+{
+    *s = (DynBuf){ .buf = NULL, .size = 0, .len = 0 };
+}
+
+/* Make sure at least 'size' bytes are allocated for 's'. When the
+   buffer has to grow, the new allocation is 1.5 times the current
+   one or 'size', whichever is larger. Return 0 if OK, -1 if realloc()
+   failed (in which case 's' is left unchanged). */
+static int dyn_buf_resize(DynBuf *s, int size)
+{
+    uint8_t *grown;
+    int capacity;
+
+    if (s->size < size) {
+        capacity = (s->size * 3) / 2;
+        if (capacity < size)
+            capacity = size;
+        grown = realloc(s->buf, capacity);
+        if (grown == NULL)
+            return -1;
+        s->size = capacity;
+        s->buf = grown;
+    }
+    return 0;
+}
+
 /* suppress the VPS NAL and keep only the useful part of the SPS
   header. The decoder can rebuild a valid HEVC stream if needed. */
-static int build_modified_hevc(uint8_t **pout_buf, 
-                               const uint8_t *buf, int buf_len)
+static int build_modified_sps(uint8_t **pout_buf, int *pout_buf_len,
+                              const uint8_t *buf, int buf_len)
 {
    int nal_unit_type, nal_len, idx, i, ret, msps_buf_len;
    int out_buf_len, out_buf_len_max;
@ -1546,14 +1661,6 @@ static int build_modified_hevc(uint8_t **pout_buf,
        fprintf(stderr, "expecting SPS nal (%d)\n", nal_unit_type);
        return -1; /* expect SPS nal */
    }
-    /* skip the next start code */
-    if (idx + 3 < buf_len &&
-        buf[idx] == 0 && buf[idx + 1] == 0 && buf[idx + 2] == 0 && buf[idx + 3] == 1) {
-        idx += 4;
-    } else if (idx + 2 < buf_len &&
-               buf[idx] == 0 && buf[idx + 1] == 0 && buf[idx + 2] == 1) {
-        idx += 3;
-    }

    /* skip the initial part of the SPS up to and including
       log2_min_cb_size */
@ -1785,7 +1892,7 @@ static int build_modified_hevc(uint8_t **pout_buf,
        }
        msps_buf_len = (pb->idx + 7) >> 3;

-        out_buf_len_max = 5 + msps_buf_len + (buf_len - idx);
+        out_buf_len_max = 5 + msps_buf_len;
        out_buf = malloc(out_buf_len_max);

        //        printf("msps_n_bits=%d\n", pb->idx);
@ -1795,15 +1902,105 @@ static int build_modified_hevc(uint8_t **pout_buf,
        memcpy(p, msps_buf, msps_buf_len);
        p += msps_buf_len;
        
-        memcpy(p, buf + idx, buf_len - idx);
-        p += buf_len - idx;
-        
        out_buf_len = p - out_buf;
        free(msps_buf);
        free(nal_buf);
    }
    *pout_buf = out_buf;
-    return out_buf_len;
+    *pout_buf_len = out_buf_len;
+    return idx;
+}
+
+/* Build the HEVC payload stored in the BPG file.
+
+   The output starts with the modified SPS of the alpha plane (only if
+   'abuf' is not NULL) followed by the modified SPS of the color
+   planes, both produced by build_modified_sps(). The NALs following
+   the SPS in each input stream are then appended, taking one alpha
+   NAL then one color NAL in turn when alpha is present. The start
+   code of the first appended NAL is dropped, the next NALs keep
+   theirs. Alpha NALs are tagged with nuh_layer_id = 1.
+
+   On success the allocated buffer is stored in '*pout_buf' (the
+   caller releases it with free()) and its length is returned.
+   Return -1 on error; '*pout_buf' is then left untouched. */
+static int build_modified_hevc(uint8_t **pout_buf,
+                               const uint8_t *cbuf, int cbuf_len,
+                               const uint8_t *abuf, int abuf_len)
+{
+    DynBuf out_buf_s, *out_buf = &out_buf_s;
+    uint8_t *msps;
+    const uint8_t *nal_buf;
+    int msps_len, cidx, aidx, is_alpha, nal_len, first_nal, start, l;
+
+    dyn_buf_init(out_buf);
+
+    /* add alpha MSPS */
+    aidx = 0; /* avoids warning */
+    if (abuf) {
+        aidx = build_modified_sps(&msps, &msps_len, abuf, abuf_len);
+        if (aidx < 0)
+            goto fail;
+        if (dyn_buf_resize(out_buf, out_buf->len + msps_len) < 0) {
+            /* the MSPS buffer is owned here: do not leak it */
+            free(msps);
+            goto fail;
+        }
+        memcpy(out_buf->buf + out_buf->len, msps, msps_len);
+        out_buf->len += msps_len;
+        free(msps);
+    }
+
+    /* add color MSPS */
+    cidx = build_modified_sps(&msps, &msps_len, cbuf, cbuf_len);
+    if (cidx < 0)
+        goto fail;
+    if (dyn_buf_resize(out_buf, out_buf->len + msps_len) < 0) {
+        /* the MSPS buffer is owned here: do not leak it */
+        free(msps);
+        goto fail;
+    }
+    memcpy(out_buf->buf + out_buf->len, msps, msps_len);
+    out_buf->len += msps_len;
+    free(msps);
+
+    /* add the remaining NALs, alternating between alpha (if present)
+       and color. */
+    is_alpha = (abuf != NULL);
+    first_nal = 1;
+    for(;;) {
+        if (!is_alpha) {
+            if (cidx >= cbuf_len) {
+                if (abuf) {
+                    /* an alpha NAL was just added without a matching
+                       color NAL */
+                    fprintf(stderr, "Incorrect number of alpha NALs\n");
+                    goto fail;
+                }
+                break;
+            }
+            nal_buf = cbuf + cidx;
+            nal_len = find_nal_end(nal_buf, cbuf_len - cidx);
+            //            printf("cidx=%d/%d nal_len=%d\n", cidx, cbuf_len, nal_len);
+            if (nal_len < 0)
+                goto fail;
+            cidx += nal_len;
+        } else {
+            /* NOTE(review): color NALs still pending at this point are
+               silently dropped; confirm whether this should be
+               reported like the opposite mismatch above */
+            if (aidx >= abuf_len)
+                break;
+            nal_buf = abuf + aidx;
+            nal_len = find_nal_end(nal_buf, abuf_len - aidx);
+            //            printf("aidx=%d/%d nal_len=%d\n", aidx, abuf_len, nal_len);
+            if (nal_len < 0)
+                goto fail;
+            aidx += nal_len;
+        }
+        /* length of the start code: 3 bytes (00 00 01) or 4 bytes
+           (00 00 00 01) */
+        start = 3 + (nal_buf[2] == 0);
+        if (first_nal) {
+            /* skip first start code */
+            l = start;
+        } else {
+            l = 0;
+        }
+        if (dyn_buf_resize(out_buf, out_buf->len + nal_len - l) < 0)
+            goto fail;
+        //        printf("add nal len=%d\n", nal_len - l);
+        memcpy(out_buf->buf + out_buf->len, nal_buf + l, nal_len - l);
+        if (is_alpha) {
+            /* set nuh_layer_id of alpha to '1' (bit 3 of the second
+               byte of the NAL header) */
+            out_buf->buf[out_buf->len + (start - l) + 1] |= 1 << 3;
+        }
+        out_buf->len += nal_len - l;
+
+        if (abuf) {
+            is_alpha ^= 1;
+        }
+        first_nal = 0;
+    }
+    *pout_buf = out_buf->buf;
+    return out_buf->len;
+ fail:
+    free(out_buf->buf);
+    return -1;
 }

 typedef enum {
@ -1830,18 +2027,17 @@ static int hevc_encode_picture2(uint8_t **pbuf, Image *img,
                                HEVCEncodeParams *params,
                                HEVCEncoderEnum encoder_type)
 {
-    uint8_t *buf, *out_buf;
-    int buf_len, out_buf_len;
+    int buf_len;
    
    switch(encoder_type) {
 #if defined(USE_JCTVC)
    case HEVC_ENCODER_JCTVC:
-        buf_len = jctvc_encode_picture(&buf, img, params);
+        buf_len = jctvc_encode_picture(pbuf, img, params);
        break;
 #endif
 #if defined(USE_X265)
    case HEVC_ENCODER_X265:
-        buf_len = x265_encode_picture(&buf, img, params);
+        buf_len = x265_encode_picture(pbuf, img, params);
        break;
 #endif
    default:
@ -1852,14 +2048,7 @@ static int hevc_encode_picture2(uint8_t **pbuf, Image *img,
        *pbuf = NULL;
        return -1;
    }
-    out_buf_len = build_modified_hevc(&out_buf, buf, buf_len);
-    free(buf);
-    if (out_buf_len < 0) {
-        *pbuf = NULL;
-        return -1;
-    }
-    *pbuf = out_buf;
-    return out_buf_len;
+    return buf_len;
 }


@ -1937,10 +2126,13 @@ int main(int argc, char **argv)
    HEVCEncodeParams p_s, *p = &p_s;
    uint8_t *out_buf, *alpha_buf, *extension_buf;
    int out_buf_len, alpha_buf_len, verbose;
+    uint8_t *hevc_buf;
+    int hevc_buf_len;
    FILE *f;
    int qp, c, option_index, sei_decoded_picture_hash, is_png, extension_buf_len;
    int keep_metadata, cb_size, width, height, compress_level, alpha_qp;
    int bit_depth, lossless_mode, i, limited_range, premultiplied_alpha;
+    int c_h_phase;
    BPGImageFormatEnum format;
    BPGColorSpaceEnum color_space;
    BPGMetaData *md;
@ -1951,6 +2143,7 @@ int main(int argc, char **argv)
    alpha_qp = -1;
    sei_decoded_picture_hash = 0;
    format = BPG_FORMAT_420;
+    c_h_phase = 1;
    color_space = BPG_CS_YCbCr;
    keep_metadata = 0;
    verbose = 0;
@ -2015,10 +2208,18 @@ int main(int argc, char **argv)
        case 'f':
            if (!strcmp(optarg, "420")) {
                format = BPG_FORMAT_420;
+                c_h_phase = 1;
            } else if (!strcmp(optarg, "422")) {
                format = BPG_FORMAT_422;
+                c_h_phase = 1;
            } else if (!strcmp(optarg, "444")) {
                format = BPG_FORMAT_444;
+            } else if (!strcmp(optarg, "422_video")) {
+                format = BPG_FORMAT_422;
+                c_h_phase = 0;
+            } else if (!strcmp(optarg, "420_video")) {
+                format = BPG_FORMAT_420;
+                c_h_phase = 0;
            } else {
                fprintf(stderr, "Invalid chroma format\n");
                exit(1);
@ -2143,10 +2344,10 @@ int main(int argc, char **argv)

    if (img->format == BPG_FORMAT_444) {
        if (format == BPG_FORMAT_420) {
-            if (image_ycc444_to_ycc420(img) != 0)
+            if (image_ycc444_to_ycc420(img, c_h_phase) != 0)
                goto error_convert;
        } else if (format == BPG_FORMAT_422) {
-            if (image_ycc444_to_ycc422(img) != 0)  {
+            if (image_ycc444_to_ycc422(img, c_h_phase) != 0)  {
            error_convert:
                fprintf(stderr, "Cannot convert image\n");
                exit(1);
@ -2201,6 +2402,16 @@ int main(int argc, char **argv)
            exit(1);
        }
    }
+    
+    hevc_buf = NULL;
+    hevc_buf_len = build_modified_hevc(&hevc_buf, out_buf, out_buf_len,
+                                       alpha_buf, alpha_buf_len);
+    if (hevc_buf_len < 0) {
+        fprintf(stderr, "Error while creating HEVC data\n");
+        exit(1);
+    }
+    free(out_buf);
+    free(alpha_buf);

    /* prepare the extension data */
    extension_buf = NULL;
@ -2235,7 +2446,7 @@ int main(int argc, char **argv)

    {
        uint8_t img_header[128], *q;
-        int v, has_alpha, has_extension, alpha2_flag, alpha1_flag;
+        int v, has_alpha, has_extension, alpha2_flag, alpha1_flag, format;
        
        has_alpha = (img_alpha != NULL);
        has_extension = (extension_buf_len > 0);
@ -2259,7 +2470,14 @@ int main(int argc, char **argv)
        *q++ = (IMAGE_HEADER_MAGIC >> 16) & 0xff;
        *q++ = (IMAGE_HEADER_MAGIC >> 8) & 0xff;
        *q++ = (IMAGE_HEADER_MAGIC >> 0) & 0xff;
-        v = (img->format << 5) | (alpha1_flag << 4) | (img->bit_depth - 8);
+
+        if (c_h_phase == 0 && img->format == BPG_FORMAT_420)
+            format = BPG_FORMAT_420_VIDEO;
+        else if (c_h_phase == 0 && img->format == BPG_FORMAT_422)
+            format = BPG_FORMAT_422_VIDEO;
+        else
+            format = img->format;
+        v = (format << 5) | (alpha1_flag << 4) | (img->bit_depth - 8);
        *q++ = v;
        v = (img->color_space << 4) | (has_extension << 3) |
            (alpha2_flag << 2) | (img->limited_range << 1);
@ -2267,13 +2485,10 @@ int main(int argc, char **argv)
        put_ue(&q, width);
        put_ue(&q, height);
        
-        put_ue(&q, out_buf_len);
+        put_ue(&q, 0); /* zero length means up to the end of the file */
        if (has_extension) {
            put_ue(&q, extension_buf_len); /* extension data length */
        }
-        if (has_alpha) {
-            put_ue(&q, alpha_buf_len);
-        }

        fwrite(img_header, 1, q - img_header, f);

@ -2285,21 +2500,11 @@ int main(int argc, char **argv)
            free(extension_buf);
        }

-        /* HEVC YUV/RGB data */
-        if (fwrite(out_buf, 1, out_buf_len, f) != out_buf_len) {
+        if (fwrite(hevc_buf, 1, hevc_buf_len, f) != hevc_buf_len) {
            fprintf(stderr, "Error while writing HEVC image planes\n");
            exit(1);
        }
-        free(out_buf);
-
-        if (has_alpha) {
-            /* alpha data */
-            if (fwrite(alpha_buf, 1, alpha_buf_len, f) != alpha_buf_len) {
-                fprintf(stderr, "Error while writing HEVC alpha plane\n");
-                exit(1);
-            }
-            free(alpha_buf);
-        }
+        free(hevc_buf);
    }

    fclose(f);
--- a/doc/bpg_spec.txt
+++ b/doc/bpg_spec.txt
@ -1,6 +1,6 @@
 BPG Specification

-version 0.9.3
+version 0.9.4

 Copyright (c) 2014 Fabrice Bellard

@ -17,8 +17,12 @@ space is either BT 601 (JPEG case), BT 709 or BT 2020.

 The chroma can be subsampled by a factor of two in horizontal or both
 in horizontal and vertical directions (4:4:4, 4:2:2 or 4:2:0 chroma
-formats are supported). The chroma is sampled at the same position
-relative to the luma as in the JPEG format [2].
+formats are supported). In order to be able to transcode JPEG images
+or video frames without modification to the chroma, both JPEG and
+MPEG2 chroma sample positions are supported.
+
+Progressive decoding and display is supported by interleaving the
+alpha and color data.

 Arbitrary metadata (such as EXIF, ICC profile, XMP) are supported.

@ -73,24 +77,17 @@ heic_file() {
     picture_data_length                                         ue7(32)
     if (extension_present_flag)  
        extension_data_length                                    ue7(32)
-     if (alpha1_flag || alpha2_flag)
-        alpha_data_length                                        ue7(32)
    
     if (extension_present_flag) {
         extension_data()
     }

     hevc_header_and_data()
-
-     if (alpha1_flag || alpha2_flag) {
-         hevc_header_and_data()
-     }
-
 }

 extension_data() 
 {
-     for(i = 0; i < v; i++) {
+     while (more_bytes()) {
         extension_tag                                           ue7(32)
         extension_tag_length                                    ue7(32)
         for(j = 0; j < extension_tag_length; j++) {
@ -100,6 +97,15 @@ extension_data()
 }
     
 hevc_header_and_data()
+{
+     if (alpha1_flag || alpha2_flag) {
+         hevc_header()
+     }
+     hevc_header()
+     hevc_data()
+}
+
+hevc_header()
 {
     hevc_header_length                                          ue7(32)
     log2_min_luma_coding_block_size_minus3                      ue(v)
@ -134,8 +140,6 @@ hevc_header_and_data()
         cabac_bypass_alignment_enabled_flag                     u(1)
     }
     trailing_bits                                               u(v)
-
-     hevc_data()
 }

 hevc_data() 
@ -154,9 +158,11 @@ hevc_data()
     'pixel_format' indicates the chroma subsampling:

       0 : Grayscale
-       1 : 4:2:0
-       2 : 4:2:2
+       1 : 4:2:0. Chroma at position (0.5, 0.5) (JPEG chroma position)
+       2 : 4:2:2. Chroma at position (0.5, 0) (JPEG chroma position)
       3 : 4:4:4
+       4 : 4:2:0. Chroma at position (0, 0.5) (MPEG2 chroma position)
+       5 : 4:2:2. Chroma at position (0, 0) (MPEG2 chroma position)

       The other values are reserved.
       
@ -217,9 +223,8 @@ hevc_data()
       For RGB, G is stored as the Y plane. B in the Cb plane and R in
       the Cr plane.

-       YCgCo is defined as HEVC matrix_coeffs = 8, full range. Y is
-       stored in the Y plane. Cg in the Cb plane and Co in the Cr
-       plane.
+       YCgCo is defined as HEVC matrix_coeffs = 8. Y is stored in the
+       Y plane. Cg in the Cb plane and Co in the Cr plane.
       
       If no color profile is present, the RGB output data are assumed
       to be in the sRGB color space [6].
@ -246,12 +251,12 @@ hevc_data()
     'picture_height' is the picture height in pixels. The value 0 is
     not allowed.

-     'picture_data_length' is the picture data length in bytes.
+     'picture_data_length' is the picture data length in bytes. The
+     special value of zero indicates that the picture data goes up to
+     the end of the file.

     'extension_data_length' is the extension data length in bytes.

-     'alpha_data_length' is the alpha data length in bytes.
-
     'extension_data()' is the extension data.

     'extension_tag' is the extension tag. The following values are defined:
@ -320,7 +325,19 @@ hevc_data()
     - bit_depth_luma_minus8 = bit_depth_minus_8
     - bit_depth_chroma_minus8 = bit_depth_minus_8
     - scaling_list_enabled_flag = 0
-                    
+
+     Alpha data encoding:
+
+     - If alpha data is present, all the corresponding NALs have
+       nuh_layer_id = 1. NALs for color data shall have nuh_layer_id =
+       0.
+     - Alpha data shall use the same tile sizes as color data and
+       shall have the same entropy_coding_sync_enabled_flag value as
+       color data.
+     - Alpha slices shall use the same number of coding units as color
+       slices and should be interleaved with color slices. Alpha NALs
+       shall come before the corresponding color NALs.
+
 3.3) HEVC Profile
 -----------------

@ -361,7 +378,7 @@ information.

 - Pixel formats: we wanted to be able to convert JPEG images to BPG
  with as little loss as possible. So supporting the same color space
-  (CCIR 601 YCbCr) with the same range (full range) and most of the
+  (BT 601 YCbCr) with the same range (full range) and most of the
  allowed JPEG chroma formats (4:4:4, 4:2:2, 4:2:0 or grayscale) was
  mandatory to avoid going back to RGB or doing a subsampling or
  interpolation.
@ -371,7 +388,14 @@ information.
  format to simplify the decoder. The color is either
  non-premultiplied or premultiplied. Premultiplied alpha usually
  gives a better compression. Non-premultiplied alpha is supported in
-  case no loss is needed on the color components.
+  case no loss is needed on the color components. In order to allow
+  progressive display, the alpha and color data are interleaved (the
+  nuh_layer_id NAL field is 0 for color data and 1 for alpha
+  data). The alpha and color slices should contain the same number of
+  coding units and each alpha slice should come before the
+  corresponding color slice. Since alpha slices are usually smaller
+  than color slices, it allows a progressive display even if there is
+  a single slice.

 - Color spaces: In addition to YCbCr, RGB is supported for the high
  quality or lossless cases. YCgCo is supported because it may give
@ -423,4 +447,4 @@ information.

 [5] Extensible Metadata Platform (XMP) http://www.adobe.com/devnet/xmp.html

-[6] sRGB color space, IEC 61966-2-1.
+[6] sRGB color space, IEC 61966-2-1
--- a/html/bpgdec.js
+++ b/html/bpgdec.js
--- a/html/bpgdec8b.js
+++ b/html/bpgdec8b.js
--- a/jctvc_glue.cpp
+++ b/jctvc_glue.cpp
@ -139,6 +139,12 @@ int jctvc_encode_picture(uint8_t **pbuf, Image *img,
        add_opt(&argc, argv, "--HadamardME=0");
    }

+#if 0
+    /* TEST with several slices */
+    add_opt(&argc, argv, "--SliceMode=2");
+    add_opt(&argc, argv, "--SliceArgument=5");
+#endif
+
    /* trailing NULL */
    argv[argc] = NULL;

--- a/libbpg.c
+++ b/libbpg.c
@ -76,6 +76,7 @@ struct BPGDecoderContext {
    AVFrame *alpha_frame;
    int w, h;
    BPGImageFormatEnum format;
+    uint8_t c_h_phase; /* only used for 422 and 420 */
    uint8_t has_alpha; /* true if alpha or W plane */
    uint8_t bit_depth;
    uint8_t has_w_plane;
@ -170,12 +171,18 @@ static int get_ue(uint32_t *pv, const uint8_t *buf, int len)
    return ret;
 }

-static int decode_write_frame(AVCodecContext *avctx,
-                              AVFrame *frame, int *frame_count, AVPacket *pkt, int last)
+static int decode_write_data(AVCodecContext *avctx,
+                             AVFrame *frame, int *frame_count,
+                             const uint8_t *buf, int buf_len)
 {
+    AVPacket avpkt;
    int len, got_frame;

-    len = avcodec_decode_video2(avctx, frame, &got_frame, pkt);
+    av_init_packet(&avpkt);
+    avpkt.data = (uint8_t *)buf;
+    avpkt.size = buf_len;
+    
+    len = avcodec_decode_video2(avctx, frame, &got_frame, &avpkt);
    if (len < 0) {
 #ifdef DEBUG
        fprintf(stderr, "Error while decoding frame %d\n", *frame_count);
@ -184,40 +191,36 @@ static int decode_write_frame(AVCodecContext *avctx,
    }
    if (got_frame) {
 #ifdef DEBUG
-        printf("Saving %sframe %3d\n", last ? "last " : "", *frame_count);
+        printf("got frame %d\n", *frame_count);
 #endif
        (*frame_count)++;
    }
-    if (pkt->data) {
-        pkt->size -= len;
-        pkt->data += len;
-    }
    return 0;
 }

 extern AVCodec ff_hevc_decoder;

-static AVFrame *hevc_decode(const uint8_t *input_data, int input_data_len,
-                            int width, int height, int chroma_format_idc,
-                            int bit_depth)
+static int build_msps(uint8_t **pbuf, int *pbuf_len,
+                      const uint8_t *input_data, int input_data_len1,
+                      int width, int height, int chroma_format_idc,
+                      int bit_depth)
 {
-    AVCodec *codec;
-    AVCodecContext *c= NULL;
-    int frame_count, idx, msps_len, ret, buf_len, i;
-    AVPacket avpkt;
-    AVFrame *frame;
+    int input_data_len = input_data_len1;
+    int idx, msps_len, ret, buf_len, i;
    uint32_t len;
    uint8_t *buf, *msps_buf;

+    *pbuf = NULL;
+
    /* build the modified SPS header to please libavcodec */
    ret = get_ue(&len, input_data, input_data_len);
    if (ret < 0)
-        return NULL;
+        return -1;
    input_data += ret;
    input_data_len -= ret;
    
    if (len > input_data_len)
-        return NULL;
+        return -1;

    msps_len = 1 + 4 + 4 + 1 + len;
    msps_buf = av_malloc(msps_len);
@ -265,74 +268,211 @@ static AVFrame *hevc_decode(const uint8_t *input_data, int input_data_len,
    /* the last byte cannot be 0 */
    if (idx == 0 || buf[idx - 1] == 0x00)
        buf[idx++] = 0x80;
-    
    av_free(msps_buf);
-
-    /* NAL start code (Note: should be 3 bytes depending on exact NAL
-       type, but it is not critical for libavcodec) */
-    buf[idx++] = 0x00;
-    buf[idx++] = 0x00;
-    buf[idx++] = 0x00;
-    buf[idx++] = 0x01; 
-
-    memcpy(buf + idx, input_data, input_data_len);
-    idx += input_data_len;
    
-    assert(idx < buf_len);
+    *pbuf_len = idx;
+    *pbuf = buf;
+    return input_data_len1 - input_data_len;
+}

-    av_init_packet(&avpkt);
+/* Decode the complete HEVC bitstream held in 'buf' (buf_len bytes)
+   with a freshly opened decoder context. Return the decoded frame (to
+   be released with av_frame_free()) or NULL if an allocation fails,
+   if the codec cannot be opened, if decode_write_data() reports an
+   error or if the data does not yield exactly one frame. */
+static AVFrame *hevc_decode_frame(const uint8_t *buf, int buf_len)
+{
+    AVCodec *codec;
+    AVCodecContext *c;
+    /* must start as NULL: the 'fail' path releases it even when the
+       context allocation failed before the frame was allocated */
+    AVFrame *frame = NULL;
+    int frame_count, ret;

    codec = &ff_hevc_decoder;

    c = avcodec_alloc_context3(codec);
-    if (!c) {
-#ifdef DEBUG
-        fprintf(stderr, "Could not allocate video codec context\n");
-#endif
-        exit(1);
-    }
-
-    if(codec->capabilities&CODEC_CAP_TRUNCATED)
-        c->flags|= CODEC_FLAG_TRUNCATED; /* we do not send complete frames */
-
+    if (!c) 
+        goto fail;
+    frame = av_frame_alloc();
+    if (!frame) 
+        goto fail;
    /* for testing: use the MD5 or CRC in SEI to check the decoded bit
       stream. */
    c->err_recognition |= AV_EF_CRCCHECK; 
-
    /* open it */
-    if (avcodec_open2(c, codec, NULL) < 0) {
-#ifdef DEBUG
-        fprintf(stderr, "Could not open codec\n");
-#endif
-        exit(1);
+    if (avcodec_open2(c, codec, NULL) < 0) 
+        goto fail;
+    
+    frame_count = 0;
+    ret = decode_write_data(c, frame, &frame_count, buf, buf_len);
+    avcodec_close(c);
+    if (ret < 0 || frame_count != 1)
+        goto fail;
+    av_free(c);
+    return frame;
+ fail:
+    av_free(c);
+    av_frame_free(&frame);
+    return NULL;
+}
+
+/* Return the offset in 'buf' of the first byte after the NAL which
+   begins at buf[0], i.e. the offset of the next start code (00 00 01
+   or 00 00 00 01) or 'buf_len' if there is none. If 'has_startcode'
+   is set, the NAL must itself begin with a start code, which is
+   skipped before searching. Return -1 if the start code is missing
+   or if there is no room for the 2 byte NAL header. */
+static int find_nal_end(const uint8_t *buf, int buf_len, int has_startcode)
+{
+    int pos = 0;
+
+    if (has_startcode) {
+        if (buf_len >= 4 && buf[0] == 0 && buf[1] == 0 &&
+            buf[2] == 0 && buf[3] == 1)
+            pos = 4;
+        else if (buf_len >= 3 && buf[0] == 0 && buf[1] == 0 && buf[2] == 1)
+            pos = 3;
+        else
+            return -1;
+    }
+    /* the NAL header must be present */
+    if (pos + 2 > buf_len)
+        return -1;
+    /* scan while at least 3 bytes remain: a start code cannot fit in
+       less */
+    while (pos + 2 < buf_len) {
+        if (buf[pos] == 0 && buf[pos + 1] == 0) {
+            if (buf[pos + 2] == 1)
+                return pos;
+            if (pos + 3 < buf_len && buf[pos + 2] == 0 && buf[pos + 3] == 1)
+                return pos;
+        }
+        pos++;
+    }
+    return buf_len;
+}
+
+typedef struct { /* growable byte buffer, see the dyn_buf_xxx() functions */
+    uint8_t *buf; /* storage obtained with av_realloc(), NULL after dyn_buf_init() */
+    int size; /* allocated size of 'buf' in bytes */
+    int len; /* number of bytes of 'buf' in use (append position) */
+} DynBuf;
+
+/* Put 's' in the empty state: nothing stored and no storage
+   allocated. */
+static void dyn_buf_init(DynBuf *s)
+{
+    s->len = 0;
+    s->size = 0;
+    s->buf = NULL;
+}
+
+/* Make sure that at least 'size' bytes are allocated in 's'. When the
+   buffer must grow, the new allocation is the larger of 'size' and
+   1.5 times the current allocation. Return 0 if OK, -1 if the
+   reallocation failed (in this case 's' is left unchanged). */
+static int dyn_buf_resize(DynBuf *s, int size)
+{
+    int alloc_size;
+    uint8_t *p;
+
+    if (size > s->size) {
+        alloc_size = (s->size * 3) / 2;
+        if (alloc_size < size)
+            alloc_size = size;
+        p = av_realloc(s->buf, alloc_size);
+        if (!p)
+            return -1;
+        s->size = alloc_size;
+        s->buf = p;
+    }
+    return 0;
+}
+
+/* Append the 'len' bytes at 'data' to 's', growing the allocation when
+   needed. Return 0 if OK, -1 if the buffer could not be grown (nothing
+   is appended in this case). */
+static int dyn_buf_push(DynBuf *s, const uint8_t *data, int len)
+{
+    int end = s->len + len;
+
+    if (dyn_buf_resize(s, end) < 0)
+        return -1;
+    memcpy(s->buf + s->len, data, len);
+    s->len = end;
+    return 0;
+}
+
+/* Split the HEVC data of a BPG image into a color bitstream and (when
+   'has_alpha' is set) an alpha bitstream, then decode each of them
+   with hevc_decode_frame().
+
+   'buf' starts with the modified SPS header(s), the alpha one first,
+   which build_msps() converts to a regular NAL. The NALs follow: the
+   first one has no start code, the next ones begin with a 3 or 4 byte
+   start code. NALs with nuh_layer_id = 1 go to the alpha bitstream
+   with their layer id reset to 0, all the other NALs go to the color
+   bitstream.
+
+   Return 0 if OK and store the decoded frames in *pcframe and
+   *paframe (*paframe is NULL when there is no alpha). Return -1 if
+   error, both *pcframe and *paframe are then NULL. */
+static int hevc_decode(AVFrame **pcframe, AVFrame **paframe,
+                       const uint8_t *buf, int buf_len,
+                       int width, int height, int chroma_format_idc,
+                       int bit_depth, int has_alpha)
+{
+    int nal_len, start, first_nal, nal_buf_len, ret, nuh_layer_id;
+    AVFrame *cframe = NULL, *aframe = NULL;
+    uint8_t *nal_buf;
+    DynBuf abuf_s, *abuf = &abuf_s;
+    DynBuf cbuf_s, *cbuf = &cbuf_s;
+    DynBuf *pbuf;
+
+    dyn_buf_init(abuf);
+    dyn_buf_init(cbuf);
+
+    if (has_alpha) {
+        /* the alpha plane is coded as a gray image (chroma_format_idc = 0) */
+        ret = build_msps(&nal_buf, &nal_len, buf, buf_len,
+                         width, height, 0, bit_depth);
+        if (ret < 0)
+            goto fail;
+        buf += ret;
+        buf_len -= ret;
+        ret = dyn_buf_push(abuf, nal_buf, nal_len);
+        /* the buffer returned by build_msps() belongs to the av_malloc()
+           family: release it with av_free(), also when the copy failed */
+        av_free(nal_buf);
+        if (ret < 0)
+            goto fail;
+    }
+
+    ret = build_msps(&nal_buf, &nal_len, buf, buf_len,
+                     width, height, chroma_format_idc, bit_depth);
+    if (ret < 0)
+        goto fail;
+    buf += ret;
+    buf_len -= ret;
+    ret = dyn_buf_push(cbuf, nal_buf, nal_len);
+    av_free(nal_buf);
+    if (ret < 0)
+        goto fail;
+
+    first_nal = 1;
+    while (buf_len > 0) {
+        nal_len = find_nal_end(buf, buf_len, !first_nal);
+        if (nal_len < 0)
+            goto fail;
+        /* 'start' is the size of the start code to skip (none for the
+           first NAL, else 3 or 4 bytes) */
+        if (first_nal)
+            start = 0;
+        else
+            start = 3 + (buf[2] == 0);
+        /* nuh_layer_id: low bit of the first header byte and 5 high
+           bits of the second one */
+        nuh_layer_id = ((buf[start] & 1) << 5) | (buf[start + 1] >> 3);
+        nal_buf_len = nal_len - start + 3;
+        if (has_alpha && nuh_layer_id == 1)
+            pbuf = abuf;
+        else
+            pbuf = cbuf;
+        if (dyn_buf_resize(pbuf, pbuf->len + nal_buf_len) < 0)
+            goto fail;
+        /* copy the NAL with a 3 byte start code */
+        nal_buf = pbuf->buf + pbuf->len;
+        nal_buf[0] = 0x00;
+        nal_buf[1] = 0x00;
+        nal_buf[2] = 0x01;
+        memcpy(nal_buf + 3, buf + start, nal_len - start);
+        /* reset the layer id of the alpha NALs to 0 (keep only the 3
+           low bits of the second header byte) */
+        if (has_alpha && nuh_layer_id == 1)
+            nal_buf[4] &= 0x7;
+        pbuf->len += nal_buf_len;
+        buf += nal_len;
+        buf_len -= nal_len;
+        first_nal = 0;
    }
    
-    frame = av_frame_alloc();
-    if (!frame) {
-#ifdef DEBUG
-        fprintf(stderr, "Could not allocate video frame\n");
-#endif
-        return NULL;
-    }
-
-    avpkt.size = idx;
-    avpkt.data = buf;
-    frame_count = 0;
-    while (avpkt.size > 0) {
-        if (decode_write_frame(c, frame, &frame_count, &avpkt, 0) < 0)
-            exit(1);
-    }
-
-    avcodec_close(c);
-    av_free(c);
-    av_free(buf);
-
-    if (frame_count == 0) {
-        av_frame_free(&frame);
-        return NULL;
-    } else {
-        return frame;
+    if (has_alpha) {
+        aframe = hevc_decode_frame(abuf->buf, abuf->len);
+        if (!aframe)
+            goto fail;
    }
+    cframe = hevc_decode_frame(cbuf->buf, cbuf->len);
+    if (!cframe)
+        goto fail;
+    ret = 0;
+ done:
+    av_free(abuf->buf);
+    av_free(cbuf->buf);
+    *pcframe = cframe;
+    *paframe = aframe;
+    return ret;
+ fail:
+    av_frame_free(&cframe);
+    av_frame_free(&aframe);
+    ret = -1;
+    goto done;
 }

 uint8_t *bpg_decoder_get_data(BPGDecoderContext *img, int *pline_size, int plane)
@ -390,18 +530,72 @@ static inline int clamp8(int a)
        return a;
 }

-/* 7 tap Lanczos interpolator */
-#define IC0 (-1)
-#define IC1 4
-#define IC2 (-10)
-#define IC3 57
-#define IC4 18
-#define IC5 (-6)
-#define IC6 2
+/* 8 tap Lanczos interpolator (phase=0, symmetric): IP0Cn is the weight
+   of the two samples located at distance n + 0.5 on each side of the
+   interpolated position. The eight weights sum to 64. */
+#define IP0C0 40
+#define IP0C1 (-11)
+#define IP0C2 4
+#define IP0C3 (-1)
+
+/* 7 tap Lanczos interpolator (phase=0.5). The seven weights sum to
+   64. The taps are applied in both orders (IP1C0..IP1C6 and
+   IP1C6..IP1C0) to get the two output samples of each source
+   sample. */
+#define IP1C0 (-1)
+#define IP1C1 4
+#define IP1C2 (-10)
+#define IP1C3 57
+#define IP1C4 18
+#define IP1C5 (-6)
+#define IP1C6 2
+
+/* Interpolate by a factor of two when the chroma samples are aligned
+   with the luma samples: the even output samples are copies of the
+   source samples, the odd ones are computed half way between two
+   source samples with the symmetric 8 tap filter. 'n' is the number
+   of output samples. The filter reads from src[-3] up to 4 samples
+   after the current source sample, so 'src' must be padded on both
+   sides. */
+static void interp2p0_simple(PIXEL *dst, const PIXEL *src, int n, int bit_depth)
+{
+    const int pixel_max = (1 << bit_depth) - 1;
+    int acc;
+
+    for (; n >= 2; n -= 2) {
+        dst[0] = src[0];
+        acc = IP0C3 * (src[-3] + src[4]) +
+              IP0C2 * (src[-2] + src[3]) +
+              IP0C1 * (src[-1] + src[2]) +
+              IP0C0 * (src[0] + src[1]) + 32;
+        dst[1] = clamp_pix(acc >> 6, pixel_max);
+        dst += 2;
+        src++;
+    }
+    /* odd width: the last output sample is a plain copy */
+    if (n != 0)
+        dst[0] = src[0];
+}
+
+/* Same interpolation as interp2p0_simple() for 16 bit intermediate
+   samples: the copied (even) samples are rounded and shifted right by
+   14 - bit_depth, the filtered (odd) samples by 20 - bit_depth (6
+   more bits for the filter gain of 64). Both are clamped to the
+   pixel range. */
+static void interp2p0_simple16(PIXEL *dst, const int16_t *src, int n, int bit_depth)
+{
+    const int pixel_max = (1 << bit_depth) - 1;
+    const int shift0 = 14 - bit_depth;
+    const int offset0 = (1 << shift0) >> 1;
+    const int shift1 = 20 - bit_depth;
+    const int offset1 = 1 << (shift1 - 1);
+    int acc;
+
+    for (; n >= 2; n -= 2) {
+        dst[0] = clamp_pix((src[0] + offset0) >> shift0, pixel_max);
+        acc = IP0C3 * (src[-3] + src[4]) +
+              IP0C2 * (src[-2] + src[3]) +
+              IP0C1 * (src[-1] + src[2]) +
+              IP0C0 * (src[0] + src[1]) + offset1;
+        dst[1] = clamp_pix(acc >> shift1, pixel_max);
+        dst += 2;
+        src++;
+    }
+    /* odd width: the last output sample is not filtered */
+    if (n != 0)
+        dst[0] = clamp_pix((src[0] + offset0) >> shift0, pixel_max);
+}

 /* interpolate by a factor of two assuming chroma is between the luma
   samples. */
-static void interp2_simple(PIXEL *dst, const PIXEL *src, int n, int bit_depth)
+static void interp2p1_simple(PIXEL *dst, const PIXEL *src, int n, int bit_depth)
 {
    int pixel_max, a0, a1, a2, a3, a4, a5, a6;

@ -422,11 +616,11 @@ static void interp2_simple(PIXEL *dst, const PIXEL *src, int n, int bit_depth)
        a4 = a5;
        a5 = a6;
        a6 = src[3];
-        dst[0] = clamp_pix((a0 * IC6 + a1 * IC5 + a2 * IC4 + a3 * IC3 + 
-                            a4 * IC2 + a5 * IC1 + a6 * IC0 + 32) >> 6, 
+        dst[0] = clamp_pix((a0 * IP1C6 + a1 * IP1C5 + a2 * IP1C4 + a3 * IP1C3 + 
+                            a4 * IP1C2 + a5 * IP1C1 + a6 * IP1C0 + 32) >> 6, 
                           pixel_max);
-        dst[1] = clamp_pix((a0 * IC0 + a1 * IC1 + a2 * IC2 + a3 * IC3 +
-                            a4 * IC4 + a5 * IC5 + a6 * IC6 + 32) >> 6, 
+        dst[1] = clamp_pix((a0 * IP1C0 + a1 * IP1C1 + a2 * IP1C2 + a3 * IP1C3 +
+                            a4 * IP1C4 + a5 * IP1C5 + a6 * IP1C6 + 32) >> 6, 
                           pixel_max);
        dst += 2;
        src++;
@ -440,35 +634,14 @@ static void interp2_simple(PIXEL *dst, const PIXEL *src, int n, int bit_depth)
        a4 = a5;
        a5 = a6;
        a6 = src[3];
-        dst[0] = clamp_pix((a0 * IC6 + a1 * IC5 + a2 * IC4 + a3 * IC3 + 
-                            a4 * IC2 + a5 * IC1 + a6 * IC0 + 32) >> 6, 
+        dst[0] = clamp_pix((a0 * IP1C6 + a1 * IP1C5 + a2 * IP1C4 + a3 * IP1C3 + 
+                            a4 * IP1C2 + a5 * IP1C1 + a6 * IP1C0 + 32) >> 6, 
                           pixel_max);
    }
 }

-static void interp2_h(PIXEL *dst, const PIXEL *src, int n, int bit_depth)
-{
-    PIXEL *src1, v;
-    int i, n2;
-
-    /* add extra pixels and do the interpolation (XXX: could go faster) */
-    n2 = (n + 1) / 2;
-    src1 = av_malloc((n2 + ITAPS - 1) * sizeof(PIXEL));
-    memcpy(src1 + ITAPS2 - 1, src, n2 * sizeof(PIXEL));
-
-    v = src[0];
-    for(i = 0; i < ITAPS2 - 1; i++)
-        src1[i] = v;
-
-    v = src[n2 - 1];
-    for(i = 0; i < ITAPS2; i++)
-        src1[ITAPS2 - 1 + n2 + i] = v;
-    interp2_simple(dst, src1 + ITAPS2 - 1, n, bit_depth);
-    av_free(src1);
-}
-
-static void interp2_simple2(PIXEL *dst, const int16_t *src, int n, 
-                            int bit_depth)
+static void interp2p1_simple16(PIXEL *dst, const int16_t *src, int n, 
+                               int bit_depth)
 {
    int shift, offset, pixel_max, a0, a1, a2, a3, a4, a5, a6;

@ -491,11 +664,11 @@ static void interp2_simple2(PIXEL *dst, const int16_t *src, int n,
        a4 = a5;
        a5 = a6;
        a6 = src[3];
-        dst[0] = clamp_pix((a0 * IC6 + a1 * IC5 + a2 * IC4 + a3 * IC3 +
-                            a4 * IC2 + a5 * IC1 + a6 * IC0 + offset) >> shift,
+        dst[0] = clamp_pix((a0 * IP1C6 + a1 * IP1C5 + a2 * IP1C4 + a3 * IP1C3 +
+                            a4 * IP1C2 + a5 * IP1C1 + a6 * IP1C0 + offset) >> shift,
                           pixel_max);
-        dst[1] = clamp_pix((a0 * IC0 + a1 * IC1 + a2 * IC2 + a3 * IC3 +
-                            a4 * IC4 + a5 * IC5 + a6 * IC6 + offset) >> shift,
+        dst[1] = clamp_pix((a0 * IP1C0 + a1 * IP1C1 + a2 * IP1C2 + a3 * IP1C3 +
+                            a4 * IP1C4 + a5 * IP1C5 + a6 * IP1C6 + offset) >> shift,
                           pixel_max);
        dst += 2;
        src++;
@ -509,19 +682,44 @@ static void interp2_simple2(PIXEL *dst, const int16_t *src, int n,
        a4 = a5;
        a5 = a6;
        a6 = src[3];
-        dst[0] = clamp_pix((a0 * IC6 + a1 * IC5 + a2 * IC4 + a3 * IC3 +
-                            a4 * IC2 + a5 * IC1 + a6 * IC0 + offset) >> shift, 
+        dst[0] = clamp_pix((a0 * IP1C6 + a1 * IP1C5 + a2 * IP1C4 + a3 * IP1C3 +
+                            a4 * IP1C2 + a5 * IP1C1 + a6 * IP1C0 + offset) >> shift, 
                           pixel_max);
    }
 }

+/* Horizontal interpolation by a factor of two of one line. 'n' is the
+   number of output samples, 'src' holds n2 = (n + 1) / 2 samples.
+   'phase' selects the filter: 0 = chroma aligned with the luma
+   samples, otherwise chroma between the luma samples. tmp_buf is a
+   temporary buffer of length (n2 + 2 * ITAPS2 - 1) */
+static void interp2_h(PIXEL *dst, const PIXEL *src, int n, int bit_depth,
+                      int phase, PIXEL *tmp_buf)
+{
+    /* position of the sample 0 in the padded line */
+    PIXEL *line = tmp_buf + ITAPS2 - 1;
+    PIXEL edge;
+    int k, n2;
+
+    /* add extra pixels and do the interpolation (XXX: could go faster) */
+    n2 = (n + 1) / 2;
+    memcpy(line, src, n2 * sizeof(PIXEL));
+
+    /* replicate the first sample on the left (ITAPS2 - 1 samples) */
+    edge = src[0];
+    for(k = 0; k < ITAPS2 - 1; k++)
+        tmp_buf[k] = edge;
+
+    /* replicate the last sample on the right (ITAPS2 samples) */
+    edge = src[n2 - 1];
+    for(k = 0; k < ITAPS2; k++)
+        line[n2 + k] = edge;
+
+    if (phase != 0)
+        interp2p1_simple(dst, line, n, bit_depth);
+    else
+        interp2p0_simple(dst, line, n, bit_depth);
+}
+
 /* y_pos is the position of the sample '0' in the 'src' circular
-   buffer. tmp is a temporary buffer of length (n2 + 2 * ITAPS - 1) */
+   buffer. tmp_buf is a temporary buffer of length (n2 + 2 * ITAPS2 - 1) */
 static void interp2_vh(PIXEL *dst, PIXEL **src, int n, int y_pos,
-                       int16_t *tmp_buf, int bit_depth, int frac_pos)
+                       int16_t *tmp_buf, int bit_depth, int frac_pos,
+                       int c_h_phase)
 {
    const PIXEL *src0, *src1, *src2, *src3, *src4, *src5, *src6;
-    int i, n2, shift;
+    int i, n2, shift, rnd;
    PIXEL v;

    src0 = src[(y_pos - 3) & 7];
@ -533,24 +731,24 @@ static void interp2_vh(PIXEL *dst, PIXEL **src, int n, int y_pos,
    src6 = src[(y_pos + 3) & 7];

    /* vertical interpolation first */
-    /* XXX: should round but not critical */
    shift = bit_depth - 8;
+    rnd = (1 << shift) >> 1;
    n2 = (n + 1) / 2;
    if (frac_pos == 0) {
        for(i = 0; i < n2; i++) {
            tmp_buf[ITAPS2 - 1 + i] = 
-                (src0[i] * IC6 + src1[i] * IC5 + 
-                 src2[i] * IC4 + src3[i] * IC3 + 
-                 src4[i] * IC2 + src5[i] * IC1 + 
-                 src6[i] * IC0) >> shift;
+                (src0[i] * IP1C6 + src1[i] * IP1C5 + 
+                 src2[i] * IP1C4 + src3[i] * IP1C3 + 
+                 src4[i] * IP1C2 + src5[i] * IP1C1 + 
+                 src6[i] * IP1C0 + rnd) >> shift;
        }
    } else {
        for(i = 0; i < n2; i++) {
            tmp_buf[ITAPS2 - 1 + i] = 
-                (src0[i] * IC0 + src1[i] * IC1 + 
-                 src2[i] * IC2 + src3[i] * IC3 + 
-                 src4[i] * IC4 + src5[i] * IC5 + 
-                 src6[i] * IC6) >> shift;
+                (src0[i] * IP1C0 + src1[i] * IP1C1 + 
+                 src2[i] * IP1C2 + src3[i] * IP1C3 + 
+                 src4[i] * IP1C4 + src5[i] * IP1C5 + 
+                 src6[i] * IP1C6 + rnd) >> shift;
        }
    }

@ -561,7 +759,10 @@ static void interp2_vh(PIXEL *dst, PIXEL **src, int n, int y_pos,
    v = tmp_buf[ITAPS2 - 1 + n2 - 1];
    for(i = 0; i < ITAPS2; i++)
        tmp_buf[ITAPS2 - 1 + n2 + i] = v;
-    interp2_simple2(dst, tmp_buf + ITAPS2 - 1, n, bit_depth);
+    if (c_h_phase == 0)
+        interp2p0_simple16(dst, tmp_buf + ITAPS2 - 1, n, bit_depth);
+    else
+        interp2p1_simple16(dst, tmp_buf + ITAPS2 - 1, n, bit_depth);
 }

 static void ycc_to_rgb24(ColorConvertState *s, uint8_t *dst, const PIXEL *y_ptr,
@ -1076,13 +1277,14 @@ static int bpg_decoder_output_init(BPGDecoderContext *s,
        s->h2 = (s->h + 1) / 2;
        s->cb_buf2 = av_malloc(s->w * sizeof(PIXEL));
        s->cr_buf2 = av_malloc(s->w * sizeof(PIXEL));
+        /* Note: too large if 422 and sizeof(PIXEL) = 1 */
+        s->c_buf4 = av_malloc((s->w2 + 2 * ITAPS2 - 1) * sizeof(int16_t));

        if (s->format == BPG_FORMAT_420) {
            for(i = 0; i < ITAPS; i++) {
                s->cb_buf3[i] = av_malloc(s->w2 * sizeof(PIXEL));
                s->cr_buf3[i] = av_malloc(s->w2 * sizeof(PIXEL));
            }
-            s->c_buf4 = av_malloc((s->w2 + 2 * ITAPS2 - 1) * sizeof(int16_t));
            
            /* init the vertical interpolation buffer */
            for(i = 0; i < ITAPS; i++) {
@ -1141,7 +1343,7 @@ static void bpg_decoder_output_end(BPGDecoderContext *s)
 int bpg_decoder_get_line(BPGDecoderContext *s, void *rgb_line1)
 {
    uint8_t *rgb_line = rgb_line1;
-    int w, h, y, pos, y2, y1, incr;
+    int w, h, y, pos, y2, y1, incr, y_frac;
    PIXEL *y_ptr, *cb_ptr, *cr_ptr, *a_ptr;

    w = s->w;
@ -1160,17 +1362,12 @@ int bpg_decoder_get_line(BPGDecoderContext *s, void *rgb_line1)
    case BPG_FORMAT_420:
        y2 = y >> 1;
        pos = y2 % ITAPS;
-        if ((y & 1) == 0) {
-            interp2_vh(s->cb_buf2, s->cb_buf3, w, pos, s->c_buf4,
-                       s->bit_depth, 0);
-            interp2_vh(s->cr_buf2, s->cr_buf3, w, pos, s->c_buf4,
-                       s->bit_depth, 0);
-        } else {
-            interp2_vh(s->cb_buf2, s->cb_buf3, w, pos, s->c_buf4,
-                       s->bit_depth, 1);
-            interp2_vh(s->cr_buf2, s->cr_buf3, w, pos, s->c_buf4,
-                       s->bit_depth, 1);
-
+        y_frac = y & 1;
+        interp2_vh(s->cb_buf2, s->cb_buf3, w, pos, s->c_buf4,
+                   s->bit_depth, y_frac, s->c_h_phase);
+        interp2_vh(s->cr_buf2, s->cr_buf3, w, pos, s->c_buf4,
+                   s->bit_depth, y_frac, s->c_h_phase);
+        if (y_frac) {
            /* add a new line in the circular buffer */
            pos = (pos + ITAPS2 + 1) % ITAPS;
            y1 = y2 + ITAPS2 + 1;
@ -1186,8 +1383,10 @@ int bpg_decoder_get_line(BPGDecoderContext *s, void *rgb_line1)
    case BPG_FORMAT_422:
        cb_ptr = (PIXEL *)(s->cb_buf + y * s->cb_linesize);
        cr_ptr = (PIXEL *)(s->cr_buf + y * s->cr_linesize);
-        interp2_h(s->cb_buf2, cb_ptr, w, s->bit_depth);
-        interp2_h(s->cr_buf2, cr_ptr, w, s->bit_depth);
+        interp2_h(s->cb_buf2, cb_ptr, w, s->bit_depth, s->c_h_phase, 
+                  (PIXEL *)s->c_buf4);
+        interp2_h(s->cr_buf2, cr_ptr, w, s->bit_depth, s->c_h_phase,
+                  (PIXEL *)s->c_buf4);
        s->cvt_func(&s->cvt, rgb_line, y_ptr, s->cb_buf2, s->cr_buf2, w, incr);
        break;
    case BPG_FORMAT_444:
@ -1279,8 +1478,7 @@ typedef struct {
    uint8_t premultiplied_alpha;
    uint8_t limited_range;
    BPGColorSpaceEnum color_space;
-    uint32_t ycc_data_len;
-    uint32_t alpha_data_len;
+    uint32_t hevc_data_len;
    BPGExtensionData *first_md;
 } BPGHeaderData;

@ -1302,7 +1500,7 @@ static int bpg_decode_header(BPGHeaderData *h,
    idx = 4;
    flags1 = buf[idx++];
    h->format = flags1 >> 5;
-    if (h->format > 3)
+    if (h->format > 5)
        return -1;
    alpha1_flag = (flags1 >> 4) & 1;
    h->bit_depth = (flags1 & 0xf) + 8;
@ -1343,7 +1541,7 @@ static int bpg_decode_header(BPGHeaderData *h,
    if (header_only)
        return idx;

-    ret = get_ue(&h->ycc_data_len, buf + idx, buf_len - idx);
+    ret = get_ue(&h->hevc_data_len, buf + idx, buf_len - idx);
    if (ret < 0)
        return -1;
    idx += ret;
@ -1356,14 +1554,6 @@ static int bpg_decode_header(BPGHeaderData *h,
        idx += ret;
    }

-    h->alpha_data_len = 0;
-    if (h->has_alpha) {
-        ret = get_ue(&h->alpha_data_len, buf + idx, buf_len - idx);
-        if (ret < 0)
-            return -1;
-        idx += ret;
-    }
-
    h->first_md = NULL;
    if (has_extension) {
        int ext_end;
@ -1407,12 +1597,16 @@ static int bpg_decode_header(BPGHeaderData *h,
            idx += extension_data_len;
        }
    }
+
+    if (h->hevc_data_len == 0)
+        h->hevc_data_len = buf_len - idx;
+    
    return idx;
 }

 int bpg_decoder_decode(BPGDecoderContext *img, const uint8_t *buf, int buf_len)
 {
-    int idx, has_alpha, format, bit_depth, chroma_format_idc, color_space;
+    int idx, has_alpha, bit_depth, color_space;
    uint32_t width, height;
    BPGHeaderData h_s, *h = &h_s;

@ -1421,14 +1615,23 @@ int bpg_decoder_decode(BPGDecoderContext *img, const uint8_t *buf, int buf_len)
        return idx;
    width = h->width;
    height = h->height;
-    format = h->format;
    has_alpha = h->has_alpha;
    color_space = h->color_space;
    bit_depth = h->bit_depth;
    
    img->w = width;
    img->h = height;
-    img->format = format;
+    img->format = h->format;
+    if (h->format == BPG_FORMAT_422_VIDEO) {
+        img->format = BPG_FORMAT_422;
+        img->c_h_phase = 0;
+    } else if (h->format == BPG_FORMAT_420_VIDEO) {
+        img->format = BPG_FORMAT_420;
+        img->c_h_phase = 0;
+    } else {
+        img->format = h->format;
+        img->c_h_phase = 1;
+    }
    img->has_alpha = has_alpha;
    img->premultiplied_alpha = h->premultiplied_alpha;
    img->has_w_plane = h->has_w_plane;
@ -1437,53 +1640,17 @@ int bpg_decoder_decode(BPGDecoderContext *img, const uint8_t *buf, int buf_len)
    img->bit_depth = bit_depth;
    img->first_md = h->first_md;

-    if (idx + h->ycc_data_len > buf_len)
+    if (idx + h->hevc_data_len > buf_len)
        goto fail;
-    chroma_format_idc = format;
-    img->frame = hevc_decode(buf + idx, h->ycc_data_len,
-                             width, height, chroma_format_idc, bit_depth);
-    if (!img->frame)
+    if (hevc_decode(&img->frame, &img->alpha_frame,
+                    buf + idx, h->hevc_data_len,
+                    width, height, img->format, bit_depth, has_alpha) < 0)
        goto fail;
-    idx += h->ycc_data_len;
+    idx += h->hevc_data_len;

    if (img->frame->width < img->w || img->frame->height < img->h)
        goto fail;
    
-    switch(img->frame->format) {
-    case AV_PIX_FMT_YUV420P16:
-    case AV_PIX_FMT_YUV420P:
-        if (format != BPG_FORMAT_420)
-            goto fail;
-        break;
-    case AV_PIX_FMT_YUV422P16:
-    case AV_PIX_FMT_YUV422P:
-        if (format != BPG_FORMAT_422)
-            goto fail;
-        break;
-    case AV_PIX_FMT_YUV444P16:
-    case AV_PIX_FMT_YUV444P:
-        if (format != BPG_FORMAT_444)
-            goto fail;
-        break;
-    case AV_PIX_FMT_GRAY16:
-    case AV_PIX_FMT_GRAY8:
-        if (format != BPG_FORMAT_GRAY)
-            goto fail;
-        break;
-    default:
-        goto fail;
-    }
-    
-    if (has_alpha) {
-        if (idx + h->alpha_data_len > buf_len)
-            goto fail;
-        img->alpha_frame = hevc_decode(buf + idx, h->alpha_data_len,
-                                       width, height, 0, bit_depth);
-        if (!img->alpha_frame)
-            goto fail;
-        idx += h->alpha_data_len;
-    }
-
    img->y = -1;
    return 0;

--- a/libbpg.h
+++ b/libbpg.h
@ -28,9 +28,11 @@ typedef struct BPGDecoderContext BPGDecoderContext;

 typedef enum {
    BPG_FORMAT_GRAY,
-    BPG_FORMAT_420,
-    BPG_FORMAT_422,
+    BPG_FORMAT_420, /* chroma at offset (0.5, 0.5) (JPEG) */
+    BPG_FORMAT_422, /* chroma at offset (0.5, 0) (JPEG) */
    BPG_FORMAT_444,
+    BPG_FORMAT_420_VIDEO, /* chroma at offset (0, 0.5) (MPEG2) */
+    BPG_FORMAT_422_VIDEO, /* chroma at offset (0, 0) (MPEG2) */
 } BPGImageFormatEnum;

 typedef enum {
 @ -1 +1 @@
 .9.3
 .9.4