数据压缩


数据压缩

在此主要介绍使用Huffman编码压缩和LZW压缩算法

地址: 整合为JUI程序(Java 8+)

1.Huffman压缩(霍夫曼压缩)

1.1.BitBuffer辅助类 - 按bit位处理字节数组
/**
 * BitterBuffer
 * 按bit位处理字节数组,在此仅实现使用到的方法
 */
public class BitBuffer {
    private long mark = -1;
    private long position = 0;
    private long limit;
    private long capacity;

    private byte[] hb;

    public BitBuffer(int byteCapacity) {
        this.hb = new byte[byteCapacity];
        this.limit = (long)byteCapacity << 3;
        this.capacity = limit;
    }
    private BitBuffer(byte[] array,long bitLength){
        this.hb = array;
        this.limit = bitLength;
        this.capacity = limit;
    }
    public static BitBuffer wrap(byte[] array, long bitLength){
        return new BitBuffer(array,bitLength);
    }

    //byte转无符号
    private int toUnsigned(byte value){ return value < 0 ? value & 0xff : value; }
    //转ByteBuffer
    public ByteBuffer toByteBuffer(){
        int length = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
        ByteBuffer byteBuffer = ByteBuffer.wrap(hb,0,length);
        return byteBuffer;
    }
    //获取底层数组
    public byte[] array(){ return hb; }

    //取出bits位的值返回
    public long get(int bits){
        if(position + bits > limit) throw new RuntimeException("超出BitBuffer范围无法取出");
        if(bits > 64) throw new RuntimeException("单次取出超过64bit位,无法取出");
        int index = (int) (position >>> 3);
        int remain = (int)(8 - (position & 7));
        //更新position
        position += bits;
        int num = remain - bits;
        if(num >= 0) return (toUnsigned(hb[index]) >>> num) & ((1 << bits) - 1);
        else{
            long code = toUnsigned(hb[index]) & ((1 << remain) - 1);
            return get(code,-num,index+1);
        }
    }
    private long get(long code,int bits,int index){
        if(bits <= 8){
            int num = 8 - bits;
            int tail = (toUnsigned(hb[index]) >>> num) & ((1 << bits) - 1);
            return (code << bits) | tail;
        }
        code = (code << 8) | toUnsigned(hb[index]);
        return get(code,bits-8,index+1);
    }

    //将code的bits位(低位->高位数bits位)放入
    public void put(long code,int bits){
        if(position + bits > limit) throw new RuntimeException("超出BitBuffer范围无法取出");
        if(bits > 64) throw new RuntimeException("单次放入超过64bit位,无法放入");
        int index = (int)(position >>> 3);
        int remain = (int)(8 - (position & 7));
        //更新position
        position += bits;
        int num = remain - bits;
        if(num >= 0) hb[index] |= code << num;
        else{
            hb[index] |= code >>> -num;
            put(code,-num,index+1);
        }
    }
    private void put(long code,int bits,int index){
        if(bits <= 8){
            hb[index] |= code << 8-bits;
            return;
        }
        hb[index] |= code >>> (bits-=8);
        put(code,bits,index+1);
    }
    public boolean hasRemaining(){ return position < limit; }
    //获取当前position值
    public long position(){ return position; }
    //设置当前position值
    public BitBuffer position(long newPosition){
        if ((newPosition > limit) || (newPosition < 0))
            throw new IllegalArgumentException();
        position = newPosition;
        if (mark > position) mark = -1;
        return this;
    }
    public long remaining(){ return limit-position; }
}
1.2.Huffman压缩
/**
 * Huffman压缩
 * 概述: 通过huffman编码来重新表示字符,使得出现频率高的字符编码短,出现少的字符编码长
 *      整体来说,所需的总的bit位是减少的,但对于字符频率均匀的数据几乎没有压缩效果
 * 实现: 压缩后头部第1个字节存储压缩后结尾长度(结尾时可能一个字节没占满),
 *      紧接着4个字节存储压缩后字节长度(对于大文件分批次构建不同huffman树进行压缩,确定长度,可以分批次解压)
 * 细节: 分别使用bio与nio实现,解压时使用多线程并行优化
 */
public class Huffman {
    private int max = 1 << 24; //单次最大读取字节数,对nio性能影响大(ByteBuffer大小影响)
    private int R = 256;//码表范围
    private int bufferLength = 8192; //写入时缓冲区大小8K

    //huffman节点类
    private class Node implements Comparable<Node>{
        private char ch;
        private int freq;
        private Node left;
        private Node right;

        public Node(char ch, int freq, Node left, Node right) {
            this.ch = ch;
            this.freq = freq;
            this.left = left;
            this.right = right;
        }
        public boolean isLeaf(){
            return left == null && right == null;
        }
        @Override
        public int compareTo(Node o) { return this.freq - o.freq; }
    }

    //压缩: 使用bio
    public void compress(InputStream in, OutputStream out){
        int length;
        byte[] buff = null;
        try{
            while (true) {
                //对于大文件分批次构建huffman树进行压缩
                length = (length = in.available()) > max ? max : length;
                if(buff == null) buff = new byte[length];
                length = in.read(buff);
                if(length == -1 || length == 0) break; //读取结束
                //统计频率
                int[] freq = new int[R];
                for (int i = 0; i < length; ++i)
                    freq[toUnsigned(buff[i])]++;
                //创建huffman树
                Node root = buildTrie(freq);
                //生成编码表(是用long类型,防止code超过范围)
                long[] st = new long[R];
                buildCodeTable(root, st, 0L, 0);
                //写入huffman树
                BitBuffer bitBuffer = new BitBuffer(length+(R << 1));
                bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
                writeTrie(root,bitBuffer);
                //写入压缩数据
                long mixCode,code;
                int bit;
                for(int i = 0;i<length;++i){
                    mixCode = st[toUnsigned(buff[i])];
                    bit = (int) (mixCode & 0xff);
                    code = mixCode >>> 8;
                    bitBuffer.put(code,bit);
                }
                //维护头部5字节
                long gap = bitBuffer.position() - 40;
                byte tail = (byte) (gap & 7);
                //压缩数据长度(不包含头部长度5)
                int compressLength = (int) (tail == 0 ? gap >>> 3 : (gap >>> 3) + 1);
                long position = bitBuffer.position();
                bitBuffer.position(0);
                bitBuffer.put(tail,8);
                bitBuffer.put(compressLength >>> 16,16);
                bitBuffer.put(compressLength,16);
                //写入压缩数据
                byte[] buf = bitBuffer.array();
                int maxLength = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
                //分段写入
                for(int k = 0;k<maxLength;k+=bufferLength){
                    if(k + bufferLength > maxLength)
                        out.write(buf,k,maxLength - k);
                    else
                        out.write(buf,k,bufferLength);
                }

            }
        } catch (IOException e){
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //压缩: 使用nio
    public void nioCompress(FileInputStream in, FileOutputStream out){
        long length = 0;
        int capacity;
        try (FileChannel channel = in.getChannel();FileChannel outChannel = out.getChannel()) {
            long size = channel.size();
            //大文件分段压缩
            while(length < size){
                capacity = (int) ((length += max) <= size ? max : size - length + max);
                MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY,length-max,capacity);
                //统计频率
                int[] freq = new int[R];
                for (int i = 0; i < capacity; ++i)
                    freq[toUnsigned(mappedByteBuffer.get())]++;
                //创建huffman树
                Node root = buildTrie(freq);
                //生成编码表(是用long类型,防止编码过长)
                long[] st = new long[R];
                buildCodeTable(root, st, 0L, 0);
                //写入huffman树
                BitBuffer bitBuffer = new BitBuffer(capacity+(R << 1));
                bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
                writeTrie(root,bitBuffer);
                //压缩数据放入字节数组
                long mixCode,code;
                int bit;
                mappedByteBuffer.flip();
                for(int i = 0;i<capacity;++i){
                    mixCode = st[toUnsigned(mappedByteBuffer.get())];
                    bit = (int) (mixCode & 0xff);
                    code = mixCode >>> 8;
                    bitBuffer.put(code,bit);
                }
                //维护头部5字节
                long gap = bitBuffer.position() - 40;
                byte tail = (byte) (gap & 7);
                //压缩数据长度(不包含头部长度5)
                int compressLength = tail == 0 ? (int)(gap >>> 3) : (int)((gap >>> 3) + 1);
                long position = bitBuffer.position();
                bitBuffer.position(0);
                bitBuffer.put(tail,8);
                bitBuffer.put(compressLength >>> 16,16);
                bitBuffer.put(compressLength,16);
                //写入压缩数据
                bitBuffer.position(position);
                ByteBuffer byteBuffer = bitBuffer.toByteBuffer();
                outChannel.write(byteBuffer);
            }

        } catch(IOException e){
            e.printStackTrace();
        }
    }

    //解压: 使用bio
    public void expand(InputStream in,OutputStream out){
        //确定线程数与容量
        int capacity = Runtime.getRuntime().availableProcessors();
        LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
        CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
        //数据写入线程: 负责将解压数据写入
        Thread thread = new Thread(() ->{
            try {
                while(!blockingQueue.isEmpty()) {
                    Future<ByteBuffer> future = blockingQueue.poll();
                    ByteBuffer byteBuffer = future.get();
                    //写入
                    byte[] buf = byteBuffer.array();
                    int maxLength = byteBuffer.limit();
                    //分段写入
                    for(int k = 0;k<maxLength;k+=bufferLength){
                        if(k + bufferLength > maxLength)
                            out.write(buf,k,maxLength - k);
                        else
                            out.write(buf,k,bufferLength);
                    }
                }
                latch.countDown();
            } catch (Exception e) { //异常不做处理
                e.printStackTrace();
            }
        });

        try {
            //解压线程池: 负责解压数据
            ExecutorService executor = Executors.newWorkStealingPool(capacity);
            while(true) {
                //大文件可能分多段压缩,确定当前段长度
                byte[] head = new byte[5];
                int length = in.read(head); //读取头部信息
                if(length == -1 || length == 0) break;
                ByteBuffer headByte = ByteBuffer.wrap(head);
                int tailBit = headByte.get();//获取尾部bit位
                if (length != 5) throw new RuntimeException("头部信息缺失");
                length = headByte.getInt();//获取到当前段压缩字节长度
                byte[] buff = new byte[length]; //读取当前压缩段数据
                //读取压缩段
                long len = in.read(buff);
                if(length == -1 || length == 0) break;
                if (len != length) throw new RuntimeException("读取压缩信息失败");
                //解压并写入数据到byte数组: 并行提高效率
                Future<ByteBuffer> future = executor.submit(()->{
                    long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
                    BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
                    //读取huffman树
                    Node root = readTrie(bitBuffer);
                    //解压后先写入byte数组
                    byte[] buf = new byte[max];
                    int bufIndex = 0;
                    //特殊处理: root为叶子节点,仅一个节点
                    if(root.isLeaf()){
                        long p = bitBuffer.position();
                        long limit = bitBuffer.limit();
                        for(;p<limit;p++)
                            buf[bufIndex++] = (byte)root.ch;
                        return ByteBuffer.wrap(buf,0,bufIndex);
                    }
                    //解码
                    while(bitBuffer.hasRemaining()){
                        Node x = root;
                        //此处每次读取1bit位处理,效率低,多线程并行优化
                        while(!x.isLeaf() && bitBuffer.hasRemaining()){
                            boolean bit = bitBuffer.get(1) == 1;
                            if(bit) x = x.right;
                            else x = x.left;
                        }
                        buf[bufIndex++] = (byte) x.ch;
                    }
                    return ByteBuffer.wrap(buf,0,bufIndex);
                });
                blockingQueue.put(future); //将任务放入阻塞队列,put放入速度必定比take处理速度快
                if(!thread.isAlive()) thread.start(); //启动数据写入线程
            }

            latch.await();
        } catch (Exception e) {//异常不做处理
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //解压: 使用nio
    public void nioExpand(FileInputStream in,FileOutputStream out){
        //确定线程数与容量
        int capacity = Runtime.getRuntime().availableProcessors();
        LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
        CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
        //数据写入线程: 负责将解压数据写入
        Thread thread = new Thread(() ->{
            try {
                FileChannel outChannel = out.getChannel();
                while(!blockingQueue.isEmpty()) {
                    Future<ByteBuffer> future = blockingQueue.poll();
                    ByteBuffer byteBuffer = future.get();
                    //写入
                    outChannel.write(byteBuffer);
                }
                latch.countDown();
            } catch (Exception e) { //异常不做处理
                e.printStackTrace();
            }
        });


        try(FileChannel inChannel = in.getChannel()){
            //解压线程池: 负责解压数据
            ExecutorService executor = Executors.newWorkStealingPool(capacity);
            while(true) {
                //大文件可能分多段压缩,确定当前段长度
                byte[] head = new byte[5];
                ByteBuffer headByte = ByteBuffer.wrap(head);
                int length = inChannel.read(headByte);
                if(length == -1 || length == 0) break;
                headByte.flip();
                int tailBit = headByte.get();//获取尾部bit位
                if (length != 5) throw new RuntimeException("头部信息缺失");
                length = headByte.getInt();//获取到当前段压缩字节长度
                byte[] buff = new byte[length]; //读取当前压缩段数据
                ByteBuffer dataByte = ByteBuffer.wrap(buff);
                long len = inChannel.read(dataByte);
                if(length == -1 || length == 0) break;
                if (len != length) throw new RuntimeException("读取压缩信息失败");

                //解压并写入数据到byte数组: 并行提高效率
                Future<ByteBuffer> future = executor.submit(()->{
                    long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
                    BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
                    //读取huffman树
                    Node root = readTrie(bitBuffer);
                    //解压后先写入byte数组
                    byte[] buf = new byte[max];
                    int bufIndex = 0;
                    //特殊处理: root为叶子节点,仅一个节点
                    if(root.isLeaf()){
                        long p = bitBuffer.position();
                        long limit = bitBuffer.limit();
                        for(;p<limit;p++)
                            buf[bufIndex++] = (byte)root.ch;
                        return ByteBuffer.wrap(buf,0,bufIndex);
                    }
                    //解码
                    while(bitBuffer.hasRemaining()){
                        Node x = root;
                        //此处每次读取1bit位处理,效率低,多线程并行优化
                        while(!x.isLeaf() && bitBuffer.hasRemaining()){
                            boolean bit = bitBuffer.get(1) == 1;
                            if(bit) x = x.right;
                            else x = x.left;
                        }
                        buf[bufIndex++] = (byte)x.ch;
                    }
                    return ByteBuffer.wrap(buf,0,bufIndex);
                });
                blockingQueue.put(future); //将任务放入阻塞队列,put放入速度比take处理速度快
                if(!thread.isAlive()) thread.start(); //启动数据写入线程
            }
            latch.await();
        } catch(Exception e){
            e.printStackTrace();
        }
    }

    //压缩: 文件夹/文件
    public void compress(File inFile,FileOutputStream out){
        Queue<Map.Entry<Integer,File>> queue = new ArrayDeque<>();
        Queue<File> fileQueue = new ArrayDeque<>();
        queue.offer(new AbstractMap.SimpleEntry<>(-1,inFile));
        //对文件名再编码: 防止同名文件名导致的问题
        Integer dKey = 0;//文件夹,被3整除
        Integer fKey = 1;//非空文件,余1
        Integer eKey = 2;//空文件,余2
        Integer key; //当前文件对应的编码
        StringBuilder table = new StringBuilder();
        StringBuilder paths = new StringBuilder();
        while(!queue.isEmpty()){
            Map.Entry<Integer,File> entity = queue.poll();
            Integer parentKey = entity.getKey();
            File file = entity.getValue();
            if(file.isDirectory()){
                key = dKey;
                dKey+=3;
                File[] files = file.listFiles();
                for(File f : files) queue.offer(new AbstractMap.SimpleEntry<>(key,f));
            }
            else {
                if (file.length() == 0) {//空文件
                    key = eKey;
                    eKey += 3;
                } else {
                    key = fKey;
                    fKey += 3;
                    fileQueue.offer(file);
                }
            }
            paths.append(parentKey);
            paths.append("/");
            paths.append(key);
            paths.append("?");

            table.append(key);
            table.append(":");
            table.append(file.getName());
            table.append("<");
        }

        table.append(">");
        table.append(paths);

        try{
            //记录文件名长度(int值)+文件名
            byte[] pathsBuff = table.toString().getBytes();
            int len = pathsBuff.length;
            byte[] head = new byte[]{(byte)(len >>> 24),(byte)(len >>> 16),(byte)(len >>> 8),(byte)len};
            out.write(head);
            out.write(pathsBuff);

            //依次压缩文件
            while(!fileQueue.isEmpty()){
                File file = fileQueue.poll();
                long length = 0;
                int capacity;
                try(FileChannel channel = new FileInputStream(file).getChannel()){
                    long size = channel.size();
                    //大文件分段压缩
                    while(length < size){
                        capacity = (int) ((length += max) <= size ? max : size - length + max);
                        MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY,length-max,capacity);
                        //统计频率
                        int[] freq = new int[R];
                        for (int i = 0; i < capacity; ++i)
                            freq[toUnsigned(mappedByteBuffer.get())]++;
                        //创建huffman树
                        Node root = buildTrie(freq);
                        //生成编码表(是用long类型,防止编码过长)
                        long[] st = new long[R];
                        buildCodeTable(root, st, 0L, 0);
                        //写入huffman树
                        BitBuffer bitBuffer = new BitBuffer(capacity+(R << 1));
                        bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
                        writeTrie(root,bitBuffer);
                        //压缩数据放入字节数组
                        long mixCode,code;
                        int bit;
                        mappedByteBuffer.flip();
                        for(int i = 0;i<capacity;++i){
                            mixCode = st[toUnsigned(mappedByteBuffer.get())];
                            bit = (int) (mixCode & 0xff);
                            code = mixCode >>> 8;
                            bitBuffer.put(code,bit);
                        }
                        //维护头部5字节
                        long gap = bitBuffer.position() - 40;
                        byte tail = (byte) (gap & 7);
                        //压缩数据长度(不包含头部长度5)
                        int compressLength = tail == 0 ? (int)(gap >>> 3) : (int)((gap >>> 3) + 1);
                        long position = bitBuffer.position();
                        bitBuffer.position(0);
                        //若到文件末尾时标记0001
                        if(length >= size) {
                            bitBuffer.put(1,4);
                            bitBuffer.put(tail,4);
                        }else{
                            bitBuffer.put(tail,8);
                        }
                        bitBuffer.put(compressLength >>> 16,16);
                        bitBuffer.put(compressLength,16);
                        //写入压缩数据
                        byte[] buf = bitBuffer.array();
                        int maxLength = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
                        //分段写入
                        for(int k = 0;k<maxLength;k+=bufferLength){
                            if(k + bufferLength > maxLength)
                                out.write(buf,k,maxLength - k);
                            else
                                out.write(buf,k,bufferLength);
                        }
                    }

                }catch (Exception e){
                    e.printStackTrace();
                }
            }
        } catch (Exception e){
            e.printStackTrace();
        } finally {
            try {
                out.close();
                System.gc();//通知尽快释放MappedByteBuffer
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //解压: 文件夹/文件
    public void expand(FileInputStream in,File outFile){
        try{
            byte[] headBuff = new byte[4];
            int len = in.read(headBuff);
            if(len != 4) throw new RuntimeException("头部信息缺失");
            ByteBuffer headByte = ByteBuffer.wrap(headBuff);
            //获取文件名字符串长度
            int length = headByte.getInt();
            byte[] pathByte = new byte[length];
            //读取文件名字符串
            len = in.read(pathByte);
            if (len != length) throw new RuntimeException("读取压缩信息失败");
            String[] arrs = new String(pathByte).split(">");
            HashMap<Integer,String> hashMap = new HashMap<>(); //值: 文件名
            Queue<Map.Entry<Integer,File>> dirsQueue = new ArrayDeque<>();//保存文件夹键值
            //创建对照表
            for(String e : arrs[0].split("<")){
                String[] entity = e.split(":");
                hashMap.put(Integer.parseInt(entity[0]),entity[1]);
            }
            //创建文件
            File f;
            Integer key = -1;
            for(String e : arrs[1].split("\\?")){
                String[] entity = e.split("/");
                int parentKey = Integer.parseInt(entity[0]);
                int childKey = Integer.parseInt(entity[1]);
                int mod = childKey % 3;
                if(parentKey == -1 && mod == 0){ //处理文件夹根目录
                    f = new File(outFile,hashMap.get(childKey));
                    f.mkdir();
                    dirsQueue.offer(new AbstractMap.SimpleEntry<>(childKey,f));
                    outFile = f;
                    key = childKey;
                    continue;
                }
                Map.Entry<Integer,File> entry = null;
                while(key != parentKey){
                    entry = dirsQueue.poll();
                    key = entry.getKey();
                }
                outFile = entry == null ? outFile : entry.getValue();
                //处理
                f = new File(outFile,hashMap.get(childKey));
                if(mod == 0){ //文件夹
                    f.mkdir();
                    dirsQueue.offer(new AbstractMap.SimpleEntry<>(childKey,f));
                }
                else if(mod == 1){ //文件直接解压
                    f.createNewFile();
                    expandFile(in,new FileOutputStream(f));
                }
                else f.createNewFile(); //空文件
            }

        } catch (Exception e){
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }
    public void expandFile(InputStream in,OutputStream out){
        //确定线程数与容量
        int capacity = Runtime.getRuntime().availableProcessors();
        LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
        CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
        //数据写入线程: 负责将解压数据写入
        Thread thread = new Thread(() ->{
            try {
                while(!blockingQueue.isEmpty()) {
                    Future<ByteBuffer> future = blockingQueue.poll();
                    ByteBuffer byteBuffer = future.get();
                    //写入
                    byte[] buf = byteBuffer.array();
                    int maxLength = byteBuffer.limit();
                    //分段写入
                    for(int k = 0;k<maxLength;k+=bufferLength){
                        if(k + bufferLength > maxLength)
                            out.write(buf,k,maxLength - k);
                        else
                            out.write(buf,k,bufferLength);
                    }
                }
                latch.countDown();
            } catch (Exception e) { //异常不做处理
                e.printStackTrace();
            }
        });

        try {
            //解压线程池: 负责解压数据
            ExecutorService executor = Executors.newWorkStealingPool(capacity);
            while(true) {
                //大文件可能分多段压缩,确定当前段长度
                byte[] head = new byte[5];
                int length = in.read(head); //读取头部信息
                if(length == -1 || length == 0) break;
                ByteBuffer headByte = ByteBuffer.wrap(head);

                int bits = headByte.get();//获取尾部bit位
                boolean flag = false;
                if(bits > 7) {
                    bits &= 0xf;
                    flag = true;
                }

                int tailBit = bits;//获取尾部bit位
                if (length != 5) throw new RuntimeException("头部信息缺失");
                length = headByte.getInt();//获取到当前段压缩字节长度
                byte[] buff = new byte[length]; //读取当前压缩段数据
                //读取压缩段
                long len = in.read(buff);
                if(length == -1 || length == 0) break;
                if (len != length) throw new RuntimeException("读取压缩信息失败");

                //解压并写入数据到byte数组: 并行提高效率
                Future<ByteBuffer> future = executor.submit(()->{
                    long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
                    BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
                    //读取huffman树
                    Node root = readTrie(bitBuffer);
                    //解压后先写入byte数组
                    byte[] buf = new byte[max];
                    int bufIndex = 0;
                    //特殊处理: root为叶子节点,仅一个节点
                    if(root.isLeaf()){
                        long p = bitBuffer.position();
                        long limit = bitBuffer.limit();
                        for(;p<limit;p++)
                            buf[bufIndex++] = (byte)root.ch;
                        return ByteBuffer.wrap(buf,0,bufIndex);
                    }
                    //解码
                    while(bitBuffer.hasRemaining()){
                        Node x = root;
                        //此处每次读取1bit位处理,效率低,多线程并行优化
                        while(!x.isLeaf() && bitBuffer.hasRemaining()){
                            boolean bit = bitBuffer.get(1) == 1;
                            if(bit) x = x.right;
                            else x = x.left;
                        }
                        buf[bufIndex++] = (byte) x.ch;
                    }
                    return ByteBuffer.wrap(buf,0,bufIndex);
                });
                blockingQueue.put(future); //将任务放入阻塞队列,put放入速度必定比take处理速度快
                if(!thread.isAlive()) thread.start(); //启动数据写入线程

                if(flag) break;
            }

            latch.await();
        } catch (Exception e) {//异常不做处理
            e.printStackTrace();
        } finally {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    //byte转无符号
    private int toUnsigned(byte value){ return value < 0 ? value & 0xff : value; }
    //根据统计的频率freq构建huffman树
    private Node buildTrie(int[] freq){
        PriorityQueue<Node> pq = new PriorityQueue<>();
        for(char c=0;c<R;++c)
            if (freq[c] > 0) pq.offer(new Node(c, freq[c], null, null));
        while(pq.size() > 1){
            Node left = pq.poll();
            Node right = pq.poll();
            Node parent = new Node('\0',left.freq+right.freq,left,right);
            pq.offer(parent);
        }
        return pq.poll();
    }
    //根据huffman树构建huffman码表,st为码表,int值的前3个字节保存码值,最后1个字节保存码值长度
    private void buildCodeTable(Node x,long[] st,long code,int length){
        if(!x.isLeaf()){
            code <<= 1;
            buildCodeTable(x.left,st,code,length+1);
            buildCodeTable(x.right,st,code+1,length+1);
        }
        else if(length == 0){ //只有一个根节点
            st[x.ch] = 1<<8|1;
        }
        else {
            if (length > 56) throw new RuntimeException("错误,编码过长超过long[] st无法保存码表");
            st[x.ch] = code<<8|length; //7个字节(码值)+1个字节(码值长度)
        }
    }
    //将huffman树存储到字节数组中,非叶子节点0,叶子节点1后面紧跟8bit位为叶子节点的码值
    private void writeTrie(Node x,BitBuffer buff){
        if(x.isLeaf()){
            buff.put(1,1);
            buff.put(x.ch,8);
            return;
        }
        else buff.position(buff.position()+1);
        writeTrie(x.left,buff);
        writeTrie(x.right,buff);
    }
    //读取头部存储的huffman树并创建(读取到叶子节点会自动递归结束)
    private Node readTrie(BitBuffer buff){
        boolean isLeaf = buff.get(1) == 1;
        if(isLeaf){
            Node node = new Node((char)buff.get(8),-1,null,null);
            return node;
        }
        else return new Node('\0',-1,readTrie(buff),readTrie(buff));
    }
}

2.LZW算法

使用到的三向单词查找树(仅保留用到的方法)-稍作改动直接处理字节数组

/**
 * 稍作改动的三向字符串查找树
 * 概述: 稍作改动可以直接处理字符数组
 */
public class TST<V> {
    private Node<V> root;//根节点
    private class Node<V>{  //节点
        private char c; //字符
        private Node<V> left; //左连接 <c
        private Node<V> mid;  //中连接 =c
        private Node<V> right;//右连接 >c
        private V value;    //保存键值
    }

    //获取: 键-buff[begin,end]组成的字符串
    public V get(byte[] buff,int begin,int end){
        Node x = get(root,buff,begin,end);
        if(x != null) return (V)x.value;
        if(x == null) System.out.println("x为null" + begin + ":" + end);
        return null;
    }
    private Node get(Node x,byte[] buff,int begin,int end){
        if(buff == null || buff.length == 0 || begin > end) return null;
        if(x == null) {
            System.out.println("x.为null" + begin+":"+end);
            return null;
        }
        char c = toUnsigned(buff[begin]);
        if(x.c > c) return get(x.left,buff,begin,end);
        else if(x.c < c) return get(x.right,buff,begin,end);
        else if(begin < end) return get(x.mid,buff,begin+1,end);
        else {
            if(x.value == null) System.out.println("begin:"+ begin+" end:"+end);
            return x;
        }
    }
    //插入: 键-byte数组buff[begin,end]组成的字符串,值-value
    public void put(byte[] buff,int begin,int end,V value){
        if(buff == null || buff.length == 0 || begin > end) return;
        root = put(root,buff,begin,end,value);
    }
    private Node put(Node x,byte[] buff,int begin,int end,V value){
        char c = toUnsigned(buff[begin]);
        if(x == null){
            x = new Node();
            x.c = c;
        }
        if(x.c > c) x.left = put(x.left,buff,begin,end,value);
        else if(x.c < c) x.right = put(x.right,buff,begin,end,value);
        else if(begin < end) x.mid = put(x.mid,buff,begin+1,end,value);
        else x.value = value;
        return x;
    }
    //获取最长前缀并所在byte数组的索引
    public int longestPrefixof(byte[] buff,int begin,int end){
        if(buff == null || buff.length == 0 || begin > end) return -1;
        int index = 0;
        Node x = root;
        while(x != null && begin <= end){
            char c = toUnsigned(buff[begin]);
            if(x.c > c) x = x.left;
            else if(x.c < c) x = x.right;
            else{
                if(x.value != null) index = begin;
                begin++;
                x = x.mid;
            }
        }
        return index;
    }

    //插入
    public void put(String key, V value){
        if(key == null || key.length() == 0) return;
        root = put(root,key,value,0);
    }
    /**插入: 与获取思路相同,只是遇到空节点创建新节点,到末尾时保存键值*/
    private Node put(Node x,String key,V value,int d){
        char c = key.charAt(d);
        if(x == null) { //没有节点就创建
            x = new Node();
            x.c = c;
        }
        if(x.c > c) x.left = put(x.left,key,value,d);
        else if(x.c < c) x.right = put(x.right,key,value,d);
        else if(d < key.length()-1) x.mid = put(x.mid,key,value,d+1);
        else x.value = value;   //末尾时保存键值
        return x;
    }

    //byte转无符号
    private char toUnsigned(byte value){ return (char)(value < 0 ? value & 0xff : value); }
}
2.1.LZW算法-12位定长bit位编码
/**
 * LZW算法
 * 概述: gif格式用到此算法思想,适合有大量重复值的数据,如果重复数据很少,反而会有反效果
 * 实现: 1.使用定长的12bit位码表,码表满时,停止扩展,不适合大文件
 *      2.JDK1.7+之后substring方法性能下降,在此直接处理字节数组,不再像算法4中转为字符串处理
 *      3.因为要处理字节数组,因此对三向字符串查找树稍作更改,直接处理字符
 */
public class LZW {
    private int buffLength = 8192;
    private int R = 256;
    private int W = 12;
    private int L = 1 << W;
    /** 压缩 **/
    public void compress(InputStream in, OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            int length = in.read(buff);
            //基础表
            TST<Integer> st = new TST<>(); //稍作更改的三向单词查找树(直接处理字节数组)
            for(int i=0;i<R;++i)
                st.put(""+(char)i,i);
            int code = R+1;
            BitBuffer bitBuffer = new BitBuffer(length << 1);
            //压缩
            for(int i=0;i<length;){
                //获取最长前缀索引
                int index = st.longestPrefixof(buff,i,length-1);
                //压缩后写入缓冲区
                Integer value = st.get(buff,i,index);
                bitBuffer.put(value,W);
                //扩展表
                if(index < length-1 && code < L) //W位表满之后就不再扩展
                    st.put(buff,i,index+1,code++);
                i = index + 1;
            }
            long position = bitBuffer.position();
            int len = (int)((position & 7 ) == 0 ? position >>> 3 : (position >>> 3) + 1);
            byte[] buf = bitBuffer.array();
            //分段写入
            for(int k = 0;k<len;k+=buffLength){
                if(k + buffLength > len)
                    out.write(buf,k,len - k);
                else
                    out.write(buf,k,buffLength);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    /** 解压 **/
    public void expand(InputStream in,OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            int length = in.read(buff);
            String[] st = new String[L]; //确定码表长度,直接使用定长数组存储解码表
            int i;
            //基础表
            for(i=0;i<R;++i)
                st[i] = ""+(char)i;
            st[i++] = "";
            BitBuffer bitBuffer = BitBuffer.wrap(buff,length << 3);
            if(bitBuffer.remaining() < W) return; //达到末尾结束
            //先取出第一个编码字符
            int codeword = (int) bitBuffer.get(W);
            String val = st[codeword]; //解码第一个字符
            StringBuilder builder = new StringBuilder(); //存储解码后的字符串
            while(true){
                builder.append(val);
                if(bitBuffer.remaining() < W) break; //达到末尾结束
                codeword = (int) bitBuffer.get(W); //读取一个编码字符
                String s = st[codeword];
                //扩展表
                if(i == codeword) s = val+ val.charAt(0); //特殊情况,当前解码字符刚好是下一个要加入码表的字符
                if(i < L) st[i++] = val + s.charAt(0);
                val = s;
                if(builder.length() > buffLength){ //分次写入
                    out.write(builder.toString().getBytes("ISO8859-1"));
                    builder.delete(0,builder.length());
                }
            }
            out.write(builder.toString().getBytes("ISO8859-1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}
2.2.LZW算法-从9bit位开始变长编码
/**
 * LZW算法
 * 概述: gif格式用到此算法思想,适合有大量重复值的数据,如果重复数据很少,反而会有反效果
 * 实现: 1.在LZW算法基础上使用变长的编码,从9bit位开始,用完时扩展,并使用最后一位L-1作为变更标记
 *      2.JDK1.7+之后substring方法性能下降,在此直接处理字节数组,不再像算法4中转为字符串处理
 *      3.压缩性能有待改进
 */
public class LZW {
    private int buffLength = 8192; //缓冲大小8M
    private int R = 256; //基本码表
    private int W = 9; //从9bit位开始扩展码表
    private int maxW = 24; //最大扩展bit位
    private int L = 1 << W; //码表大小

    /** 压缩 **/
    public void compress(InputStream in, OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            int length = in.read(buff);
            //基础表
            TST<Integer> st = new TST<>(); //稍作更改的三向单词查找树(直接处理字节数组)
            for(int i=0;i<R;++i)
                st.put(""+(char)i,i);
            int code = R+1;
            BitBuffer bitBuffer = new BitBuffer(length << 1);
            //压缩
            for(int i=0;i<length;){
                //获取最长前缀索引
                int index = st.longestPrefixof(buff,i,length-1);
                //压缩后写入缓冲区
                Integer value = st.get(buff,i,index);
                bitBuffer.put(value,W);
                //扩展表
                if(index < length-1 && code < L-1) //W位表没有满就扩展码表
                    st.put(buff,i,index+1,code++);
                else if(code == L-1){   //L-1标记当前M bit位码表编满,扩展码表为M+1 bit位
                    bitBuffer.put(L-1,W);
                    if(W == maxW){ //达到最大就重新创建码表
                        st = new TST<>();
                        for(int k=0;k<R;++k)
                            st.put(""+(char)k,k);
                        code = R+1;
                        W = 9;
                    }
                    else {  //没有达到最大就增加bit位扩展码表
                        code++;
                        if (index < length - 1)
                            st.put(buff, i, index + 1, code++);
                        W++;
                    }
                    L = 1 << W;
                }
                i = index + 1;
            }
            long position = bitBuffer.position();
            int len = (int)((position & 7 ) == 0 ? position >>> 3 : (position >>> 3) + 1);
            byte[] buf = bitBuffer.array();
            //分段写入
            for(int k = 0;k<len;k+=buffLength){
                if(k + buffLength > len)
                    out.write(buf,k,len - k);
                else
                    out.write(buf,k,buffLength);
            }

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** 解压 **/
    public void expand(InputStream in,OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            long length = in.read(buff);
            //不确定码表总大小,使用hashMap
            HashMap<Integer,String> st = new HashMap<>(L);
            int i;
            //基础表
            for(i=0;i<R;++i)
                st.put(i,""+(char)i);
            st.put(i++,"");
            BitBuffer bitBuffer = BitBuffer.wrap(buff,length << 3);
            if(bitBuffer.remaining() < W) return; //达到末尾结束
            //先取出第一个编码字符
            int codeword = (int) bitBuffer.get(W);
            String val = st.get(codeword); //解码第一个字符
            StringBuilder builder = new StringBuilder(); //存储解码后的字符串
            while(true){
                builder.append(val); //解码后存入StringBuilder
                if(bitBuffer.remaining() < W) break; //达到末尾结束
                codeword = (int) bitBuffer.get(W); //读取一个编码字符
                if(codeword == L-1){ //达到最大码表,增加一bit位或者重置
                    if(W == maxW){ //达到最大bit位重置码表
                        W = 9;
                        L = 1 << W;
                        st = new HashMap<>(L);
                        //基础表
                        for(i=0;i<R;++i)
                            st.put(i,""+(char)i);
                        st.put(i++,"");
                        if(bitBuffer.remaining() < W) return; //结束
                        codeword = (int) bitBuffer.get(W);
                        val = st.get(codeword);
                        continue;
                    }
                    else {
                        W++;
                        L = 1 << W;
                        if(bitBuffer.remaining() < W) return; //结束
                        codeword = (int) bitBuffer.get(W);
                        i++; //L-1作为标记,不再使用跳过
                    }
                }
                String s = st.get(codeword);
                //扩展表
                if(i == codeword) s = val+ val.charAt(0); //特殊情况,当前解码字符刚好是下一个要加入码表的字符
                if(i < L) st.put(i++,val+s.charAt(0));
                val = s; //更新
                if(builder.length() > buffLength){ //分次写入
                    out.write(builder.toString().getBytes("ISO8859-1"));
                    builder.delete(0,builder.length());
                }
            }
            //写入最后部分
            out.write(builder.toString().getBytes("ISO8859-1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

文章作者: Bryson
版权声明: 本博客所有文章除特別声明外,均采用 CC BY 4.0 许可协议。转载请注明来源 Bryson !
评论
 上一篇
并发-对象的共享 并发-对象的共享
非原子的64操作 非volatile类型的64位数值变量(double和long),JVM允许将64位的读或写操作分解为两个32位操作,因此在多线程中使用共享且可变的long与double类型的变量是不安全的,需要使用volatile声明或
2019-08-10
下一篇 
正则表达式 正则表达式
正则表达式 对于任意正则表达式都存在一个与之对应的非确定有限状态自动机(NFA),反之亦然 1.正则匹配/** * 正则表达式的模式匹配 * 概述: 每个正则表达式都对应一个NFA,反之亦然 * 思路: 将正则表达式转换为有向图(仅
2019-07-16
  目录