数据压缩

算法字符串

算法

发布日期: 2019-07-30

更新日期: 2020-04-20

文章字数: 6.4k

阅读时长: 32 分

阅读次数:

在此主要介绍使用Huffman编码压缩和LZW压缩算法

地址: 整合为JUI程序(Java 8+)

1.Huffman压缩(霍夫曼压缩)

1.1.BitBuffer辅助类 - 按bit位处理字节数组

/**
 * BitterBuffer
 * 按bit位处理字节数组,在此仅实现使用到的方法
 */
public class BitBuffer {
    private long mark = -1;
    private long position = 0;
    private long limit;
    private long capacity;

    private byte[] hb;

    public BitBuffer(int byteCapacity) {
        this.hb = new byte[byteCapacity];
        this.limit = (long)byteCapacity << 3;
        this.capacity = limit;
    }
    private BitBuffer(byte[] array,long bitLength){
        this.hb = array;
        this.limit = bitLength;
        this.capacity = limit;
    }
    public static BitBuffer wrap(byte[] array, long bitLength){
        return new BitBuffer(array,bitLength);
    }

    //byte转无符号
    private int toUnsigned(byte value){ return value < 0 ? value & 0xff : value; }
    //转ByteBuffer
    public ByteBuffer toByteBuffer(){
        int length = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
        ByteBuffer byteBuffer = ByteBuffer.wrap(hb,0,length);
        return byteBuffer;
    }
    //获取底层数组
    public byte[] array(){ return hb; }

    //取出bits位的值返回
    public long get(int bits){
        if(position + bits > limit) throw new RuntimeException("超出BitBuffer范围无法取出");
        if(bits > 64) throw new RuntimeException("单次取出超过64bit位,无法取出");
        int index = (int) (position >>> 3);
        int remain = (int)(8 - (position & 7));
        //更新position
        position += bits;
        int num = remain - bits;
        if(num >= 0) return (toUnsigned(hb[index]) >>> num) & ((1 << bits) - 1);
        else{
            long code = toUnsigned(hb[index]) & ((1 << remain) - 1);
            return get(code,-num,index+1);
        }
    }
    private long get(long code,int bits,int index){
        if(bits <= 8){
            int num = 8 - bits;
            int tail = (toUnsigned(hb[index]) >>> num) & ((1 << bits) - 1);
            return (code << bits) | tail;
        }
        code = (code << 8) | toUnsigned(hb[index]);
        return get(code,bits-8,index+1);
    }

    //将code的bits位(低位->高位数bits位)放入
    public void put(long code,int bits){
        if(position + bits > limit) throw new RuntimeException("超出BitBuffer范围无法取出");
        if(bits > 64) throw new RuntimeException("单次放入超过64bit位,无法放入");
        int index = (int)(position >>> 3);
        int remain = (int)(8 - (position & 7));
        //更新position
        position += bits;
        int num = remain - bits;
        if(num >= 0) hb[index] |= code << num;
        else{
            hb[index] |= code >>> -num;
            put(code,-num,index+1);
        }
    }
    private void put(long code,int bits,int index){
        if(bits <= 8){
            hb[index] |= code << 8-bits;
            return;
        }
        hb[index] |= code >>> (bits-=8);
        put(code,bits,index+1);
    }
    public boolean hasRemaining(){ return position < limit; }
    //获取当前position值
    public long position(){ return position; }
    //设置当前position值
    public BitBuffer position(long newPosition){
        if ((newPosition > limit) || (newPosition < 0))
            throw new IllegalArgumentException();
        position = newPosition;
        if (mark > position) mark = -1;
        return this;
    }
    public long remaining(){ return limit-position; }
}

1.2.Huffman压缩

/**
 * Huffman压缩
 * 概述: 通过huffman编码来重新表示字符,使得出现频率高的字符编码短,出现少的字符编码长
 *      整体来说,所需的总的bit位是减少的,但对于字符频率均匀的数据几乎没有压缩效果
 * 实现: 压缩后头部第1个字节存储压缩后结尾长度(结尾时可能一个字节没占满),
 *      紧接着4个字节存储压缩后字节长度(对于大文件分批次构建不同huffman树进行压缩,确定长度,可以分批次解压)
 * 细节: 分别使用bio与nio实现,解压时使用多线程并行优化
 */
public class Huffman {
    private int max = 1 << 24; //单次最大读取字节数,对nio性能影响大(ByteBuffer大小影响)
    private int R = 256;//码表范围
    private int bufferLength = 8192; //写入时缓冲区大小8K

    //huffman节点类
    private class Node implements Comparable<Node>{
        private char ch;
        private int freq;
        private Node left;
        private Node right;

        public Node(char ch, int freq, Node left, Node right) {
            this.ch = ch;
            this.freq = freq;
            this.left = left;
            this.right = right;
        }
        public boolean isLeaf(){
            return left == null && right == null;
        }
        @Override
        public int compareTo(Node o) { return this.freq - o.freq; }
    }

    //压缩: 使用bio
    public void compress(InputStream in, OutputStream out){
        int length;
        byte[] buff = null;
        try{
            while (true) {
                //对于大文件分批次构建huffman树进行压缩
                length = (length = in.available()) > max ? max : length;
                if(buff == null) buff = new byte[length];
                length = in.read(buff);
                if(length == -1 || length == 0) break; //读取结束
                //统计频率
                int[] freq = new int[R];
                for (int i = 0; i < length; ++i)
                    freq[toUnsigned(buff[i])]++;
                //创建huffman树
                Node root = buildTrie(freq);
                //生成编码表(是用long类型,防止code超过范围)
                long[] st = new long[R];
                buildCodeTable(root, st, 0L, 0);
                //写入huffman树
                BitBuffer bitBuffer = new BitBuffer(length+(R << 1));
                bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
                writeTrie(root,bitBuffer);
                //写入压缩数据
                long mixCode,code;
                int bit;
                for(int i = 0;i<length;++i){
                    mixCode = st[toUnsigned(buff[i])];
                    bit = (int) (mixCode & 0xff);
                    code = mixCode >>> 8;
                    bitBuffer.put(code,bit);
                }
                //维护头部5字节
                long gap = bitBuffer.position() - 40;
                byte tail = (byte) (gap & 7);
                //压缩数据长度(不包含头部长度5)
                int compressLength = (int) (tail == 0 ? gap >>> 3 : (gap >>> 3) + 1);
                long position = bitBuffer.position();
                bitBuffer.position(0);
                bitBuffer.put(tail,8);
                bitBuffer.put(compressLength >>> 16,16);
                bitBuffer.put(compressLength,16);
                //写入压缩数据
                byte[] buf = bitBuffer.array();
                int maxLength = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
                //分段写入
                for(int k = 0;k<maxLength;k+=bufferLength){
                    if(k + bufferLength > maxLength)
                        out.write(buf,k,maxLength - k);
                    else
                        out.write(buf,k,bufferLength);
                }

            }
        } catch (IOException e){
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //压缩: 使用nio
    public void nioCompress(FileInputStream in, FileOutputStream out){
        long length = 0;
        int capacity;
        try (FileChannel channel = in.getChannel();FileChannel outChannel = out.getChannel()) {
            long size = channel.size();
            //大文件分段压缩
            while(length < size){
                capacity = (int) ((length += max) <= size ? max : size - length + max);
                MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY,length-max,capacity);
                //统计频率
                int[] freq = new int[R];
                for (int i = 0; i < capacity; ++i)
                    freq[toUnsigned(mappedByteBuffer.get())]++;
                //创建huffman树
                Node root = buildTrie(freq);
                //生成编码表(是用long类型,防止编码过长)
                long[] st = new long[R];
                buildCodeTable(root, st, 0L, 0);
                //写入huffman树
                BitBuffer bitBuffer = new BitBuffer(capacity+(R << 1));
                bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
                writeTrie(root,bitBuffer);
                //压缩数据放入字节数组
                long mixCode,code;
                int bit;
                mappedByteBuffer.flip();
                for(int i = 0;i<capacity;++i){
                    mixCode = st[toUnsigned(mappedByteBuffer.get())];
                    bit = (int) (mixCode & 0xff);
                    code = mixCode >>> 8;
                    bitBuffer.put(code,bit);
                }
                //维护头部5字节
                long gap = bitBuffer.position() - 40;
                byte tail = (byte) (gap & 7);
                //压缩数据长度(不包含头部长度5)
                int compressLength = tail == 0 ? (int)(gap >>> 3) : (int)((gap >>> 3) + 1);
                long position = bitBuffer.position();
                bitBuffer.position(0);
                bitBuffer.put(tail,8);
                bitBuffer.put(compressLength >>> 16,16);
                bitBuffer.put(compressLength,16);
                //写入压缩数据
                bitBuffer.position(position);
                ByteBuffer byteBuffer = bitBuffer.toByteBuffer();
                outChannel.write(byteBuffer);
            }

        } catch(IOException e){
            e.printStackTrace();
        }
    }

    //解压: 使用bio
    public void expand(InputStream in,OutputStream out){
        //确定线程数与容量
        int capacity = Runtime.getRuntime().availableProcessors();
        LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
        CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
        //数据写入线程: 负责将解压数据写入
        Thread thread = new Thread(() ->{
            try {
                while(!blockingQueue.isEmpty()) {
                    Future<ByteBuffer> future = blockingQueue.poll();
                    ByteBuffer byteBuffer = future.get();
                    //写入
                    byte[] buf = byteBuffer.array();
                    int maxLength = byteBuffer.limit();
                    //分段写入
                    for(int k = 0;k<maxLength;k+=bufferLength){
                        if(k + bufferLength > maxLength)
                            out.write(buf,k,maxLength - k);
                        else
                            out.write(buf,k,bufferLength);
                    }
                }
                latch.countDown();
            } catch (Exception e) { //异常不做处理
                e.printStackTrace();
            }
        });

        try {
            //解压线程池: 负责解压数据
            ExecutorService executor = Executors.newWorkStealingPool(capacity);
            while(true) {
                //大文件可能分多段压缩,确定当前段长度
                byte[] head = new byte[5];
                int length = in.read(head); //读取头部信息
                if(length == -1 || length == 0) break;
                ByteBuffer headByte = ByteBuffer.wrap(head);
                int tailBit = headByte.get();//获取尾部bit位
                if (length != 5) throw new RuntimeException("头部信息缺失");
                length = headByte.getInt();//获取到当前段压缩字节长度
                byte[] buff = new byte[length]; //读取当前压缩段数据
                //读取压缩段
                long len = in.read(buff);
                if(length == -1 || length == 0) break;
                if (len != length) throw new RuntimeException("读取压缩信息失败");
                //解压并写入数据到byte数组: 并行提高效率
                Future<ByteBuffer> future = executor.submit(()->{
                    long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
                    BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
                    //读取huffman树
                    Node root = readTrie(bitBuffer);
                    //解压后先写入byte数组
                    byte[] buf = new byte[max];
                    int bufIndex = 0;
                    //特殊处理: root为叶子节点,仅一个节点
                    if(root.isLeaf()){
                        long p = bitBuffer.position();
                        long limit = bitBuffer.limit();
                        for(;p<limit;p++)
                            buf[bufIndex++] = (byte)root.ch;
                        return ByteBuffer.wrap(buf,0,bufIndex);
                    }
                    //解码
                    while(bitBuffer.hasRemaining()){
                        Node x = root;
                        //此处每次读取1bit位处理,效率低,多线程并行优化
                        while(!x.isLeaf() && bitBuffer.hasRemaining()){
                            boolean bit = bitBuffer.get(1) == 1;
                            if(bit) x = x.right;
                            else x = x.left;
                        }
                        buf[bufIndex++] = (byte) x.ch;
                    }
                    return ByteBuffer.wrap(buf,0,bufIndex);
                });
                blockingQueue.put(future); //将任务放入阻塞队列,put放入速度必定比take处理速度快
                if(!thread.isAlive()) thread.start(); //启动数据写入线程
            }

            latch.await();
        } catch (Exception e) {//异常不做处理
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //解压: 使用nio
    public void nioExpand(FileInputStream in,FileOutputStream out){
        //确定线程数与容量
        int capacity = Runtime.getRuntime().availableProcessors();
        LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
        CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
        //数据写入线程: 负责将解压数据写入
        Thread thread = new Thread(() ->{
            try {
                FileChannel outChannel = out.getChannel();
                while(!blockingQueue.isEmpty()) {
                    Future<ByteBuffer> future = blockingQueue.poll();
                    ByteBuffer byteBuffer = future.get();
                    //写入
                    outChannel.write(byteBuffer);
                }
                latch.countDown();
            } catch (Exception e) { //异常不做处理
                e.printStackTrace();
            }
        });


        try(FileChannel inChannel = in.getChannel()){
            //解压线程池: 负责解压数据
            ExecutorService executor = Executors.newWorkStealingPool(capacity);
            while(true) {
                //大文件可能分多段压缩,确定当前段长度
                byte[] head = new byte[5];
                ByteBuffer headByte = ByteBuffer.wrap(head);
                int length = inChannel.read(headByte);
                if(length == -1 || length == 0) break;
                headByte.flip();
                int tailBit = headByte.get();//获取尾部bit位
                if (length != 5) throw new RuntimeException("头部信息缺失");
                length = headByte.getInt();//获取到当前段压缩字节长度
                byte[] buff = new byte[length]; //读取当前压缩段数据
                ByteBuffer dataByte = ByteBuffer.wrap(buff);
                long len = inChannel.read(dataByte);
                if(length == -1 || length == 0) break;
                if (len != length) throw new RuntimeException("读取压缩信息失败");

                //解压并写入数据到byte数组: 并行提高效率
                Future<ByteBuffer> future = executor.submit(()->{
                    long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
                    BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
                    //读取huffman树
                    Node root = readTrie(bitBuffer);
                    //解压后先写入byte数组
                    byte[] buf = new byte[max];
                    int bufIndex = 0;
                    //特殊处理: root为叶子节点,仅一个节点
                    if(root.isLeaf()){
                        long p = bitBuffer.position();
                        long limit = bitBuffer.limit();
                        for(;p<limit;p++)
                            buf[bufIndex++] = (byte)root.ch;
                        return ByteBuffer.wrap(buf,0,bufIndex);
                    }
                    //解码
                    while(bitBuffer.hasRemaining()){
                        Node x = root;
                        //此处每次读取1bit位处理,效率低,多线程并行优化
                        while(!x.isLeaf() && bitBuffer.hasRemaining()){
                            boolean bit = bitBuffer.get(1) == 1;
                            if(bit) x = x.right;
                            else x = x.left;
                        }
                        buf[bufIndex++] = (byte)x.ch;
                    }
                    return ByteBuffer.wrap(buf,0,bufIndex);
                });
                blockingQueue.put(future); //将任务放入阻塞队列,put放入速度比take处理速度快
                if(!thread.isAlive()) thread.start(); //启动数据写入线程
            }
            latch.await();
        } catch(Exception e){
            e.printStackTrace();
        }
    }

    //压缩: 文件夹/文件
    public void compress(File inFile,FileOutputStream out){
        Queue<Map.Entry<Integer,File>> queue = new ArrayDeque<>();
        Queue<File> fileQueue = new ArrayDeque<>();
        queue.offer(new AbstractMap.SimpleEntry<>(-1,inFile));
        //对文件名再编码: 防止同名文件名导致的问题
        Integer dKey = 0;//文件夹,被3整除
        Integer fKey = 1;//非空文件,余1
        Integer eKey = 2;//空文件,余2
        Integer key; //当前文件对应的编码
        StringBuilder table = new StringBuilder();
        StringBuilder paths = new StringBuilder();
        while(!queue.isEmpty()){
            Map.Entry<Integer,File> entity = queue.poll();
            Integer parentKey = entity.getKey();
            File file = entity.getValue();
            if(file.isDirectory()){
                key = dKey;
                dKey+=3;
                File[] files = file.listFiles();
                for(File f : files) queue.offer(new AbstractMap.SimpleEntry<>(key,f));
            }
            else {
                if (file.length() == 0) {//空文件
                    key = eKey;
                    eKey += 3;
                } else {
                    key = fKey;
                    fKey += 3;
                    fileQueue.offer(file);
                }
            }
            paths.append(parentKey);
            paths.append("/");
            paths.append(key);
            paths.append("?");

            table.append(key);
            table.append(":");
            table.append(file.getName());
            table.append("<");
        }

        table.append(">");
        table.append(paths);

        try{
            //记录文件名长度(int值)+文件名
            byte[] pathsBuff = table.toString().getBytes();
            int len = pathsBuff.length;
            byte[] head = new byte[]{(byte)(len >>> 24),(byte)(len >>> 16),(byte)(len >>> 8),(byte)len};
            out.write(head);
            out.write(pathsBuff);

            //依次压缩文件
            while(!fileQueue.isEmpty()){
                File file = fileQueue.poll();
                long length = 0;
                int capacity;
                try(FileChannel channel = new FileInputStream(file).getChannel()){
                    long size = channel.size();
                    //大文件分段压缩
                    while(length < size){
                        capacity = (int) ((length += max) <= size ? max : size - length + max);
                        MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY,length-max,capacity);
                        //统计频率
                        int[] freq = new int[R];
                        for (int i = 0; i < capacity; ++i)
                            freq[toUnsigned(mappedByteBuffer.get())]++;
                        //创建huffman树
                        Node root = buildTrie(freq);
                        //生成编码表(是用long类型,防止编码过长)
                        long[] st = new long[R];
                        buildCodeTable(root, st, 0L, 0);
                        //写入huffman树
                        BitBuffer bitBuffer = new BitBuffer(capacity+(R << 1));
                        bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
                        writeTrie(root,bitBuffer);
                        //压缩数据放入字节数组
                        long mixCode,code;
                        int bit;
                        mappedByteBuffer.flip();
                        for(int i = 0;i<capacity;++i){
                            mixCode = st[toUnsigned(mappedByteBuffer.get())];
                            bit = (int) (mixCode & 0xff);
                            code = mixCode >>> 8;
                            bitBuffer.put(code,bit);
                        }
                        //维护头部5字节
                        long gap = bitBuffer.position() - 40;
                        byte tail = (byte) (gap & 7);
                        //压缩数据长度(不包含头部长度5)
                        int compressLength = tail == 0 ? (int)(gap >>> 3) : (int)((gap >>> 3) + 1);
                        long position = bitBuffer.position();
                        bitBuffer.position(0);
                        //若到文件末尾时标记0001
                        if(length >= size) {
                            bitBuffer.put(1,4);
                            bitBuffer.put(tail,4);
                        }else{
                            bitBuffer.put(tail,8);
                        }
                        bitBuffer.put(compressLength >>> 16,16);
                        bitBuffer.put(compressLength,16);
                        //写入压缩数据
                        byte[] buf = bitBuffer.array();
                        int maxLength = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
                        //分段写入
                        for(int k = 0;k<maxLength;k+=bufferLength){
                            if(k + bufferLength > maxLength)
                                out.write(buf,k,maxLength - k);
                            else
                                out.write(buf,k,bufferLength);
                        }
                    }

                }catch (Exception e){
                    e.printStackTrace();
                }
            }
        } catch (Exception e){
            e.printStackTrace();
        } finally {
            try {
                out.close();
                System.gc();//通知尽快释放MappedByteBuffer
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    //解压: 文件夹/文件
    public void expand(FileInputStream in,File outFile){
        try{
            byte[] headBuff = new byte[4];
            int len = in.read(headBuff);
            if(len != 4) throw new RuntimeException("头部信息缺失");
            ByteBuffer headByte = ByteBuffer.wrap(headBuff);
            //获取文件名字符串长度
            int length = headByte.getInt();
            byte[] pathByte = new byte[length];
            //读取文件名字符串
            len = in.read(pathByte);
            if (len != length) throw new RuntimeException("读取压缩信息失败");
            String[] arrs = new String(pathByte).split(">");
            HashMap<Integer,String> hashMap = new HashMap<>(); //值: 文件名
            Queue<Map.Entry<Integer,File>> dirsQueue = new ArrayDeque<>();//保存文件夹键值
            //创建对照表
            for(String e : arrs[0].split("<")){
                String[] entity = e.split(":");
                hashMap.put(Integer.parseInt(entity[0]),entity[1]);
            }
            //创建文件
            File f;
            Integer key = -1;
            for(String e : arrs[1].split("\\?")){
                String[] entity = e.split("/");
                int parentKey = Integer.parseInt(entity[0]);
                int childKey = Integer.parseInt(entity[1]);
                int mod = childKey % 3;
                if(parentKey == -1 && mod == 0){ //处理文件夹根目录
                    f = new File(outFile,hashMap.get(childKey));
                    f.mkdir();
                    dirsQueue.offer(new AbstractMap.SimpleEntry<>(childKey,f));
                    outFile = f;
                    key = childKey;
                    continue;
                }
                Map.Entry<Integer,File> entry = null;
                while(key != parentKey){
                    entry = dirsQueue.poll();
                    key = entry.getKey();
                }
                outFile = entry == null ? outFile : entry.getValue();
                //处理
                f = new File(outFile,hashMap.get(childKey));
                if(mod == 0){ //文件夹
                    f.mkdir();
                    dirsQueue.offer(new AbstractMap.SimpleEntry<>(childKey,f));
                }
                else if(mod == 1){ //文件直接解压
                    f.createNewFile();
                    expandFile(in,new FileOutputStream(f));
                }
                else f.createNewFile(); //空文件
            }

        } catch (Exception e){
            e.printStackTrace();
        } finally {
            try {
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }

    }
    public void expandFile(InputStream in,OutputStream out){
        //确定线程数与容量
        int capacity = Runtime.getRuntime().availableProcessors();
        LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
        CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
        //数据写入线程: 负责将解压数据写入
        Thread thread = new Thread(() ->{
            try {
                while(!blockingQueue.isEmpty()) {
                    Future<ByteBuffer> future = blockingQueue.poll();
                    ByteBuffer byteBuffer = future.get();
                    //写入
                    byte[] buf = byteBuffer.array();
                    int maxLength = byteBuffer.limit();
                    //分段写入
                    for(int k = 0;k<maxLength;k+=bufferLength){
                        if(k + bufferLength > maxLength)
                            out.write(buf,k,maxLength - k);
                        else
                            out.write(buf,k,bufferLength);
                    }
                }
                latch.countDown();
            } catch (Exception e) { //异常不做处理
                e.printStackTrace();
            }
        });

        try {
            //解压线程池: 负责解压数据
            ExecutorService executor = Executors.newWorkStealingPool(capacity);
            while(true) {
                //大文件可能分多段压缩,确定当前段长度
                byte[] head = new byte[5];
                int length = in.read(head); //读取头部信息
                if(length == -1 || length == 0) break;
                ByteBuffer headByte = ByteBuffer.wrap(head);

                int bits = headByte.get();//获取尾部bit位
                boolean flag = false;
                if(bits > 7) {
                    bits &= 0xf;
                    flag = true;
                }

                int tailBit = bits;//获取尾部bit位
                if (length != 5) throw new RuntimeException("头部信息缺失");
                length = headByte.getInt();//获取到当前段压缩字节长度
                byte[] buff = new byte[length]; //读取当前压缩段数据
                //读取压缩段
                long len = in.read(buff);
                if(length == -1 || length == 0) break;
                if (len != length) throw new RuntimeException("读取压缩信息失败");

                //解压并写入数据到byte数组: 并行提高效率
                Future<ByteBuffer> future = executor.submit(()->{
                    long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
                    BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
                    //读取huffman树
                    Node root = readTrie(bitBuffer);
                    //解压后先写入byte数组
                    byte[] buf = new byte[max];
                    int bufIndex = 0;
                    //特殊处理: root为叶子节点,仅一个节点
                    if(root.isLeaf()){
                        long p = bitBuffer.position();
                        long limit = bitBuffer.limit();
                        for(;p<limit;p++)
                            buf[bufIndex++] = (byte)root.ch;
                        return ByteBuffer.wrap(buf,0,bufIndex);
                    }
                    //解码
                    while(bitBuffer.hasRemaining()){
                        Node x = root;
                        //此处每次读取1bit位处理,效率低,多线程并行优化
                        while(!x.isLeaf() && bitBuffer.hasRemaining()){
                            boolean bit = bitBuffer.get(1) == 1;
                            if(bit) x = x.right;
                            else x = x.left;
                        }
                        buf[bufIndex++] = (byte) x.ch;
                    }
                    return ByteBuffer.wrap(buf,0,bufIndex);
                });
                blockingQueue.put(future); //将任务放入阻塞队列,put放入速度必定比take处理速度快
                if(!thread.isAlive()) thread.start(); //启动数据写入线程

                if(flag) break;
            }

            latch.await();
        } catch (Exception e) {//异常不做处理
            e.printStackTrace();
        } finally {
            try {
                out.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    //byte转无符号
    private int toUnsigned(byte value){ return value < 0 ? value & 0xff : value; }
    //根据统计的频率freq构建huffman树
    private Node buildTrie(int[] freq){
        PriorityQueue<Node> pq = new PriorityQueue<>();
        for(char c=0;c<R;++c)
            if (freq[c] > 0) pq.offer(new Node(c, freq[c], null, null));
        while(pq.size() > 1){
            Node left = pq.poll();
            Node right = pq.poll();
            Node parent = new Node('\0',left.freq+right.freq,left,right);
            pq.offer(parent);
        }
        return pq.poll();
    }
    //根据huffman树构建huffman码表,st为码表,int值的前3个字节保存码值,最后1个字节保存码值长度
    private void buildCodeTable(Node x,long[] st,long code,int length){
        if(!x.isLeaf()){
            code <<= 1;
            buildCodeTable(x.left,st,code,length+1);
            buildCodeTable(x.right,st,code+1,length+1);
        }
        else if(length == 0){ //只有一个根节点
            st[x.ch] = 1<<8|1;
        }
        else {
            if (length > 56) throw new RuntimeException("错误,编码过长超过long[] st无法保存码表");
            st[x.ch] = code<<8|length; //7个字节(码值)+1个字节(码值长度)
        }
    }
    //将huffman树存储到字节数组中,非叶子节点0,叶子节点1后面紧跟8bit位为叶子节点的码值
    private void writeTrie(Node x,BitBuffer buff){
        if(x.isLeaf()){
            buff.put(1,1);
            buff.put(x.ch,8);
            return;
        }
        else buff.position(buff.position()+1);
        writeTrie(x.left,buff);
        writeTrie(x.right,buff);
    }
    //读取头部存储的huffman树并创建(读取到叶子节点会自动递归结束)
    private Node readTrie(BitBuffer buff){
        boolean isLeaf = buff.get(1) == 1;
        if(isLeaf){
            Node node = new Node((char)buff.get(8),-1,null,null);
            return node;
        }
        else return new Node('\0',-1,readTrie(buff),readTrie(buff));
    }
}

2.LZW算法

使用到的三向单词查找树(仅保留用到的方法)-稍作改动直接处理字节数组

/**
 * 稍作改动的三向字符串查找树
 * 概述: 稍作改动可以直接处理字符数组
 */
public class TST<V> {
    private Node<V> root;//根节点
    private class Node<V>{  //节点
        private char c; //字符
        private Node<V> left; //左连接 <c
        private Node<V> mid;  //中连接 =c
        private Node<V> right;//右连接 >c
        private V value;    //保存键值
    }

    //获取: 键-buff[begin,end]组成的字符串
    public V get(byte[] buff,int begin,int end){
        Node x = get(root,buff,begin,end);
        if(x != null) return (V)x.value;
        if(x == null) System.out.println("x为null" + begin + ":" + end);
        return null;
    }
    private Node get(Node x,byte[] buff,int begin,int end){
        if(buff == null || buff.length == 0 || begin > end) return null;
        if(x == null) {
            System.out.println("x.为null" + begin+":"+end);
            return null;
        }
        char c = toUnsigned(buff[begin]);
        if(x.c > c) return get(x.left,buff,begin,end);
        else if(x.c < c) return get(x.right,buff,begin,end);
        else if(begin < end) return get(x.mid,buff,begin+1,end);
        else {
            if(x.value == null) System.out.println("begin:"+ begin+" end:"+end);
            return x;
        }
    }
    //插入: 键-byte数组buff[begin,end]组成的字符串,值-value
    public void put(byte[] buff,int begin,int end,V value){
        if(buff == null || buff.length == 0 || begin > end) return;
        root = put(root,buff,begin,end,value);
    }
    private Node put(Node x,byte[] buff,int begin,int end,V value){
        char c = toUnsigned(buff[begin]);
        if(x == null){
            x = new Node();
            x.c = c;
        }
        if(x.c > c) x.left = put(x.left,buff,begin,end,value);
        else if(x.c < c) x.right = put(x.right,buff,begin,end,value);
        else if(begin < end) x.mid = put(x.mid,buff,begin+1,end,value);
        else x.value = value;
        return x;
    }
    //获取最长前缀并所在byte数组的索引
    public int longestPrefixof(byte[] buff,int begin,int end){
        if(buff == null || buff.length == 0 || begin > end) return -1;
        int index = 0;
        Node x = root;
        while(x != null && begin <= end){
            char c = toUnsigned(buff[begin]);
            if(x.c > c) x = x.left;
            else if(x.c < c) x = x.right;
            else{
                if(x.value != null) index = begin;
                begin++;
                x = x.mid;
            }
        }
        return index;
    }

    //插入
    public void put(String key, V value){
        if(key == null || key.length() == 0) return;
        root = put(root,key,value,0);
    }
    /**插入: 与获取思路相同,只是遇到空节点创建新节点,到末尾时保存键值*/
    private Node put(Node x,String key,V value,int d){
        char c = key.charAt(d);
        if(x == null) { //没有节点就创建
            x = new Node();
            x.c = c;
        }
        if(x.c > c) x.left = put(x.left,key,value,d);
        else if(x.c < c) x.right = put(x.right,key,value,d);
        else if(d < key.length()-1) x.mid = put(x.mid,key,value,d+1);
        else x.value = value;   //末尾时保存键值
        return x;
    }

    //byte转无符号
    private char toUnsigned(byte value){ return (char)(value < 0 ? value & 0xff : value); }
}

2.1.LZW算法-12位定长bit位编码

/**
 * LZW算法
 * 概述: gif格式用到此算法思想,适合有大量重复值的数据,如果重复数据很少,反而会有反效果
 * 实现: 1.使用定长的12bit位码表,码表满时,停止扩展,不适合大文件
 *      2.JDK1.7+之后substring方法性能下降,在此直接处理字节数组,不再像算法4中转为字符串处理
 *      3.因为要处理字节数组,因此对三向字符串查找树稍作更改,直接处理字符
 */
public class LZW {
    private int buffLength = 8192;
    private int R = 256;
    private int W = 12;
    private int L = 1 << W;
    /** 压缩 **/
    public void compress(InputStream in, OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            int length = in.read(buff);
            //基础表
            TST<Integer> st = new TST<>(); //稍作更改的三向单词查找树(直接处理字节数组)
            for(int i=0;i<R;++i)
                st.put(""+(char)i,i);
            int code = R+1;
            BitBuffer bitBuffer = new BitBuffer(length << 1);
            //压缩
            for(int i=0;i<length;){
                //获取最长前缀索引
                int index = st.longestPrefixof(buff,i,length-1);
                //压缩后写入缓冲区
                Integer value = st.get(buff,i,index);
                bitBuffer.put(value,W);
                //扩展表
                if(index < length-1 && code < L) //W位表满之后就不再扩展
                    st.put(buff,i,index+1,code++);
                i = index + 1;
            }
            long position = bitBuffer.position();
            int len = (int)((position & 7 ) == 0 ? position >>> 3 : (position >>> 3) + 1);
            byte[] buf = bitBuffer.array();
            //分段写入
            for(int k = 0;k<len;k+=buffLength){
                if(k + buffLength > len)
                    out.write(buf,k,len - k);
                else
                    out.write(buf,k,buffLength);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    /** 解压 **/
    public void expand(InputStream in,OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            int length = in.read(buff);
            String[] st = new String[L]; //确定码表长度,直接使用定长数组存储解码表
            int i;
            //基础表
            for(i=0;i<R;++i)
                st[i] = ""+(char)i;
            st[i++] = "";
            BitBuffer bitBuffer = BitBuffer.wrap(buff,length << 3);
            if(bitBuffer.remaining() < W) return; //达到末尾结束
            //先取出第一个编码字符
            int codeword = (int) bitBuffer.get(W);
            String val = st[codeword]; //解码第一个字符
            StringBuilder builder = new StringBuilder(); //存储解码后的字符串
            while(true){
                builder.append(val);
                if(bitBuffer.remaining() < W) break; //达到末尾结束
                codeword = (int) bitBuffer.get(W); //读取一个编码字符
                String s = st[codeword];
                //扩展表
                if(i == codeword) s = val+ val.charAt(0); //特殊情况,当前解码字符刚好是下一个要加入码表的字符
                if(i < L) st[i++] = val + s.charAt(0);
                val = s;
                if(builder.length() > buffLength){ //分次写入
                    out.write(builder.toString().getBytes("ISO8859-1"));
                    builder.delete(0,builder.length());
                }
            }
            out.write(builder.toString().getBytes("ISO8859-1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

}

2.2.LZW算法-从9bit位开始变长编码

/**
 * LZW算法
 * 概述: gif格式用到此算法思想,适合有大量重复值的数据,如果重复数据很少,反而会有反效果
 * 实现: 1.在LZW算法基础上使用变长的编码,从9bit位开始,用完时扩展,并使用最后一位L-1作为变更标记
 *      2.JDK1.7+之后substring方法性能下降,在此直接处理字节数组,不再像算法4中转为字符串处理
 *      3.压缩性能有待改进
 */
public class LZW {
    private int buffLength = 8192; //缓冲大小8M
    private int R = 256; //基本码表
    private int W = 9; //从9bit位开始扩展码表
    private int maxW = 24; //最大扩展bit位
    private int L = 1 << W; //码表大小

    /** 压缩 **/
    public void compress(InputStream in, OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            int length = in.read(buff);
            //基础表
            TST<Integer> st = new TST<>(); //稍作更改的三向单词查找树(直接处理字节数组)
            for(int i=0;i<R;++i)
                st.put(""+(char)i,i);
            int code = R+1;
            BitBuffer bitBuffer = new BitBuffer(length << 1);
            //压缩
            for(int i=0;i<length;){
                //获取最长前缀索引
                int index = st.longestPrefixof(buff,i,length-1);
                //压缩后写入缓冲区
                Integer value = st.get(buff,i,index);
                bitBuffer.put(value,W);
                //扩展表
                if(index < length-1 && code < L-1) //W位表没有满就扩展码表
                    st.put(buff,i,index+1,code++);
                else if(code == L-1){   //L-1标记当前M bit位码表编满,扩展码表为M+1 bit位
                    bitBuffer.put(L-1,W);
                    if(W == maxW){ //达到最大就重新创建码表
                        st = new TST<>();
                        for(int k=0;k<R;++k)
                            st.put(""+(char)k,k);
                        code = R+1;
                        W = 9;
                    }
                    else {  //没有达到最大就增加bit位扩展码表
                        code++;
                        if (index < length - 1)
                            st.put(buff, i, index + 1, code++);
                        W++;
                    }
                    L = 1 << W;
                }
                i = index + 1;
            }
            long position = bitBuffer.position();
            int len = (int)((position & 7 ) == 0 ? position >>> 3 : (position >>> 3) + 1);
            byte[] buf = bitBuffer.array();
            //分段写入
            for(int k = 0;k<len;k+=buffLength){
                if(k + buffLength > len)
                    out.write(buf,k,len - k);
                else
                    out.write(buf,k,buffLength);
            }

        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** 解压 **/
    public void expand(InputStream in,OutputStream out){
        try {
            byte[] buff = new byte[in.available()];
            long length = in.read(buff);
            //不确定码表总大小,使用hashMap
            HashMap<Integer,String> st = new HashMap<>(L);
            int i;
            //基础表
            for(i=0;i<R;++i)
                st.put(i,""+(char)i);
            st.put(i++,"");
            BitBuffer bitBuffer = BitBuffer.wrap(buff,length << 3);
            if(bitBuffer.remaining() < W) return; //达到末尾结束
            //先取出第一个编码字符
            int codeword = (int) bitBuffer.get(W);
            String val = st.get(codeword); //解码第一个字符
            StringBuilder builder = new StringBuilder(); //存储解码后的字符串
            while(true){
                builder.append(val); //解码后存入StringBuilder
                if(bitBuffer.remaining() < W) break; //达到末尾结束
                codeword = (int) bitBuffer.get(W); //读取一个编码字符
                if(codeword == L-1){ //达到最大码表,增加一bit位或者重置
                    if(W == maxW){ //达到最大bit位重置码表
                        W = 9;
                        L = 1 << W;
                        st = new HashMap<>(L);
                        //基础表
                        for(i=0;i<R;++i)
                            st.put(i,""+(char)i);
                        st.put(i++,"");
                        if(bitBuffer.remaining() < W) return; //结束
                        codeword = (int) bitBuffer.get(W);
                        val = st.get(codeword);
                        continue;
                    }
                    else {
                        W++;
                        L = 1 << W;
                        if(bitBuffer.remaining() < W) return; //结束
                        codeword = (int) bitBuffer.get(W);
                        i++; //L-1作为标记,不再使用跳过
                    }
                }
                String s = st.get(codeword);
                //扩展表
                if(i == codeword) s = val+ val.charAt(0); //特殊情况,当前解码字符刚好是下一个要加入码表的字符
                if(i < L) st.put(i++,val+s.charAt(0));
                val = s; //更新
                if(builder.length() > buffLength){ //分次写入
                    out.write(builder.toString().getBytes("ISO8859-1"));
                    builder.delete(0,builder.length());
                }
            }
            //写入最后部分
            out.write(builder.toString().getBytes("ISO8859-1"));
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}

Bryson

https://brysonbs.github.io/2019/07/30/DataCompress/

本博客所有文章除特別声明外，均采用 CC BY 4.0 许可协议。转载请注明来源 Bryson !

算法字符串

并发-对象的共享

非原子的64操作非volatile类型的64位数值变量(double和long),JVM允许将64位的读或写操作分解为两个32位操作,因此在多线程中使用共享且可变的long与double类型的变量是不安全的,需要使用volatile声明或

2019-08-10 并发

并发

正则表达式

正则表达式对于任意正则表达式都存在一个与之对应的非确定有限状态自动机(NFA),反之亦然 1.正则匹配/** * 正则表达式的模式匹配 * 概述: 每个正则表达式都对应一个NFA,反之亦然 * 思路: 将正则表达式转换为有向图(仅

2019-07-16 算法

算法字符串