数据压缩
在此主要介绍使用Huffman编码压缩和LZW压缩算法
1.Huffman压缩(霍夫曼压缩)
1.1.BitBuffer辅助类 - 按bit位处理字节数组
/**
* BitterBuffer
* 按bit位处理字节数组,在此仅实现使用到的方法
*/
public class BitBuffer {
private long mark = -1;
private long position = 0;
private long limit;
private long capacity;
private byte[] hb;
public BitBuffer(int byteCapacity) {
this.hb = new byte[byteCapacity];
this.limit = (long)byteCapacity << 3;
this.capacity = limit;
}
private BitBuffer(byte[] array,long bitLength){
this.hb = array;
this.limit = bitLength;
this.capacity = limit;
}
public static BitBuffer wrap(byte[] array, long bitLength){
return new BitBuffer(array,bitLength);
}
//byte转无符号
private int toUnsigned(byte value){ return value < 0 ? value & 0xff : value; }
//转ByteBuffer
public ByteBuffer toByteBuffer(){
int length = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
ByteBuffer byteBuffer = ByteBuffer.wrap(hb,0,length);
return byteBuffer;
}
//获取底层数组
public byte[] array(){ return hb; }
//取出bits位的值返回
public long get(int bits){
if(position + bits > limit) throw new RuntimeException("超出BitBuffer范围无法取出");
if(bits > 64) throw new RuntimeException("单次取出超过64bit位,无法取出");
int index = (int) (position >>> 3);
int remain = (int)(8 - (position & 7));
//更新position
position += bits;
int num = remain - bits;
if(num >= 0) return (toUnsigned(hb[index]) >>> num) & ((1 << bits) - 1);
else{
long code = toUnsigned(hb[index]) & ((1 << remain) - 1);
return get(code,-num,index+1);
}
}
private long get(long code,int bits,int index){
if(bits <= 8){
int num = 8 - bits;
int tail = (toUnsigned(hb[index]) >>> num) & ((1 << bits) - 1);
return (code << bits) | tail;
}
code = (code << 8) | toUnsigned(hb[index]);
return get(code,bits-8,index+1);
}
//将code的bits位(低位->高位数bits位)放入
public void put(long code,int bits){
if(position + bits > limit) throw new RuntimeException("超出BitBuffer范围无法取出");
if(bits > 64) throw new RuntimeException("单次放入超过64bit位,无法放入");
int index = (int)(position >>> 3);
int remain = (int)(8 - (position & 7));
//更新position
position += bits;
int num = remain - bits;
if(num >= 0) hb[index] |= code << num;
else{
hb[index] |= code >>> -num;
put(code,-num,index+1);
}
}
private void put(long code,int bits,int index){
if(bits <= 8){
hb[index] |= code << 8-bits;
return;
}
hb[index] |= code >>> (bits-=8);
put(code,bits,index+1);
}
public boolean hasRemaining(){ return position < limit; }
//获取当前position值
public long position(){ return position; }
//设置当前position值
public BitBuffer position(long newPosition){
if ((newPosition > limit) || (newPosition < 0))
throw new IllegalArgumentException();
position = newPosition;
if (mark > position) mark = -1;
return this;
}
public long remaining(){ return limit-position; }
}
1.2.Huffman压缩
/**
* Huffman压缩
* 概述: 通过huffman编码来重新表示字符,使得出现频率高的字符编码短,出现少的字符编码长
* 整体来说,所需的总的bit位是减少的,但对于字符频率均匀的数据几乎没有压缩效果
* 实现: 压缩后头部第1个字节存储压缩后结尾长度(结尾时可能一个字节没占满),
* 紧接着4个字节存储压缩后字节长度(对于大文件分批次构建不同huffman树进行压缩,确定长度,可以分批次解压)
* 细节: 分别使用bio与nio实现,解压时使用多线程并行优化
*/
public class Huffman {
private int max = 1 << 24; //单次最大读取字节数,对nio性能影响大(ByteBuffer大小影响)
private int R = 256;//码表范围
private int bufferLength = 8192; //写入时缓冲区大小8K
//huffman节点类
private class Node implements Comparable<Node>{
private char ch;
private int freq;
private Node left;
private Node right;
public Node(char ch, int freq, Node left, Node right) {
this.ch = ch;
this.freq = freq;
this.left = left;
this.right = right;
}
public boolean isLeaf(){
return left == null && right == null;
}
@Override
public int compareTo(Node o) { return this.freq - o.freq; }
}
//压缩: 使用bio
public void compress(InputStream in, OutputStream out){
int length;
byte[] buff = null;
try{
while (true) {
//对于大文件分批次构建huffman树进行压缩
length = (length = in.available()) > max ? max : length;
if(buff == null) buff = new byte[length];
length = in.read(buff);
if(length == -1 || length == 0) break; //读取结束
//统计频率
int[] freq = new int[R];
for (int i = 0; i < length; ++i)
freq[toUnsigned(buff[i])]++;
//创建huffman树
Node root = buildTrie(freq);
//生成编码表(是用long类型,防止code超过范围)
long[] st = new long[R];
buildCodeTable(root, st, 0L, 0);
//写入huffman树
BitBuffer bitBuffer = new BitBuffer(length+(R << 1));
bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
writeTrie(root,bitBuffer);
//写入压缩数据
long mixCode,code;
int bit;
for(int i = 0;i<length;++i){
mixCode = st[toUnsigned(buff[i])];
bit = (int) (mixCode & 0xff);
code = mixCode >>> 8;
bitBuffer.put(code,bit);
}
//维护头部5字节
long gap = bitBuffer.position() - 40;
byte tail = (byte) (gap & 7);
//压缩数据长度(不包含头部长度5)
int compressLength = (int) (tail == 0 ? gap >>> 3 : (gap >>> 3) + 1);
long position = bitBuffer.position();
bitBuffer.position(0);
bitBuffer.put(tail,8);
bitBuffer.put(compressLength >>> 16,16);
bitBuffer.put(compressLength,16);
//写入压缩数据
byte[] buf = bitBuffer.array();
int maxLength = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
//分段写入
for(int k = 0;k<maxLength;k+=bufferLength){
if(k + bufferLength > maxLength)
out.write(buf,k,maxLength - k);
else
out.write(buf,k,bufferLength);
}
}
} catch (IOException e){
e.printStackTrace();
} finally {
try {
out.close();
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
//压缩: 使用nio
public void nioCompress(FileInputStream in, FileOutputStream out){
long length = 0;
int capacity;
try (FileChannel channel = in.getChannel();FileChannel outChannel = out.getChannel()) {
long size = channel.size();
//大文件分段压缩
while(length < size){
capacity = (int) ((length += max) <= size ? max : size - length + max);
MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY,length-max,capacity);
//统计频率
int[] freq = new int[R];
for (int i = 0; i < capacity; ++i)
freq[toUnsigned(mappedByteBuffer.get())]++;
//创建huffman树
Node root = buildTrie(freq);
//生成编码表(是用long类型,防止编码过长)
long[] st = new long[R];
buildCodeTable(root, st, 0L, 0);
//写入huffman树
BitBuffer bitBuffer = new BitBuffer(capacity+(R << 1));
bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
writeTrie(root,bitBuffer);
//压缩数据放入字节数组
long mixCode,code;
int bit;
mappedByteBuffer.flip();
for(int i = 0;i<capacity;++i){
mixCode = st[toUnsigned(mappedByteBuffer.get())];
bit = (int) (mixCode & 0xff);
code = mixCode >>> 8;
bitBuffer.put(code,bit);
}
//维护头部5字节
long gap = bitBuffer.position() - 40;
byte tail = (byte) (gap & 7);
//压缩数据长度(不包含头部长度5)
int compressLength = tail == 0 ? (int)(gap >>> 3) : (int)((gap >>> 3) + 1);
long position = bitBuffer.position();
bitBuffer.position(0);
bitBuffer.put(tail,8);
bitBuffer.put(compressLength >>> 16,16);
bitBuffer.put(compressLength,16);
//写入压缩数据
bitBuffer.position(position);
ByteBuffer byteBuffer = bitBuffer.toByteBuffer();
outChannel.write(byteBuffer);
}
} catch(IOException e){
e.printStackTrace();
}
}
//解压: 使用bio
public void expand(InputStream in,OutputStream out){
//确定线程数与容量
int capacity = Runtime.getRuntime().availableProcessors();
LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
//数据写入线程: 负责将解压数据写入
Thread thread = new Thread(() ->{
try {
while(!blockingQueue.isEmpty()) {
Future<ByteBuffer> future = blockingQueue.poll();
ByteBuffer byteBuffer = future.get();
//写入
byte[] buf = byteBuffer.array();
int maxLength = byteBuffer.limit();
//分段写入
for(int k = 0;k<maxLength;k+=bufferLength){
if(k + bufferLength > maxLength)
out.write(buf,k,maxLength - k);
else
out.write(buf,k,bufferLength);
}
}
latch.countDown();
} catch (Exception e) { //异常不做处理
e.printStackTrace();
}
});
try {
//解压线程池: 负责解压数据
ExecutorService executor = Executors.newWorkStealingPool(capacity);
while(true) {
//大文件可能分多段压缩,确定当前段长度
byte[] head = new byte[5];
int length = in.read(head); //读取头部信息
if(length == -1 || length == 0) break;
ByteBuffer headByte = ByteBuffer.wrap(head);
int tailBit = headByte.get();//获取尾部bit位
if (length != 5) throw new RuntimeException("头部信息缺失");
length = headByte.getInt();//获取到当前段压缩字节长度
byte[] buff = new byte[length]; //读取当前压缩段数据
//读取压缩段
long len = in.read(buff);
if(length == -1 || length == 0) break;
if (len != length) throw new RuntimeException("读取压缩信息失败");
//解压并写入数据到byte数组: 并行提高效率
Future<ByteBuffer> future = executor.submit(()->{
long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
//读取huffman树
Node root = readTrie(bitBuffer);
//解压后先写入byte数组
byte[] buf = new byte[max];
int bufIndex = 0;
//特殊处理: root为叶子节点,仅一个节点
if(root.isLeaf()){
long p = bitBuffer.position();
long limit = bitBuffer.limit();
for(;p<limit;p++)
buf[bufIndex++] = (byte)root.ch;
return ByteBuffer.wrap(buf,0,bufIndex);
}
//解码
while(bitBuffer.hasRemaining()){
Node x = root;
//此处每次读取1bit位处理,效率低,多线程并行优化
while(!x.isLeaf() && bitBuffer.hasRemaining()){
boolean bit = bitBuffer.get(1) == 1;
if(bit) x = x.right;
else x = x.left;
}
buf[bufIndex++] = (byte) x.ch;
}
return ByteBuffer.wrap(buf,0,bufIndex);
});
blockingQueue.put(future); //将任务放入阻塞队列,put放入速度必定比take处理速度快
if(!thread.isAlive()) thread.start(); //启动数据写入线程
}
latch.await();
} catch (Exception e) {//异常不做处理
e.printStackTrace();
} finally {
try {
out.close();
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
//解压: 使用nio
public void nioExpand(FileInputStream in,FileOutputStream out){
//确定线程数与容量
int capacity = Runtime.getRuntime().availableProcessors();
LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
//数据写入线程: 负责将解压数据写入
Thread thread = new Thread(() ->{
try {
FileChannel outChannel = out.getChannel();
while(!blockingQueue.isEmpty()) {
Future<ByteBuffer> future = blockingQueue.poll();
ByteBuffer byteBuffer = future.get();
//写入
outChannel.write(byteBuffer);
}
latch.countDown();
} catch (Exception e) { //异常不做处理
e.printStackTrace();
}
});
try(FileChannel inChannel = in.getChannel()){
//解压线程池: 负责解压数据
ExecutorService executor = Executors.newWorkStealingPool(capacity);
while(true) {
//大文件可能分多段压缩,确定当前段长度
byte[] head = new byte[5];
ByteBuffer headByte = ByteBuffer.wrap(head);
int length = inChannel.read(headByte);
if(length == -1 || length == 0) break;
headByte.flip();
int tailBit = headByte.get();//获取尾部bit位
if (length != 5) throw new RuntimeException("头部信息缺失");
length = headByte.getInt();//获取到当前段压缩字节长度
byte[] buff = new byte[length]; //读取当前压缩段数据
ByteBuffer dataByte = ByteBuffer.wrap(buff);
long len = inChannel.read(dataByte);
if(length == -1 || length == 0) break;
if (len != length) throw new RuntimeException("读取压缩信息失败");
//解压并写入数据到byte数组: 并行提高效率
Future<ByteBuffer> future = executor.submit(()->{
long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
//读取huffman树
Node root = readTrie(bitBuffer);
//解压后先写入byte数组
byte[] buf = new byte[max];
int bufIndex = 0;
//特殊处理: root为叶子节点,仅一个节点
if(root.isLeaf()){
long p = bitBuffer.position();
long limit = bitBuffer.limit();
for(;p<limit;p++)
buf[bufIndex++] = (byte)root.ch;
return ByteBuffer.wrap(buf,0,bufIndex);
}
//解码
while(bitBuffer.hasRemaining()){
Node x = root;
//此处每次读取1bit位处理,效率低,多线程并行优化
while(!x.isLeaf() && bitBuffer.hasRemaining()){
boolean bit = bitBuffer.get(1) == 1;
if(bit) x = x.right;
else x = x.left;
}
buf[bufIndex++] = (byte)x.ch;
}
return ByteBuffer.wrap(buf,0,bufIndex);
});
blockingQueue.put(future); //将任务放入阻塞队列,put放入速度比take处理速度快
if(!thread.isAlive()) thread.start(); //启动数据写入线程
}
latch.await();
} catch(Exception e){
e.printStackTrace();
}
}
//压缩: 文件夹/文件
public void compress(File inFile,FileOutputStream out){
Queue<Map.Entry<Integer,File>> queue = new ArrayDeque<>();
Queue<File> fileQueue = new ArrayDeque<>();
queue.offer(new AbstractMap.SimpleEntry<>(-1,inFile));
//对文件名再编码: 防止同名文件名导致的问题
Integer dKey = 0;//文件夹,被3整除
Integer fKey = 1;//非空文件,余1
Integer eKey = 2;//空文件,余2
Integer key; //当前文件对应的编码
StringBuilder table = new StringBuilder();
StringBuilder paths = new StringBuilder();
while(!queue.isEmpty()){
Map.Entry<Integer,File> entity = queue.poll();
Integer parentKey = entity.getKey();
File file = entity.getValue();
if(file.isDirectory()){
key = dKey;
dKey+=3;
File[] files = file.listFiles();
for(File f : files) queue.offer(new AbstractMap.SimpleEntry<>(key,f));
}
else {
if (file.length() == 0) {//空文件
key = eKey;
eKey += 3;
} else {
key = fKey;
fKey += 3;
fileQueue.offer(file);
}
}
paths.append(parentKey);
paths.append("/");
paths.append(key);
paths.append("?");
table.append(key);
table.append(":");
table.append(file.getName());
table.append("<");
}
table.append(">");
table.append(paths);
try{
//记录文件名长度(int值)+文件名
byte[] pathsBuff = table.toString().getBytes();
int len = pathsBuff.length;
byte[] head = new byte[]{(byte)(len >>> 24),(byte)(len >>> 16),(byte)(len >>> 8),(byte)len};
out.write(head);
out.write(pathsBuff);
//依次压缩文件
while(!fileQueue.isEmpty()){
File file = fileQueue.poll();
long length = 0;
int capacity;
try(FileChannel channel = new FileInputStream(file).getChannel()){
long size = channel.size();
//大文件分段压缩
while(length < size){
capacity = (int) ((length += max) <= size ? max : size - length + max);
MappedByteBuffer mappedByteBuffer = channel.map(FileChannel.MapMode.READ_ONLY,length-max,capacity);
//统计频率
int[] freq = new int[R];
for (int i = 0; i < capacity; ++i)
freq[toUnsigned(mappedByteBuffer.get())]++;
//创建huffman树
Node root = buildTrie(freq);
//生成编码表(是用long类型,防止编码过长)
long[] st = new long[R];
buildCodeTable(root, st, 0L, 0);
//写入huffman树
BitBuffer bitBuffer = new BitBuffer(capacity+(R << 1));
bitBuffer.position(40);//第一个字节标记压缩数据末尾bit数,后面4个字节记录压缩后字节长度
writeTrie(root,bitBuffer);
//压缩数据放入字节数组
long mixCode,code;
int bit;
mappedByteBuffer.flip();
for(int i = 0;i<capacity;++i){
mixCode = st[toUnsigned(mappedByteBuffer.get())];
bit = (int) (mixCode & 0xff);
code = mixCode >>> 8;
bitBuffer.put(code,bit);
}
//维护头部5字节
long gap = bitBuffer.position() - 40;
byte tail = (byte) (gap & 7);
//压缩数据长度(不包含头部长度5)
int compressLength = tail == 0 ? (int)(gap >>> 3) : (int)((gap >>> 3) + 1);
long position = bitBuffer.position();
bitBuffer.position(0);
//若到文件末尾时标记0001
if(length >= size) {
bitBuffer.put(1,4);
bitBuffer.put(tail,4);
}else{
bitBuffer.put(tail,8);
}
bitBuffer.put(compressLength >>> 16,16);
bitBuffer.put(compressLength,16);
//写入压缩数据
byte[] buf = bitBuffer.array();
int maxLength = (int) ((position & 7) == 0 ? position >>> 3 : (position >>> 3) + 1);
//分段写入
for(int k = 0;k<maxLength;k+=bufferLength){
if(k + bufferLength > maxLength)
out.write(buf,k,maxLength - k);
else
out.write(buf,k,bufferLength);
}
}
}catch (Exception e){
e.printStackTrace();
}
}
} catch (Exception e){
e.printStackTrace();
} finally {
try {
out.close();
System.gc();//通知尽快释放MappedByteBuffer
} catch (IOException e) {
e.printStackTrace();
}
}
}
//解压: 文件夹/文件
public void expand(FileInputStream in,File outFile){
try{
byte[] headBuff = new byte[4];
int len = in.read(headBuff);
if(len != 4) throw new RuntimeException("头部信息缺失");
ByteBuffer headByte = ByteBuffer.wrap(headBuff);
//获取文件名字符串长度
int length = headByte.getInt();
byte[] pathByte = new byte[length];
//读取文件名字符串
len = in.read(pathByte);
if (len != length) throw new RuntimeException("读取压缩信息失败");
String[] arrs = new String(pathByte).split(">");
HashMap<Integer,String> hashMap = new HashMap<>(); //值: 文件名
Queue<Map.Entry<Integer,File>> dirsQueue = new ArrayDeque<>();//保存文件夹键值
//创建对照表
for(String e : arrs[0].split("<")){
String[] entity = e.split(":");
hashMap.put(Integer.parseInt(entity[0]),entity[1]);
}
//创建文件
File f;
Integer key = -1;
for(String e : arrs[1].split("\\?")){
String[] entity = e.split("/");
int parentKey = Integer.parseInt(entity[0]);
int childKey = Integer.parseInt(entity[1]);
int mod = childKey % 3;
if(parentKey == -1 && mod == 0){ //处理文件夹根目录
f = new File(outFile,hashMap.get(childKey));
f.mkdir();
dirsQueue.offer(new AbstractMap.SimpleEntry<>(childKey,f));
outFile = f;
key = childKey;
continue;
}
Map.Entry<Integer,File> entry = null;
while(key != parentKey){
entry = dirsQueue.poll();
key = entry.getKey();
}
outFile = entry == null ? outFile : entry.getValue();
//处理
f = new File(outFile,hashMap.get(childKey));
if(mod == 0){ //文件夹
f.mkdir();
dirsQueue.offer(new AbstractMap.SimpleEntry<>(childKey,f));
}
else if(mod == 1){ //文件直接解压
f.createNewFile();
expandFile(in,new FileOutputStream(f));
}
else f.createNewFile(); //空文件
}
} catch (Exception e){
e.printStackTrace();
} finally {
try {
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
public void expandFile(InputStream in,OutputStream out){
//确定线程数与容量
int capacity = Runtime.getRuntime().availableProcessors();
LinkedBlockingQueue<Future<ByteBuffer>> blockingQueue = new LinkedBlockingQueue<>(capacity);
CountDownLatch latch = new CountDownLatch(1); //闭锁: 作用等待数据写入线程结束再放行
//数据写入线程: 负责将解压数据写入
Thread thread = new Thread(() ->{
try {
while(!blockingQueue.isEmpty()) {
Future<ByteBuffer> future = blockingQueue.poll();
ByteBuffer byteBuffer = future.get();
//写入
byte[] buf = byteBuffer.array();
int maxLength = byteBuffer.limit();
//分段写入
for(int k = 0;k<maxLength;k+=bufferLength){
if(k + bufferLength > maxLength)
out.write(buf,k,maxLength - k);
else
out.write(buf,k,bufferLength);
}
}
latch.countDown();
} catch (Exception e) { //异常不做处理
e.printStackTrace();
}
});
try {
//解压线程池: 负责解压数据
ExecutorService executor = Executors.newWorkStealingPool(capacity);
while(true) {
//大文件可能分多段压缩,确定当前段长度
byte[] head = new byte[5];
int length = in.read(head); //读取头部信息
if(length == -1 || length == 0) break;
ByteBuffer headByte = ByteBuffer.wrap(head);
int bits = headByte.get();//获取尾部bit位
boolean flag = false;
if(bits > 7) {
bits &= 0xf;
flag = true;
}
int tailBit = bits;//获取尾部bit位
if (length != 5) throw new RuntimeException("头部信息缺失");
length = headByte.getInt();//获取到当前段压缩字节长度
byte[] buff = new byte[length]; //读取当前压缩段数据
//读取压缩段
long len = in.read(buff);
if(length == -1 || length == 0) break;
if (len != length) throw new RuntimeException("读取压缩信息失败");
//解压并写入数据到byte数组: 并行提高效率
Future<ByteBuffer> future = executor.submit(()->{
long bitLength = tailBit == 0 ? len << 3 : (len << 3) + tailBit - 8;
BitBuffer bitBuffer = BitBuffer.wrap(buff,bitLength);
//读取huffman树
Node root = readTrie(bitBuffer);
//解压后先写入byte数组
byte[] buf = new byte[max];
int bufIndex = 0;
//特殊处理: root为叶子节点,仅一个节点
if(root.isLeaf()){
long p = bitBuffer.position();
long limit = bitBuffer.limit();
for(;p<limit;p++)
buf[bufIndex++] = (byte)root.ch;
return ByteBuffer.wrap(buf,0,bufIndex);
}
//解码
while(bitBuffer.hasRemaining()){
Node x = root;
//此处每次读取1bit位处理,效率低,多线程并行优化
while(!x.isLeaf() && bitBuffer.hasRemaining()){
boolean bit = bitBuffer.get(1) == 1;
if(bit) x = x.right;
else x = x.left;
}
buf[bufIndex++] = (byte) x.ch;
}
return ByteBuffer.wrap(buf,0,bufIndex);
});
blockingQueue.put(future); //将任务放入阻塞队列,put放入速度必定比take处理速度快
if(!thread.isAlive()) thread.start(); //启动数据写入线程
if(flag) break;
}
latch.await();
} catch (Exception e) {//异常不做处理
e.printStackTrace();
} finally {
try {
out.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
//byte转无符号
private int toUnsigned(byte value){ return value < 0 ? value & 0xff : value; }
//根据统计的频率freq构建huffman树
private Node buildTrie(int[] freq){
PriorityQueue<Node> pq = new PriorityQueue<>();
for(char c=0;c<R;++c)
if (freq[c] > 0) pq.offer(new Node(c, freq[c], null, null));
while(pq.size() > 1){
Node left = pq.poll();
Node right = pq.poll();
Node parent = new Node('\0',left.freq+right.freq,left,right);
pq.offer(parent);
}
return pq.poll();
}
//根据huffman树构建huffman码表,st为码表,int值的前3个字节保存码值,最后1个字节保存码值长度
private void buildCodeTable(Node x,long[] st,long code,int length){
if(!x.isLeaf()){
code <<= 1;
buildCodeTable(x.left,st,code,length+1);
buildCodeTable(x.right,st,code+1,length+1);
}
else if(length == 0){ //只有一个根节点
st[x.ch] = 1<<8|1;
}
else {
if (length > 56) throw new RuntimeException("错误,编码过长超过long[] st无法保存码表");
st[x.ch] = code<<8|length; //7个字节(码值)+1个字节(码值长度)
}
}
//将huffman树存储到字节数组中,非叶子节点0,叶子节点1后面紧跟8bit位为叶子节点的码值
private void writeTrie(Node x,BitBuffer buff){
if(x.isLeaf()){
buff.put(1,1);
buff.put(x.ch,8);
return;
}
else buff.position(buff.position()+1);
writeTrie(x.left,buff);
writeTrie(x.right,buff);
}
//读取头部存储的huffman树并创建(读取到叶子节点会自动递归结束)
private Node readTrie(BitBuffer buff){
boolean isLeaf = buff.get(1) == 1;
if(isLeaf){
Node node = new Node((char)buff.get(8),-1,null,null);
return node;
}
else return new Node('\0',-1,readTrie(buff),readTrie(buff));
}
}
2.LZW算法
使用到的三向单词查找树(仅保留用到的方法)-稍作改动直接处理字节数组
/**
* 稍作改动的三向字符串查找树
* 概述: 稍作改动可以直接处理字符数组
*/
public class TST<V> {
private Node<V> root;//根节点
private class Node<V>{ //节点
private char c; //字符
private Node<V> left; //左连接 <c
private Node<V> mid; //中连接 =c
private Node<V> right;//右连接 >c
private V value; //保存键值
}
//获取: 键-buff[begin,end]组成的字符串
public V get(byte[] buff,int begin,int end){
Node x = get(root,buff,begin,end);
if(x != null) return (V)x.value;
if(x == null) System.out.println("x为null" + begin + ":" + end);
return null;
}
private Node get(Node x,byte[] buff,int begin,int end){
if(buff == null || buff.length == 0 || begin > end) return null;
if(x == null) {
System.out.println("x.为null" + begin+":"+end);
return null;
}
char c = toUnsigned(buff[begin]);
if(x.c > c) return get(x.left,buff,begin,end);
else if(x.c < c) return get(x.right,buff,begin,end);
else if(begin < end) return get(x.mid,buff,begin+1,end);
else {
if(x.value == null) System.out.println("begin:"+ begin+" end:"+end);
return x;
}
}
//插入: 键-byte数组buff[begin,end]组成的字符串,值-value
public void put(byte[] buff,int begin,int end,V value){
if(buff == null || buff.length == 0 || begin > end) return;
root = put(root,buff,begin,end,value);
}
private Node put(Node x,byte[] buff,int begin,int end,V value){
char c = toUnsigned(buff[begin]);
if(x == null){
x = new Node();
x.c = c;
}
if(x.c > c) x.left = put(x.left,buff,begin,end,value);
else if(x.c < c) x.right = put(x.right,buff,begin,end,value);
else if(begin < end) x.mid = put(x.mid,buff,begin+1,end,value);
else x.value = value;
return x;
}
//获取最长前缀并所在byte数组的索引
public int longestPrefixof(byte[] buff,int begin,int end){
if(buff == null || buff.length == 0 || begin > end) return -1;
int index = 0;
Node x = root;
while(x != null && begin <= end){
char c = toUnsigned(buff[begin]);
if(x.c > c) x = x.left;
else if(x.c < c) x = x.right;
else{
if(x.value != null) index = begin;
begin++;
x = x.mid;
}
}
return index;
}
//插入
public void put(String key, V value){
if(key == null || key.length() == 0) return;
root = put(root,key,value,0);
}
/**插入: 与获取思路相同,只是遇到空节点创建新节点,到末尾时保存键值*/
private Node put(Node x,String key,V value,int d){
char c = key.charAt(d);
if(x == null) { //没有节点就创建
x = new Node();
x.c = c;
}
if(x.c > c) x.left = put(x.left,key,value,d);
else if(x.c < c) x.right = put(x.right,key,value,d);
else if(d < key.length()-1) x.mid = put(x.mid,key,value,d+1);
else x.value = value; //末尾时保存键值
return x;
}
//byte转无符号
private char toUnsigned(byte value){ return (char)(value < 0 ? value & 0xff : value); }
}
2.1.LZW算法-12位定长bit位编码
/**
* LZW算法
* 概述: gif格式用到此算法思想,适合有大量重复值的数据,如果重复数据很少,反而会有反效果
* 实现: 1.使用定长的12bit位码表,码表满时,停止扩展,不适合大文件
* 2.JDK1.7+之后substring方法性能下降,在此直接处理字节数组,不再像算法4中转为字符串处理
* 3.因为要处理字节数组,因此对三向字符串查找树稍作更改,直接处理字符
*/
public class LZW {
private int buffLength = 8192;
private int R = 256;
private int W = 12;
private int L = 1 << W;
/** 压缩 **/
public void compress(InputStream in, OutputStream out){
try {
byte[] buff = new byte[in.available()];
int length = in.read(buff);
//基础表
TST<Integer> st = new TST<>(); //稍作更改的三向单词查找树(直接处理字节数组)
for(int i=0;i<R;++i)
st.put(""+(char)i,i);
int code = R+1;
BitBuffer bitBuffer = new BitBuffer(length << 1);
//压缩
for(int i=0;i<length;){
//获取最长前缀索引
int index = st.longestPrefixof(buff,i,length-1);
//压缩后写入缓冲区
Integer value = st.get(buff,i,index);
bitBuffer.put(value,W);
//扩展表
if(index < length-1 && code < L) //W位表满之后就不再扩展
st.put(buff,i,index+1,code++);
i = index + 1;
}
long position = bitBuffer.position();
int len = (int)((position & 7 ) == 0 ? position >>> 3 : (position >>> 3) + 1);
byte[] buf = bitBuffer.array();
//分段写入
for(int k = 0;k<len;k+=buffLength){
if(k + buffLength > len)
out.write(buf,k,len - k);
else
out.write(buf,k,buffLength);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
out.close();
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/** 解压 **/
public void expand(InputStream in,OutputStream out){
try {
byte[] buff = new byte[in.available()];
int length = in.read(buff);
String[] st = new String[L]; //确定码表长度,直接使用定长数组存储解码表
int i;
//基础表
for(i=0;i<R;++i)
st[i] = ""+(char)i;
st[i++] = "";
BitBuffer bitBuffer = BitBuffer.wrap(buff,length << 3);
if(bitBuffer.remaining() < W) return; //达到末尾结束
//先取出第一个编码字符
int codeword = (int) bitBuffer.get(W);
String val = st[codeword]; //解码第一个字符
StringBuilder builder = new StringBuilder(); //存储解码后的字符串
while(true){
builder.append(val);
if(bitBuffer.remaining() < W) break; //达到末尾结束
codeword = (int) bitBuffer.get(W); //读取一个编码字符
String s = st[codeword];
//扩展表
if(i == codeword) s = val+ val.charAt(0); //特殊情况,当前解码字符刚好是下一个要加入码表的字符
if(i < L) st[i++] = val + s.charAt(0);
val = s;
if(builder.length() > buffLength){ //分次写入
out.write(builder.toString().getBytes("ISO8859-1"));
builder.delete(0,builder.length());
}
}
out.write(builder.toString().getBytes("ISO8859-1"));
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
out.close();
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
2.2.LZW算法-从9bit位开始变长编码
/**
* LZW算法
* 概述: gif格式用到此算法思想,适合有大量重复值的数据,如果重复数据很少,反而会有反效果
* 实现: 1.在LZW算法基础上使用变长的编码,从9bit位开始,用完时扩展,并使用最后一位L-1作为变更标记
* 2.JDK1.7+之后substring方法性能下降,在此直接处理字节数组,不再像算法4中转为字符串处理
* 3.压缩性能有待改进
*/
public class LZW {
private int buffLength = 8192; //缓冲大小8M
private int R = 256; //基本码表
private int W = 9; //从9bit位开始扩展码表
private int maxW = 24; //最大扩展bit位
private int L = 1 << W; //码表大小
/** 压缩 **/
public void compress(InputStream in, OutputStream out){
try {
byte[] buff = new byte[in.available()];
int length = in.read(buff);
//基础表
TST<Integer> st = new TST<>(); //稍作更改的三向单词查找树(直接处理字节数组)
for(int i=0;i<R;++i)
st.put(""+(char)i,i);
int code = R+1;
BitBuffer bitBuffer = new BitBuffer(length << 1);
//压缩
for(int i=0;i<length;){
//获取最长前缀索引
int index = st.longestPrefixof(buff,i,length-1);
//压缩后写入缓冲区
Integer value = st.get(buff,i,index);
bitBuffer.put(value,W);
//扩展表
if(index < length-1 && code < L-1) //W位表没有满就扩展码表
st.put(buff,i,index+1,code++);
else if(code == L-1){ //L-1标记当前M bit位码表编满,扩展码表为M+1 bit位
bitBuffer.put(L-1,W);
if(W == maxW){ //达到最大就重新创建码表
st = new TST<>();
for(int k=0;k<R;++k)
st.put(""+(char)k,k);
code = R+1;
W = 9;
}
else { //没有达到最大就增加bit位扩展码表
code++;
if (index < length - 1)
st.put(buff, i, index + 1, code++);
W++;
}
L = 1 << W;
}
i = index + 1;
}
long position = bitBuffer.position();
int len = (int)((position & 7 ) == 0 ? position >>> 3 : (position >>> 3) + 1);
byte[] buf = bitBuffer.array();
//分段写入
for(int k = 0;k<len;k+=buffLength){
if(k + buffLength > len)
out.write(buf,k,len - k);
else
out.write(buf,k,buffLength);
}
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
out.close();
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
/** 解压 **/
public void expand(InputStream in,OutputStream out){
try {
byte[] buff = new byte[in.available()];
long length = in.read(buff);
//不确定码表总大小,使用hashMap
HashMap<Integer,String> st = new HashMap<>(L);
int i;
//基础表
for(i=0;i<R;++i)
st.put(i,""+(char)i);
st.put(i++,"");
BitBuffer bitBuffer = BitBuffer.wrap(buff,length << 3);
if(bitBuffer.remaining() < W) return; //达到末尾结束
//先取出第一个编码字符
int codeword = (int) bitBuffer.get(W);
String val = st.get(codeword); //解码第一个字符
StringBuilder builder = new StringBuilder(); //存储解码后的字符串
while(true){
builder.append(val); //解码后存入StringBuilder
if(bitBuffer.remaining() < W) break; //达到末尾结束
codeword = (int) bitBuffer.get(W); //读取一个编码字符
if(codeword == L-1){ //达到最大码表,增加一bit位或者重置
if(W == maxW){ //达到最大bit位重置码表
W = 9;
L = 1 << W;
st = new HashMap<>(L);
//基础表
for(i=0;i<R;++i)
st.put(i,""+(char)i);
st.put(i++,"");
if(bitBuffer.remaining() < W) return; //结束
codeword = (int) bitBuffer.get(W);
val = st.get(codeword);
continue;
}
else {
W++;
L = 1 << W;
if(bitBuffer.remaining() < W) return; //结束
codeword = (int) bitBuffer.get(W);
i++; //L-1作为标记,不再使用跳过
}
}
String s = st.get(codeword);
//扩展表
if(i == codeword) s = val+ val.charAt(0); //特殊情况,当前解码字符刚好是下一个要加入码表的字符
if(i < L) st.put(i++,val+s.charAt(0));
val = s; //更新
if(builder.length() > buffLength){ //分次写入
out.write(builder.toString().getBytes("ISO8859-1"));
builder.delete(0,builder.length());
}
}
//写入最后部分
out.write(builder.toString().getBytes("ISO8859-1"));
} catch (IOException e) {
e.printStackTrace();
} finally {
try {
out.close();
in.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}