Article 9008 of comp.lang.perl: Xref: feenix.metronet.com comp.lang.perl:9008 Newsgroups: comp.lang.perl Path: feenix.metronet.com!news.utdallas.edu!corpgate!bnrgate!nott!torn!howland.reston.ans.net!pipex!sunic!trane.uninett.no!news.eunet.no!nuug!news.eunet.fi!news.spb.su!KremlSun!kiae!relcom!relay1!river!csoft!news-server From: "Dmitry S. Kohmanyuk" Subject: a data compression toolkit in Perl Organization: Animals Paradise Farm Message-ID: Reply-To: dk@farm.cs.kiev.ua Lines: 308 Sender: news-server@river.cs.kiev.ua Date: Sat, 18 Dec 93 01:27:38 +0300 my apologies to everyone - just not sure does it make it the first time... hey, folks, since I haven't finished my reversed regexp matching code anyway, here is another little small (and may be useful) thingo... history: some time ago I talked to someone explaining Perl's advantages. I stressed the fact that Perl allows you to handle binary data easily. "So you can write, say, a compressor in Perl? " - "Yeah." just to keep that promise... (and it was a real breeze to write'n'debug - - just couple of days while simultaneously hacking BSDI's cyrillization and writing DNS management tool (in Perl, of course ;) the usage in pretty simple. just two subroutines. see the enclosed example. feel free to send your benchmarks on something more powerful than [34]86 PCs ;) # This is a shell archive. # Remove everything above and including the cut line. # Then run the rest of the file through sh. #----cut here-----cut here-----cut here-----cut here----# #!/bin/sh # shar: Shell Archiver # Run the following text with /bin/sh to create: # readme # lzpack.pl # lztest # This archive created: Sat Dec 11 02:05:22 1993 echo shar: extracting readme sed 's/^XX//' << \SHAR_EOF > readme XX XXPerlLZ - Lempel-Ziv compressor in Perl XX-------------------------------------- XX XXWritten by: Dmitry Kohmanyuk XX XXVersion: 1.0 XX XX XXUsage: XX XXrequire 'lzpack.pl'; XX XX$packed_string = &lz_pack($string); XX$original_string = &lz_unpack($packed_string); XX XX XXNotes: XX XXData format is architecture-independent. XX XXthe algorithm is _somewhat_ similar to LZRW. XX XXhere's the differences: XX XX- no sliding dictinary. encoding is done with 4K sized blocks. XX XX- format is different: one-byte sig, two-byte len, then data for the block. XX XX- hashing is totally different: XX XX - another hash function (Perl builtin one, of course ;) XX - hashing is 'perfect' (we match exactly three chars). XX this means that we start compare from 4th char, XX and there is no need to Ross's 'hash partitions' XX - we hash all possible 3-char substrings, XX not just tails of copy strings and literals as LZRW does XX - hash chains can be arbitrary length; search for 'TUNE IT'. XX - hash chains are sorted by recency. XX XX- we optimize search by stopping when match of maximum length is found. XX seems to be a big win on very redundant files. XX XX- this is Perl, not C. Should I say more? XX XX SHAR_EOF if test 1158 -ne "`wc -c readme`" then echo shar: error transmitting readme '(should have been 1158 characters)' fi echo shar: extracting lzpack.pl sed 's/^XX//' << \SHAR_EOF > lzpack.pl XX XX# XX# lz_(un)?pack routines by Dmitry Kohmanyuk XX# XX# Version: 1.0 XX# XX# Notes: XX# XX# the compressed data is in machine-independent format. XX# XX# the algorithm implemented is LZ77 family, with some XX# Perl-specific optimizations (mostly in %hash). XX# Buffer size is 4K. No sliding dictionary is involved. XX# XX# you can control speed/space efficiency somewhat by XX# changing max hash chain length (search for 'TUNE IT'). XX# XX# compression ratio is between compress(1) and gzip(1L). XX# XX# speed is not very warp, but this is Perl, not C. XX# OTOH, the source code is much shorter ;) XX# XX XX XX## $packed_text = &lz_pack($text); XXsub lz_pack XX{ XX local($buf) = @_; XX local($buf_pos); XX local($result) = ''; XX XX local($in, $in_len, $in_pos); XX local($out, $out_bits, $out_bytes); XX local(%hash); # I vote for it XX local($text, $len, $pos); XX XX vec(0, 7, 42); # deep magic. XX XX for ($buf_pos = 0; $buf_pos < length($buf); $buf_pos += $in_len) { XX XX $in = substr($buf, $buf_pos, 4096); XX $in_len = length($in); XX $out = $out_bits = $out_bytes = ''; XX %hash = (); XX XX #DBG# $out_len = $match_cnt = $lit_cnt = $trunc_cnt = 0; XX XX for ($in_pos = 0; $in_pos < $in_len; ) { XX $len = -2; # (1 char - 3 min_match_len) XX XX # find best match, if any XX $text = substr($in, $in_pos + 3, 15); XX substr($hash{substr($in, $in_pos, 3)}, 32) = ''; # TUNE IT XX foreach (unpack('S*', $hash{substr($in, $in_pos, 3)})) { XX ($text ^ substr($in, $_, 15)) =~ /^\0*/; XX #print "match: ", length($&), "@$_, len=$len\n"; XX next unless $len < length($&); XX XX $len = length($&); XX $pos = $_; XX last if $len == 15; XX } XX XX if ($len >= 0) { XX $out_bits .= '1'; XX $out_bytes .= pack('n', (($in_pos + 3 - $pos) << 4) | $len); XX #DBG# $out_len += 17; XX #DBG# $match_cnt++; XX } else { XX # $len == -2 XX $out_bits .= '0'; XX $out_bytes .= substr($in, $in_pos, 1); XX #DBG# $out_len += 9; XX #DBG# $lit_cnt++; XX } XX XX if (length($out_bits) >= 16) { XX $out .= pack('B*', $out_bits); XX $out .= $out_bytes; XX $out_bits = $out_bytes = ''; XX } XX XX # update hash table XX for ($len += 3; $len--; $in_pos++) { XX substr($hash{substr($in, $in_pos, 3)}, 0, 0) XX = pack('S', $in_pos + 3); # I want .0= XX } XX XX # go next loop. XX } XX XX if (length($out_bits)) { XX $out_bits .= '0' x (16 - length($out_bits)); XX $out .= pack('B*', $out_bits); XX $out .= $out_bytes; XX $out_bits = $out_bytes = ''; XX } XX XX #DBG# print LOG "out_len=", $out_len/8, "\n"; XX #DBG# print LOG "lit_cnt=$lit_cnt, match_cnt=$match_cnt, trunc_cnt=$trunc_cnt\n"; XX #DBG# printf LOG "avg copy len=%.2f, avg match len=%.2f\n", XX #DBG# ($in_len - $lit_cnt) / $match_cnt, $in_len / ($lit_cnt + $match_cnt); XX $result .= (length($out) < $in_len) ? XX "\xD1" . pack('n', length($out)) . $out XX : "\xD0" . pack('n', $in_len) . $in; XX } XX XX $result; XX} XX XX XX## $original_text = &lz_unpack($packed_text); XXsub lz_unpack XX{ XX local($buf) = @_; XX local($buf_pos); XX local($result) = ''; XX XX local($sig, $block_len); XX local($in, $in_pos); XX local(@flags); XX local($pos, $len); XX XX for ($buf_pos = 0; $buf_pos < length($buf); $buf_pos += $block_len) { XX ($sig, $block_len) = unpack('Cn', substr($buf, $buf_pos, 3)); XX $buf_pos += 3; XX $in = substr($buf, $buf_pos, $block_len); XX XX if ($sig == 0xD0) { XX $result .= $in; XX next; XX } elsif ($sig != 0xD1) { XX # bad block magic! XX return undef; XX } XX XX # do packed block XX for ($in_pos = 0; $in_pos < $block_len; ) { XX $in_pos += 2; XX foreach (split(//, XX unpack('B16', substr($in, $in_pos - 2, 2)))) { XX if ($_) { XX # copy string XX ($pos) = unpack('n', substr($in, $in_pos, 2)); XX $in_pos += 2; XX $len = ($pos & 0x0F) + 3; XX $pos >>= 4; XX while ($len > $pos) { XX # special case: overlap in copy XX $result .= substr($result, -$pos, $pos); XX $len -= $pos; XX } XX $result .= substr($result, -$pos, $len); XX } else { XX # literal XX $result .= substr($in, $in_pos, 1); XX $in_pos++; XX } XX } XX } XX XX # go do next block XX } XX XX $result; XX} XX XX XX'require ok'; SHAR_EOF if test 4115 -ne "`wc -c lzpack.pl`" then echo shar: error transmitting lzpack.pl '(should have been 4115 characters)' fi echo shar: extracting lztest sed 's/^XX//' << \SHAR_EOF > lztest XX#!/usr/local/bin/perl XX XXrequire 'lzpack.pl'; XX XX XXbinmode(STDIN); XXbinmode(STDOUT); XXwhile (read(STDIN, $_, 16*1024)) { XX &test($_); XX} XX&test('abcde' x 1024); XX XX XXsub test XX{ XX $data = shift; XX $packed = &lz_pack($data); XX #print STDOUT $packed, "\n-*-\n"; XX $unpacked = &lz_unpack($packed); XX #print STDOUT $unpacked, "\n-*-\n"; XX printf STDERR "test: %s, compression ratio: %.2f\n", XX $unpacked eq $data? "ok" : "error", XX length($packed) / length($data); XX} XX SHAR_EOF if test 446 -ne "`wc -c lztest`" then echo shar: error transmitting lztest '(should have been 446 characters)' fi # End of shell archive exit 0 -- ;; Geometry of crowbar in crystal backspaces ... -- ;; Geometry of crowbar in crystal backspaces ...