Article 9008 of comp.lang.perl:
Xref: feenix.metronet.com comp.lang.perl:9008
Newsgroups: comp.lang.perl
Path: feenix.metronet.com!news.utdallas.edu!corpgate!bnrgate!nott!torn!howland.reston.ans.net!pipex!sunic!trane.uninett.no!news.eunet.no!nuug!news.eunet.fi!news.spb.su!KremlSun!kiae!relcom!relay1!river!csoft!news-server
From:  "Dmitry S. Kohmanyuk" <dk@farm.cs.kiev.ua>
Subject: a data compression toolkit in Perl
Organization: Animals Paradise Farm
Message-ID: <AIQ9Z4jK3B@farm.cs.kiev.ua>
Reply-To: dk@farm.cs.kiev.ua
Lines: 308
Sender: news-server@river.cs.kiev.ua
Date: Sat, 18 Dec 93 01:27:38 +0300 


my apologies to everyone - just not sure does it make it the first time...


hey, folks,

since I haven't finished my reversed regexp matching code anyway,
here is another little small (and may be useful) thingo...

history:
some time ago I talked to someone explaining Perl's advantages.
I stressed the fact that Perl allows you to handle binary data easily.
"So you can write, say, a compressor in Perl? " - "Yeah."

just to keep that promise... (and it was a real breeze to write'n'debug -
- just couple of days while simultaneously hacking BSDI's
cyrillization and writing DNS management tool (in Perl, of course ;)

the usage in pretty simple. just two subroutines. see the enclosed example.

feel free to send your benchmarks on something more powerful than [34]86 PCs ;)


#       This is a shell archive.
#       Remove everything above and including the cut line.
#       Then run the rest of the file through sh.
#----cut here-----cut here-----cut here-----cut here----#
#!/bin/sh
# shar:    Shell Archiver
#       Run the following text with /bin/sh to create:
#       readme
#       lzpack.pl
#       lztest
# This archive created: Sat Dec 11 02:05:22 1993
echo shar: extracting readme
sed 's/^XX//' << \SHAR_EOF > readme
XX
XXPerlLZ - Lempel-Ziv compressor in Perl
XX--------------------------------------
XX
XXWritten by: Dmitry Kohmanyuk <dk@farm.cs.kiev.ua>
XX
XXVersion: 1.0
XX
XX
XXUsage:
XX
XXrequire 'lzpack.pl';
XX
XX$packed_string = &lz_pack($string);
XX$original_string = &lz_unpack($packed_string);
XX
XX
XXNotes:
XX
XXData format is architecture-independent.
XX
XXthe algorithm is _somewhat_ similar to LZRW.
XX
XXhere's the differences:
XX
XX- no sliding dictinary. encoding is done with 4K sized blocks.
XX
XX- format is different: one-byte sig, two-byte len, then data for the block.
XX
XX- hashing is totally different:
XX
XX      - another hash function (Perl builtin one, of course ;)
XX      - hashing is 'perfect' (we match exactly three chars).
XX          this means that we start compare from 4th char,
XX          and there is no need to Ross's 'hash partitions'
XX      - we hash all possible 3-char substrings,
XX          not just tails of copy strings and literals as LZRW does
XX      - hash chains can be arbitrary length; search for 'TUNE IT'.
XX        - hash chains are sorted by recency.
XX
XX- we optimize search by stopping when match of maximum length is found.
XX   seems to be a big win on very redundant files.
XX
XX- this is Perl, not C. Should I say more?
XX
XX
SHAR_EOF
if test 1158 -ne "`wc -c readme`"
then
echo shar: error transmitting readme '(should have been 1158 characters)'
fi
echo shar: extracting lzpack.pl
sed 's/^XX//' << \SHAR_EOF > lzpack.pl
XX
XX#
XX# lz_(un)?pack routines by Dmitry Kohmanyuk <dk@farm.cs.kiev.ua>
XX#
XX# Version: 1.0
XX#
XX# Notes:
XX#
XX#     the compressed data is in machine-independent format.
XX#
XX#     the algorithm implemented is LZ77 family, with some
XX#     Perl-specific optimizations (mostly in %hash).
XX#     Buffer size is 4K.  No sliding dictionary is involved.
XX#
XX#     you can control speed/space efficiency somewhat by
XX#     changing max hash chain length (search for 'TUNE IT').
XX#
XX#     compression ratio is between compress(1) and gzip(1L).
XX#
XX#     speed is not very warp, but this is Perl, not C.
XX#     OTOH, the source code is much shorter ;)
XX#
XX
XX
XX## $packed_text = &lz_pack($text);
XXsub   lz_pack
XX{
XX      local($buf) = @_;
XX      local($buf_pos);
XX      local($result) = '';
XX
XX      local($in, $in_len, $in_pos);
XX      local($out, $out_bits, $out_bytes);
XX      local(%hash);   # I vote for it
XX      local($text, $len, $pos);
XX
XX      vec(0, 7, 42);          # deep magic.
XX
XX      for ($buf_pos = 0; $buf_pos < length($buf); $buf_pos += $in_len) {
XX
XX          $in = substr($buf, $buf_pos, 4096);
XX          $in_len = length($in);
XX          $out = $out_bits = $out_bytes = '';
XX          %hash = ();
XX
XX          #DBG# $out_len = $match_cnt = $lit_cnt = $trunc_cnt = 0;
XX
XX          for ($in_pos = 0; $in_pos < $in_len; ) {
XX              $len = -2;      # (1 char - 3 min_match_len)
XX
XX              # find best match, if any
XX              $text = substr($in, $in_pos + 3, 15);
XX              substr($hash{substr($in, $in_pos, 3)}, 32) = ''; # TUNE IT
XX              foreach (unpack('S*', $hash{substr($in, $in_pos, 3)})) {
XX                  ($text ^ substr($in, $_, 15)) =~ /^\0*/;
XX                  #print "match: ", length($&), "@$_, len=$len\n";
XX                  next unless $len < length($&);
XX
XX                  $len = length($&);
XX                  $pos = $_;
XX                  last if $len == 15;
XX              }
XX
XX              if ($len >= 0) {
XX                  $out_bits .= '1';
XX                  $out_bytes .= pack('n', (($in_pos + 3 - $pos) << 4) | $len);
XX                  #DBG# $out_len += 17;
XX                  #DBG# $match_cnt++;
XX              } else {
XX                  # $len == -2
XX                  $out_bits .= '0';
XX                  $out_bytes .= substr($in, $in_pos, 1);
XX                  #DBG# $out_len += 9;
XX                  #DBG# $lit_cnt++;
XX              }
XX
XX              if (length($out_bits) >= 16) {
XX                  $out .= pack('B*', $out_bits);
XX                  $out .= $out_bytes;
XX                  $out_bits = $out_bytes = '';
XX              }
XX
XX              # update hash table
XX              for ($len += 3; $len--; $in_pos++) {
XX                  substr($hash{substr($in, $in_pos, 3)}, 0, 0)
XX                      = pack('S', $in_pos + 3);       # I want .0=
XX              }
XX
XX              # go next loop.
XX          }
XX
XX          if (length($out_bits)) {
XX              $out_bits .= '0' x (16 - length($out_bits));
XX              $out .= pack('B*', $out_bits);
XX              $out .= $out_bytes;
XX              $out_bits = $out_bytes = '';
XX          }
XX
XX            #DBG# print LOG "out_len=", $out_len/8, "\n";
XX            #DBG# print LOG "lit_cnt=$lit_cnt, match_cnt=$match_cnt, trunc_cnt=$trunc_cnt\n";
XX            #DBG# printf LOG "avg copy len=%.2f, avg match len=%.2f\n",
XX          #DBG#            ($in_len - $lit_cnt) / $match_cnt, $in_len / ($lit_cnt + $match_cnt);
XX          $result .= (length($out) < $in_len) ?
XX                        "\xD1" . pack('n', length($out)) . $out
XX                        : "\xD0" . pack('n', $in_len) . $in;
XX      }
XX
XX      $result;
XX}
XX
XX
XX## $original_text = &lz_unpack($packed_text);
XXsub   lz_unpack
XX{
XX      local($buf) = @_;
XX      local($buf_pos);
XX      local($result) = '';
XX
XX      local($sig, $block_len);
XX      local($in, $in_pos);
XX      local(@flags);
XX      local($pos, $len);
XX
XX      for ($buf_pos = 0; $buf_pos < length($buf); $buf_pos += $block_len) {
XX          ($sig, $block_len) = unpack('Cn', substr($buf, $buf_pos, 3));
XX          $buf_pos += 3;
XX          $in = substr($buf, $buf_pos, $block_len);
XX
XX          if ($sig == 0xD0) {
XX              $result .= $in;
XX              next;
XX          } elsif ($sig != 0xD1) {
XX              # bad block magic!
XX              return undef;
XX          }
XX
XX          # do packed block
XX          for ($in_pos = 0; $in_pos < $block_len; ) {
XX              $in_pos += 2;
XX              foreach (split(//,
XX                      unpack('B16', substr($in, $in_pos - 2, 2)))) {
XX                  if ($_) {
XX                      # copy string
XX                      ($pos) = unpack('n', substr($in, $in_pos, 2));
XX                      $in_pos += 2;
XX                      $len = ($pos & 0x0F) + 3;
XX                      $pos >>= 4;
XX                      while ($len > $pos) {
XX                          # special case: overlap in copy
XX                          $result .= substr($result, -$pos, $pos);
XX                          $len -= $pos;
XX                      }
XX                      $result .= substr($result, -$pos, $len);
XX                  } else {
XX                      # literal
XX                      $result .= substr($in, $in_pos, 1);
XX                      $in_pos++;
XX                  }
XX              }
XX          }
XX
XX          # go do next block
XX      }
XX
XX      $result;
XX}
XX
XX
XX'require ok';
SHAR_EOF
if test 4115 -ne "`wc -c lzpack.pl`"
then
echo shar: error transmitting lzpack.pl '(should have been 4115 characters)'
fi
echo shar: extracting lztest
sed 's/^XX//' << \SHAR_EOF > lztest
XX#!/usr/local/bin/perl
XX
XXrequire 'lzpack.pl';
XX
XX
XXbinmode(STDIN);
XXbinmode(STDOUT);
XXwhile (read(STDIN, $_, 16*1024)) {
XX      &test($_);
XX}
XX&test('abcde' x 1024);
XX
XX
XXsub   test
XX{
XX      $data = shift;
XX      $packed = &lz_pack($data);
XX      #print STDOUT $packed, "\n-*-\n";
XX      $unpacked = &lz_unpack($packed);
XX      #print STDOUT $unpacked, "\n-*-\n";
XX      printf STDERR "test: %s, compression ratio: %.2f\n",
XX              $unpacked eq $data? "ok" : "error",
XX              length($packed) / length($data);
XX}
XX
SHAR_EOF
if test 446 -ne "`wc -c lztest`"
then
echo shar: error transmitting lztest '(should have been 446 characters)'
fi
#       End of shell archive
exit 0

--
                           ;; Geometry of crowbar in crystal backspaces ...


--
                           ;; Geometry of crowbar in crystal backspaces ...