[Xapian-discuss] Compressed Btrees (bugged?)

Olly Betts olly at survex.com
Wed Dec 15 15:56:10 GMT 2004


On Wed, Dec 15, 2004 at 03:36:39PM +0000, Olly Betts wrote:
> I'll work up a revised patch shortly.

Attached.

Cheers,
    Olly
-------------- next part --------------
Index: Makefile.am
===================================================================
RCS file: /usr/data/cvs/xapian/xapian-core/Makefile.am,v
retrieving revision 1.138
diff -p -u -r1.138 Makefile.am
--- Makefile.am	2 Nov 2004 04:46:45 -0000	1.138
+++ Makefile.am	15 Dec 2004 15:02:20 -0000
@@ -41,5 +41,6 @@ lib_LTLIBRARIES = libxapian.la
 # doesn't really work as we'd hope as we get to play with the mangled symbols
 #libxapian_la_LDFLAGS = -export-symbols-regex 'Xapian'
 
 libxapian_la_LIBADD = common/libcommon.la backends/libbackend.la \
- matcher/libmatcher.la languages/liblanguages.la api/libapi.la $(LIBNET_LA)
+ matcher/libmatcher.la languages/liblanguages.la api/libapi.la $(LIBNET_LA) \
+ -lz
Index: backends/quartz/btree.cc
===================================================================
RCS file: /usr/data/cvs/xapian/xapian-core/backends/quartz/btree.cc,v
retrieving revision 1.181
diff -p -u -r1.181 btree.cc
--- backends/quartz/btree.cc	13 Dec 2004 01:47:49 -0000	1.181
+++ backends/quartz/btree.cc	15 Dec 2004 15:46:34 -0000
@@ -77,6 +77,13 @@ PWRITE_PROTOTYPE
 # include <io.h> // for _commit()
 #endif
 
+#ifdef COMPRESSED_TAGS
+#include <zlib.h>
+// Only try to compress tags longer than this many bytes.
+const size_t COMPRESS_MIN = 4;
+#define DONT_COMPRESS -1
+#endif /* COMPRESSED_TAGS */
+
 // Only useful for platforms like Windows which distinguish between text and
 // binary files.
 #ifndef __WIN32__
@@ -1089,6 +1096,62 @@ Btree::add(const string &key, string tag
 
     form_key(key);
 
+#ifdef COMPRESSED_TAGS
+    bool compressed = false;
+    if (compress_strategy != DONT_COMPRESS && tag.size() > COMPRESS_MIN) {
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Bytef *)(tag.data());
+	stream.avail_in = (uInt)tag.size();
+
+	// If compressed size is >= tag.size(), we don't want to compress.
+	unsigned long blk_len = tag.size() - 1;
+	unsigned char * blk = new unsigned char[blk_len];
+	stream.next_out = blk;
+	stream.avail_out = (uInt)blk_len;
+
+	stream.zalloc = reinterpret_cast<alloc_func>(0);
+	stream.zfree = reinterpret_cast<free_func>(0);
+	stream.opaque = (voidpf)0;
+
+	//cout << "deflateInit about to be called\n" << endl;
+	// -15 means raw deflate with 32K LZ77 window (largest)
+	// memLevel 9 is the highest (8 is default)
+	err = deflateInit2(&stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, 9,
+			   compress_strategy);
+	//cout << "deflateInit returns " << Z_OK << endl;
+	if (err != Z_OK) abort();
+	if (!compress_dict.empty()) {
+	    err = deflateSetDictionary(&stream,
+		    reinterpret_cast<const Bytef *>(compress_dict.data()),
+		    compress_dict.size());
+	    //cout << "deflateSetDictionary returns " << Z_OK << endl;
+	    if (err != Z_OK) abort();
+	}
+
+	err = deflate(&stream, Z_FINISH);
+	if (err == Z_STREAM_END) {
+	    // If deflate succeeded, then the output was at least one byte
+	    // smaller than the input.
+	    //cout << "deflate compressed "<<tag.size()<<"["<<tag<<"] to ";
+	    tag.assign(reinterpret_cast<const char *>(blk), stream.total_out);
+	    //cout << tag.size()<<"["<<tag<<"]"<<endl;
+	    compressed = true;
+	    err = deflateEnd(&stream);
+	    if (err != Z_OK) {
+		cout << "deflateEnd failed, err = " << err << endl;
+		abort();
+	    }
+	} else {
+	    // Deflate failed - presumably the data wasn't compressible.
+	    (void)deflateEnd(&stream);
+	}
+
+	delete [] blk;
+    }
+#endif /* COMPRESSED_TAGS */
+
     // sort of matching kt.append_chunk(), but setting the chunk
     const size_t cd = kt.key().length() + K1 + I2 + C2 + C2;  // offset to the tag data
     const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
@@ -1127,7 +1190,11 @@ Btree::add(const string &key, string tag
 	size_t l = (i == m ? residue : (i == 1 ? first_L : L));
 	Assert(cd + l <= block_size);
 	Assert(string::size_type(o + l) <= tag.length());
+#ifndef COMPRESSED_TAGS
 	kt.set_tag(cd, tag.data() + o, l);
+#else /* COMPRESSED_TAGS */
+	kt.set_tag(cd, tag.data() + o, l, compressed);
+#endif /* COMPRESSED_TAGS */
 	kt.set_component_of(i);
 	
 	o += l;
@@ -1221,6 +1288,9 @@ Btree::read_tag(Cursor * C_, string *tag
     if (n > 1) tag->reserve(max_item_size * n);
 
     item.append_chunk(tag);
+#ifdef COMPRESSED_TAGS
+    bool compressed = item.get_compressed();
+#endif /* COMPRESSED_TAGS */
 
     for (int i = 2; i <= n; i++) {
 	next(C_, 0);
@@ -1228,6 +1298,103 @@ Btree::read_tag(Cursor * C_, string *tag
     }
     // At this point the cursor is on the last item - calling next will move
     // it to the next key (Bcursor::get_tag() relies on this).
+#ifdef COMPRESSED_TAGS
+    if (!compressed) return;
+
+    // FIXME: Perhaps we should we decompress each chunk as we read it so we
+    // don't need both the full compressed and uncompressed tags in memory
+    // at once.  But at least this makes for a cleaner patch while this is
+    // still somewhat experimental.
+    z_stream stream;
+    int err;
+
+    string utag;
+    // May not be enough for a compressed tag, but it's a reasonable guess.
+    utag.reserve(tag->size() + tag->size() / 2);
+
+    Bytef buf[8192];
+
+    //cout << "inflating" << endl;
+    stream.next_out = buf;
+    stream.avail_out = (uInt)sizeof(buf);
+
+    stream.zalloc = reinterpret_cast<alloc_func>(0);
+    stream.zfree = reinterpret_cast<free_func>(0);
+
+    Bytef header[6] = { 0x78, 0x20, 0, 0, 0, 0};
+    if (!compress_dict.empty()) {
+	stream.next_in = header;
+	stream.avail_in = 6;
+	uLong adler = adler32(0L, Z_NULL, 0);
+	stream.adler = adler;
+	adler = adler32(adler,
+			reinterpret_cast<const Bytef *>(compress_dict.data()),
+			compress_dict.size());
+	set_int4(header, 2, adler);
+	// cout << "inflateInit" << endl;
+	err = inflateInit(&stream);
+	if (err != Z_OK) {
+	    cout << "inflateInit err = " << err << endl;
+	    abort();
+	}
+	err = inflate(&stream, Z_SYNC_FLUSH);
+	if (err != Z_NEED_DICT) {
+	    cout << "don't need dict?!" << endl;
+	    abort();
+	}
+	err = inflateSetDictionary(&stream,
+		reinterpret_cast<const Bytef *>(compress_dict.data()),
+		compress_dict.size());
+	if (err != Z_OK) {
+	    cout << "iSD failed with err = " << err << endl;
+	}
+    } else {
+	stream.next_in = Z_NULL;
+	stream.avail_in = 0;
+	// cout << "inflateInit2" << endl;
+	err = inflateInit2(&stream, -15);
+	if (err != Z_OK) {
+	    cout << "inflateInit2 err = " << err << endl;
+	    abort();
+	}
+    }
+
+    stream.next_in = (Bytef*)tag->data();
+    stream.avail_in = (uInt)tag->size();
+
+    while (err != Z_STREAM_END) {
+	stream.next_out = buf;
+	stream.avail_out = (uInt)sizeof(buf);
+	err = inflate(&stream, Z_SYNC_FLUSH);
+	if (err == Z_BUF_ERROR && stream.avail_in == 0) {
+	    cout << "Z_BUF_ERROR - faking checksum of " << stream.adler << endl;
+	    Bytef header[4];
+	    set_int4(header, 0, stream.adler);
+	    stream.next_in = header;
+	    stream.avail_in = 4;
+	    err = inflate(&stream, Z_SYNC_FLUSH);
+	    if (err == Z_STREAM_END) break;
+	}
+
+	if (err != Z_OK && err != Z_STREAM_END) {
+	    cout << "inflate err = " << err << endl;
+	    abort();
+	}
+
+	utag.append(reinterpret_cast<const char *>(buf),
+		    stream.next_out - buf);
+    }
+    Assert(utag.size() == stream.total_out);
+    if (utag.size() != stream.total_out) {
+	cout << "utag.size() != stream.total_out (" << utag.size() << "!=" << stream.total_out << ")" << endl;
+	abort();
+    }
+
+    err = inflateEnd(&stream);
+    if (err != Z_OK) abort();
+
+    swap(*tag, utag);
+#endif /* COMPRESSED_TAGS */
 }
 
 void
@@ -1356,6 +1523,38 @@ Btree::basic_open(bool revision_supplied
     max_item_size = (block_size - DIR_START - BLOCK_CAPACITY * D2) / BLOCK_CAPACITY;
 
     base_letter = ch;
+
+#ifdef COMPRESSED_TAGS
+    // If foo_compress exists then try to compress tags in foo_DB with zlib
+    // using the contents of foo_compress as a dictionary.
+    int fd = ::open((name + "compress").c_str(), O_RDONLY | O_BINARY);
+    compress_strategy = DONT_COMPRESS;
+    if (fd > 0) {
+	compress_strategy = Z_DEFAULT_STRATEGY;
+	compress_dict = sys_read_all_bytes(fd, 65536);
+	::close(fd);
+    }
+    fd = ::open((name + "compress_strategy").c_str(), O_RDONLY | O_BINARY);
+// Look at the first character of foo_compress_strategy:
+//	Z_DEFAULT_STRATEGY for normal data
+// F/f:	Z_FILTERED for data produced by a filter (or predictor)
+// H/h:	Z_HUFFMAN_ONLY to force Huffman encoding only (no string match)
+//// R/r:	Z_RLE to limit match distances to one (run-length encoding)
+    CompileTimeAssert(DONT_COMPRESS != Z_DEFAULT_STRATEGY);
+    CompileTimeAssert(DONT_COMPRESS != Z_FILTERED);
+    CompileTimeAssert(DONT_COMPRESS != Z_HUFFMAN_ONLY);
+//    CompileTimeAssert(DONT_COMPRESS != Z_RLE);
+    if (fd > 0) {
+	string s = sys_read_all_bytes(fd, 1);
+	::close(fd);
+	switch (s[0]) {
+	    case 'F': case 'f': compress_strategy = Z_FILTERED; break;
+	    case 'H': case 'h': compress_strategy = Z_HUFFMAN_ONLY; break;
+//	    case 'R': case 'r': compress_strategy = Z_RLE; break;
+	    default: compress_strategy = Z_DEFAULT_STRATEGY;
+	}
+    }
+#endif /* COMPRESSED_TAGS */
 
     /* ready to open the main file */
 
Index: backends/quartz/btree.h
===================================================================
RCS file: /usr/data/cvs/xapian/xapian-core/backends/quartz/btree.h,v
retrieving revision 1.87
diff -p -u -r1.87 btree.h
--- backends/quartz/btree.h	13 Dec 2004 01:47:49 -0000	1.87
+++ backends/quartz/btree.h	15 Dec 2004 15:02:20 -0000
@@ -24,6 +24,9 @@
 #ifndef OM_HGUARD_BTREE_H
 #define OM_HGUARD_BTREE_H
 
+// Compress tags with zlib.
+#define COMPRESSED_TAGS
+
 #include <algorithm>
 #include <string>
 using std::string;
@@ -179,7 +182,12 @@ public:
     Item_base(T p_, int c) : p(p_ + GETINT2(p_, c)) { }
     Item_base(T p_) : p(p_) { }
     T get_address() const { return p; }
+#ifdef COMPRESSED_TAGS
+    int size() const { return (GETINT2(p, 0) & 0x7fff); } /* I in diagram above */
+    bool get_compressed() const { return *p & 0x80; }
+#else /* COMPRESSED_TAGS */
     int size() const { return GETINT2(p, 0); } /* I in diagram above */
+#endif /* COMPRESSED_TAGS */
     int component_of() const {
 	return GETINT2(p, GETK(p, I2) + I2 - C2);
     }
@@ -265,9 +273,16 @@ public:
 	set_component_of(1);
     }
     // FIXME passing cd here is icky
+#ifdef COMPRESSED_TAGS
+    void set_tag(int cd, const char *start, int len, bool compressed) {
+#else /* COMPRESSED_TAGS */
     void set_tag(int cd, const char *start, int len) {
+#endif /* COMPRESSED_TAGS */
 	memmove(p + cd, start, len);
 	set_size(cd + len);
+#ifdef COMPRESSED_TAGS
+	if (compressed) *p |= 0x80;
+#endif /* COMPRESSED_TAGS */
     }
     void fake_root_item() {
 	set_key_len(K1 + C2);   // null key length
@@ -692,6 +707,14 @@ class Btree {
 	 *  when updating (in Btree::add_item().
 	 */
 	byte * split_p;
+
+#ifdef COMPRESSED_TAGS
+	/** DONT_COMPRESS or Z_DEFAULT_COMPRESSION, Z_HUFFMAN_ONLY, Z_RLE. */
+	int compress_strategy;
+
+	/** The dictionary (if any) to use for tag compression. */
+	string compress_dict;
+#endif /* COMPRESSED_TAGS */
 
 	/* Debugging methods */
 //	void report_block_full(int m, int n, const byte * p);


More information about the Xapian-discuss mailing list