[Xapian-discuss] Compressed Btrees (bugged?)
Olly Betts
olly at survex.com
Wed Dec 15 15:56:10 GMT 2004
On Wed, Dec 15, 2004 at 03:36:39PM +0000, Olly Betts wrote:
> I'll work up a revised patch shortly.
Attached.
Cheers,
Olly
-------------- next part --------------
Index: Makefile.am
===================================================================
RCS file: /usr/data/cvs/xapian/xapian-core/Makefile.am,v
retrieving revision 1.138
diff -p -u -r1.138 Makefile.am
--- Makefile.am 2 Nov 2004 04:46:45 -0000 1.138
+++ Makefile.am 15 Dec 2004 15:02:20 -0000
@@ -41,5 +41,6 @@ lib_LTLIBRARIES = libxapian.la
# doesn't really work as we'd hope as we get to play with the mangled symbols
#libxapian_la_LDFLAGS = -export-symbols-regex 'Xapian'
libxapian_la_LIBADD = common/libcommon.la backends/libbackend.la \
- matcher/libmatcher.la languages/liblanguages.la api/libapi.la $(LIBNET_LA)
+ matcher/libmatcher.la languages/liblanguages.la api/libapi.la $(LIBNET_LA) \
+ -lz
Index: backends/quartz/btree.cc
===================================================================
RCS file: /usr/data/cvs/xapian/xapian-core/backends/quartz/btree.cc,v
retrieving revision 1.181
diff -p -u -r1.181 btree.cc
--- backends/quartz/btree.cc 13 Dec 2004 01:47:49 -0000 1.181
+++ backends/quartz/btree.cc 15 Dec 2004 15:46:34 -0000
@@ -77,6 +77,13 @@ PWRITE_PROTOTYPE
# include <io.h> // for _commit()
#endif
+#ifdef COMPRESSED_TAGS
+#include <zlib.h>
+// Only try to compress tags longer than this many bytes.
+const size_t COMPRESS_MIN = 4;
+#define DONT_COMPRESS -1
+#endif /* COMPRESSED_TAGS */
+
// Only useful for platforms like Windows which distinguish between text and
// binary files.
#ifndef __WIN32__
@@ -1089,6 +1096,62 @@ Btree::add(const string &key, string tag
form_key(key);
+#ifdef COMPRESSED_TAGS
+ bool compressed = false;
+ if (compress_strategy != DONT_COMPRESS && tag.size() > COMPRESS_MIN) {
+ z_stream stream;
+ int err;
+
+ stream.next_in = (Bytef *)(tag.data());
+ stream.avail_in = (uInt)tag.size();
+
+ // If compressed size is >= tag.size(), we don't want to compress.
+ unsigned long blk_len = tag.size() - 1;
+ unsigned char * blk = new unsigned char[blk_len];
+ stream.next_out = blk;
+ stream.avail_out = (uInt)blk_len;
+
+ stream.zalloc = reinterpret_cast<alloc_func>(0);
+ stream.zfree = reinterpret_cast<free_func>(0);
+ stream.opaque = (voidpf)0;
+
+ //cout << "deflateInit about to be called\n" << endl;
+ // -15 means raw deflate with 32K LZ77 window (largest)
+ // memLevel 9 is the highest (8 is default)
+ err = deflateInit2(&stream, Z_DEFAULT_COMPRESSION, Z_DEFLATED, -15, 9,
+ compress_strategy);
+ //cout << "deflateInit returns " << Z_OK << endl;
+ if (err != Z_OK) abort();
+ if (!compress_dict.empty()) {
+ err = deflateSetDictionary(&stream,
+ reinterpret_cast<const Bytef *>(compress_dict.data()),
+ compress_dict.size());
+ //cout << "deflateSetDictionary returns " << Z_OK << endl;
+ if (err != Z_OK) abort();
+ }
+
+ err = deflate(&stream, Z_FINISH);
+ if (err == Z_STREAM_END) {
+ // If deflate succeeded, then the output was at least one byte
+ // smaller than the input.
+ //cout << "deflate compressed "<<tag.size()<<"["<<tag<<"] to ";
+ tag.assign(reinterpret_cast<const char *>(blk), stream.total_out);
+ //cout << tag.size()<<"["<<tag<<"]"<<endl;
+ compressed = true;
+ err = deflateEnd(&stream);
+ if (err != Z_OK) {
+ cout << "deflateEnd failed, err = " << err << endl;
+ abort();
+ }
+ } else {
+ // Deflate failed - presumably the data wasn't compressible.
+ (void)deflateEnd(&stream);
+ }
+
+ delete [] blk;
+ }
+#endif /* COMPRESSED_TAGS */
+
// sort of matching kt.append_chunk(), but setting the chunk
const size_t cd = kt.key().length() + K1 + I2 + C2 + C2; // offset to the tag data
const size_t L = max_item_size - cd; // largest amount of tag data for any chunk
@@ -1127,7 +1190,11 @@ Btree::add(const string &key, string tag
size_t l = (i == m ? residue : (i == 1 ? first_L : L));
Assert(cd + l <= block_size);
Assert(string::size_type(o + l) <= tag.length());
+#ifndef COMPRESSED_TAGS
kt.set_tag(cd, tag.data() + o, l);
+#else /* COMPRESSED_TAGS */
+ kt.set_tag(cd, tag.data() + o, l, compressed);
+#endif /* COMPRESSED_TAGS */
kt.set_component_of(i);
o += l;
@@ -1221,6 +1288,9 @@ Btree::read_tag(Cursor * C_, string *tag
if (n > 1) tag->reserve(max_item_size * n);
item.append_chunk(tag);
+#ifdef COMPRESSED_TAGS
+ bool compressed = item.get_compressed();
+#endif /* COMPRESSED_TAGS */
for (int i = 2; i <= n; i++) {
next(C_, 0);
@@ -1228,6 +1298,103 @@ Btree::read_tag(Cursor * C_, string *tag
}
// At this point the cursor is on the last item - calling next will move
// it to the next key (Bcursor::get_tag() relies on this).
+#ifdef COMPRESSED_TAGS
+ if (!compressed) return;
+
+ // FIXME: Perhaps we should we decompress each chunk as we read it so we
+ // don't need both the full compressed and uncompressed tags in memory
+ // at once. But at least this makes for a cleaner patch while this is
+ // still somewhat experimental.
+ z_stream stream;
+ int err;
+
+ string utag;
+ // May not be enough for a compressed tag, but it's a reasonable guess.
+ utag.reserve(tag->size() + tag->size() / 2);
+
+ Bytef buf[8192];
+
+ //cout << "inflating" << endl;
+ stream.next_out = buf;
+ stream.avail_out = (uInt)sizeof(buf);
+
+ stream.zalloc = reinterpret_cast<alloc_func>(0);
+ stream.zfree = reinterpret_cast<free_func>(0);
+
+ Bytef header[6] = { 0x78, 0x20, 0, 0, 0, 0};
+ if (!compress_dict.empty()) {
+ stream.next_in = header;
+ stream.avail_in = 6;
+ uLong adler = adler32(0L, Z_NULL, 0);
+ stream.adler = adler;
+ adler = adler32(adler,
+ reinterpret_cast<const Bytef *>(compress_dict.data()),
+ compress_dict.size());
+ set_int4(header, 2, adler);
+ // cout << "inflateInit" << endl;
+ err = inflateInit(&stream);
+ if (err != Z_OK) {
+ cout << "inflateInit err = " << err << endl;
+ abort();
+ }
+ err = inflate(&stream, Z_SYNC_FLUSH);
+ if (err != Z_NEED_DICT) {
+ cout << "don't need dict?!" << endl;
+ abort();
+ }
+ err = inflateSetDictionary(&stream,
+ reinterpret_cast<const Bytef *>(compress_dict.data()),
+ compress_dict.size());
+ if (err != Z_OK) {
+ cout << "iSD failed with err = " << err << endl;
+ }
+ } else {
+ stream.next_in = Z_NULL;
+ stream.avail_in = 0;
+ // cout << "inflateInit2" << endl;
+ err = inflateInit2(&stream, -15);
+ if (err != Z_OK) {
+ cout << "inflateInit2 err = " << err << endl;
+ abort();
+ }
+ }
+
+ stream.next_in = (Bytef*)tag->data();
+ stream.avail_in = (uInt)tag->size();
+
+ while (err != Z_STREAM_END) {
+ stream.next_out = buf;
+ stream.avail_out = (uInt)sizeof(buf);
+ err = inflate(&stream, Z_SYNC_FLUSH);
+ if (err == Z_BUF_ERROR && stream.avail_in == 0) {
+ cout << "Z_BUF_ERROR - faking checksum of " << stream.adler << endl;
+ Bytef header[4];
+ set_int4(header, 0, stream.adler);
+ stream.next_in = header;
+ stream.avail_in = 4;
+ err = inflate(&stream, Z_SYNC_FLUSH);
+ if (err == Z_STREAM_END) break;
+ }
+
+ if (err != Z_OK && err != Z_STREAM_END) {
+ cout << "inflate err = " << err << endl;
+ abort();
+ }
+
+ utag.append(reinterpret_cast<const char *>(buf),
+ stream.next_out - buf);
+ }
+ Assert(utag.size() == stream.total_out);
+ if (utag.size() != stream.total_out) {
+ cout << "utag.size() != stream.total_out (" << utag.size() << "!=" << stream.total_out << ")" << endl;
+ abort();
+ }
+
+ err = inflateEnd(&stream);
+ if (err != Z_OK) abort();
+
+ swap(*tag, utag);
+#endif /* COMPRESSED_TAGS */
}
void
@@ -1356,6 +1523,38 @@ Btree::basic_open(bool revision_supplied
max_item_size = (block_size - DIR_START - BLOCK_CAPACITY * D2) / BLOCK_CAPACITY;
base_letter = ch;
+
+#ifdef COMPRESSED_TAGS
+ // If foo_compress exists then try to compress tags in foo_DB with zlib
+ // using the contents of foo_compress as a dictionary.
+ int fd = ::open((name + "compress").c_str(), O_RDONLY | O_BINARY);
+ compress_strategy = DONT_COMPRESS;
+ if (fd > 0) {
+ compress_strategy = Z_DEFAULT_STRATEGY;
+ compress_dict = sys_read_all_bytes(fd, 65536);
+ ::close(fd);
+ }
+ fd = ::open((name + "compress_strategy").c_str(), O_RDONLY | O_BINARY);
+// Look at the first character of foo_compress_strategy:
+// Z_DEFAULT_STRATEGY for normal data
+// F/f: Z_FILTERED for data produced by a filter (or predictor)
+// H/h: Z_HUFFMAN_ONLY to force Huffman encoding only (no string match)
+//// R/r: Z_RLE to limit match distances to one (run-length encoding)
+ CompileTimeAssert(DONT_COMPRESS != Z_DEFAULT_STRATEGY);
+ CompileTimeAssert(DONT_COMPRESS != Z_FILTERED);
+ CompileTimeAssert(DONT_COMPRESS != Z_HUFFMAN_ONLY);
+// CompileTimeAssert(DONT_COMPRESS != Z_RLE);
+ if (fd > 0) {
+ string s = sys_read_all_bytes(fd, 1);
+ ::close(fd);
+ switch (s[0]) {
+ case 'F': case 'f': compress_strategy = Z_FILTERED; break;
+ case 'H': case 'h': compress_strategy = Z_HUFFMAN_ONLY; break;
+// case 'R': case 'r': compress_strategy = Z_RLE; break;
+ default: compress_strategy = Z_DEFAULT_STRATEGY;
+ }
+ }
+#endif /* COMPRESSED_TAGS */
/* ready to open the main file */
Index: backends/quartz/btree.h
===================================================================
RCS file: /usr/data/cvs/xapian/xapian-core/backends/quartz/btree.h,v
retrieving revision 1.87
diff -p -u -r1.87 btree.h
--- backends/quartz/btree.h 13 Dec 2004 01:47:49 -0000 1.87
+++ backends/quartz/btree.h 15 Dec 2004 15:02:20 -0000
@@ -24,6 +24,9 @@
#ifndef OM_HGUARD_BTREE_H
#define OM_HGUARD_BTREE_H
+// Compress tags with zlib.
+#define COMPRESSED_TAGS
+
#include <algorithm>
#include <string>
using std::string;
@@ -179,7 +182,12 @@ public:
Item_base(T p_, int c) : p(p_ + GETINT2(p_, c)) { }
Item_base(T p_) : p(p_) { }
T get_address() const { return p; }
+#ifdef COMPRESSED_TAGS
+ int size() const { return (GETINT2(p, 0) & 0x7fff); } /* I in diagram above */
+ bool get_compressed() const { return *p & 0x80; }
+#else /* COMPRESSED_TAGS */
int size() const { return GETINT2(p, 0); } /* I in diagram above */
+#endif /* COMPRESSED_TAGS */
int component_of() const {
return GETINT2(p, GETK(p, I2) + I2 - C2);
}
@@ -265,9 +273,16 @@ public:
set_component_of(1);
}
// FIXME passing cd here is icky
+#ifdef COMPRESSED_TAGS
+ void set_tag(int cd, const char *start, int len, bool compressed) {
+#else /* COMPRESSED_TAGS */
void set_tag(int cd, const char *start, int len) {
+#endif /* COMPRESSED_TAGS */
memmove(p + cd, start, len);
set_size(cd + len);
+#ifdef COMPRESSED_TAGS
+ if (compressed) *p |= 0x80;
+#endif /* COMPRESSED_TAGS */
}
void fake_root_item() {
set_key_len(K1 + C2); // null key length
@@ -692,6 +707,14 @@ class Btree {
* when updating (in Btree::add_item().
*/
byte * split_p;
+
+#ifdef COMPRESSED_TAGS
+ /** DONT_COMPRESS or Z_DEFAULT_COMPRESSION, Z_HUFFMAN_ONLY, Z_RLE. */
+ int compress_strategy;
+
+ /** The dictionary (if any) to use for tag compression. */
+ string compress_dict;
+#endif /* COMPRESSED_TAGS */
/* Debugging methods */
// void report_block_full(int m, int n, const byte * p);
More information about the Xapian-discuss
mailing list