[Xapian-discuss] docx support
Colin Bell
colinabell at gmail.com
Thu Jul 24 09:08:24 BST 2008
This is how I do it using the TinyXML parser. My XML parsing may be a bit
convoluted, but it works. The same approach can be applied to PowerPoint
(pptx) and Excel (xlsx) files too.
...
// Map the Office Open XML (Office 2007) file extensions to their MIME types.
// Each MIME type must be a single unbroken string literal; the mail archive
// had wrapped them across two lines, which does not compile.
mime_map["docx"] = "application/vnd.openxmlformats-officedocument.wordprocessingml.document";
mime_map["pptx"] = "application/vnd.openxmlformats-officedocument.presentationml.presentation";
mime_map["xlsx"] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet";
...
// HANDLE DOCX WORD DOCUMENTS
// The file is treated as a zip archive: each XML part of interest is streamed
// to stdout with `unzip -p` and the captured output is parsed.
if (mimetype == "application/vnd.openxmlformats-officedocument.wordprocessingml.document") {
    // Document properties: core, application and custom metadata parts are
    // each parsed and appended to fileData.
    // NOTE(review): these three extractions run outside the try block below;
    // if mstdout_to_string() throws ReadError when a part is missing from the
    // archive (docProps/custom.xml is not always present), the exception
    // propagates to the caller - confirm that is intended.
    string cmd = "unzip -p " + shell_protect(filepath) + " docProps/core.xml";
    fileData += parseWordXMetaData(mstdout_to_string(cmd));

    cmd = "unzip -p " + shell_protect(filepath) + " docProps/app.xml";
    fileData += parseWordXMetaData(mstdout_to_string(cmd));

    cmd = "unzip -p " + shell_protect(filepath) + " docProps/custom.xml";
    fileData += parseWordXCustomMetaData(mstdout_to_string(cmd));

    // Main document body: run it through the XML parser and keep its dump.
    cmd = "unzip -p " + shell_protect(filepath) + " word/document.xml";
    try {
        XmlParser xmlparser;
        xmlparser.parse_html(mstdout_to_string(cmd));
        dump = xmlparser.dump;
    } catch (const ReadError&) {
        // Extraction of the body failed: report the command and skip the file.
        cout << "\"" << cmd << "\" failed - skipping\n";
        return 0;
    }
}
string parseWordXCustomMetaData(string xml){
string fileData = "";
TiXmlDocument doc;
doc.Parse((char *) xml.c_str());
TiXmlElement* root = doc.RootElement();
if(root){
TiXmlNode * pParent = root->FirstChild();
if(pParent){
TiXmlNode * pChild = root->IterateChildren(pParent);
for (pChild = pParent; pChild != 0; pChild = pChild->NextSibling()){
if(pChild){
TiXmlElement* aElem = pChild->ToElement();
if(aElem){
string name = aElem->Attribute("name");
TiXmlNode * pProperty = aElem->FirstChild();
if(pProperty){
TiXmlNode * pPropertyChild = aElem->IterateChildren(pProperty);
for (pPropertyChild = pProperty; pPropertyChild != 0;
pPropertyChild = pPropertyChild->NextSibling()){
if(pPropertyChild){
TiXmlElement* bElem = pPropertyChild->ToElement();
if(bElem->GetText()){
fileData+= "name:" + name + "=\"" + bElem->GetText() + "\"\n";
}
}
}
}
}
}
}
}
}
return fileData;
}
Easy peasy ;-)
On 23 Jul 2008, at 19:38, Frank Bruzzaniti wrote:
> Are Office 2007 formats like docx supported?
>
> Is there any way to get Xapian to index Office 2007 formats?
>
> Is there any option/procedure to add a new mime plugin?
> For example, if you rename a docx file to .zip you can retrieve the text
> from document.xml
>
> Thanks
>
> Frank
>
> _______________________________________________
> Xapian-discuss mailing list
> Xapian-discuss at lists.xapian.org
> http://lists.xapian.org/mailman/listinfo/xapian-discuss
More information about the Xapian-discuss
mailing list