provide a command line utility that converts the native format into pure plain text
add the configuration for that utility to /etc/mwsearch.conf
Install unzip, JExcel (jxl.jar, which needs a Java runtime), ppthtml and html2text.
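Edit /etc/mwsearch.conf and add the path to each of your converters. The scripts on this page are created in /usr/local/bin; any other location works too, as long as the path given in mwsearch.conf is exactly the location of the script file (the example line below uses /usr/bin/bin, so adjust it to wherever your scripts actually live).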
converter=doc=wvText docx=/usr/bin/bin/docx2txt odp=/usr/bin/bin/odp2txt odt=/usr/bin/bin/odt2txt pptx=/usr/bin/bin/pptx2txt ppt=/usr/bin/bin/ppt2html pdf=/usr/bin/bin/pdf2text xhtml=/usr/bin/bin/html2text html=/usr/bin/bin/html2text htm=/usr/bin/bin/html2text xls=/usr/bin/bin/xls2txt pl= c= h= inc= php= cs= txt= text= csv= xml= xsl= xslt=
MS Word 2007 Importer
create file /usr/local/bin/docx2txt and make it executable with chmod +x /usr/local/bin/docx2txt
#!/bin/sh
# docx2txt - extract the plain text of an MS Word 2007 (.docx) file.
#
# Usage:  docx2txt FILE.docx
# Output: the extracted text is written to FILE.docx.plain
#
# A .docx file is a zip archive; this script reads word/document.xml from it.

if [ -z "$1" ]; then
  echo "usage: docx2txt FILE.docx" >&2
  exit 2
fi

# Unpack into a private scratch directory rather than the shared /tmp/MS, so
# that conversions running at the same time cannot overwrite each other.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$workdir"' EXIT   # Delete excess (formatting) folders
trap 'exit 1' HUP INT TERM

unzip -oq "$1" -d "$workdir" || exit 1   # Extract the file

# Break the XML at every "<" so each line starts with a tag name, keep the
# lines whose tag starts with "w:t" (this also matches other w:t* tags, which
# normally contribute only empty lines), then print everything after the
# tag's closing ">". The field list is "2-": the former "2," ended in a
# dangling comma, which cut implementations may reject.
tr '<' '\012' < "$workdir/word/document.xml" \
  | grep '^w:t' \
  | cut -d'>' -f2- \
  | uniq > "$1.plain"
MS PowerPoint 2007 Importer
create file /usr/local/bin/pptx2txt and make it executable with chmod +x /usr/local/bin/pptx2txt
#!/bin/sh
# pptx2txt - extract the plain text of an MS PowerPoint 2007 (.pptx) file.
#
# Usage:  pptx2txt FILE.pptx
# Output: the extracted text is written to FILE.pptx.plain
#
# A .pptx file is a zip archive; this script reads ppt/slides/slide*.xml.

if [ -z "$1" ]; then
  echo "usage: pptx2txt FILE.pptx" >&2
  exit 2
fi

# Use a fresh scratch directory per run and always remove it. With the fixed
# /tmp/PP directory that was never deleted, slides left over from an earlier,
# longer presentation were picked up by the slide*.xml glob as well.
workdir=$(mktemp -d) || exit 1
trap 'rm -rf -- "$workdir"' EXIT   # Delete excess (formatting) folders
trap 'exit 1' HUP INT TERM

unzip -oq "$1" -d "$workdir" || exit 1   # Extract the file

# Break the XML at every "<" so each line starts with a tag name, keep the
# lines whose tag starts with "a:t", then print everything after the tag's
# closing ">". The field list is "2-": the former "2," ended in a dangling
# comma, which cut implementations may reject.
# The output file is overwritten (">"), not appended to, so converting the
# same file twice does not duplicate its text.
cat "$workdir"/ppt/slides/slide*.xml \
  | tr '<' '\012' \
  | grep '^a:t' \
  | cut -d'>' -f2- \
  | uniq > "$1.plain"
MS PowerPoint 97-2003 Importer
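create file /usr/local/bin/ppt2txt and make it executable with chmod +x /usr/local/bin/ppt2txt (note that the example converter line above refers to this script as ppt2html; use the same file name in both places)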
#!/bin/sh
# ppt2txt - convert an MS PowerPoint 97-2003 (.ppt) file to plain text.
#
# Usage:  ppt2txt INPUT.ppt OUTPUT.txt
#
# ppthtml renders the presentation as HTML on stdout; html2text then turns
# that HTML into plain text, which is written to OUTPUT.txt.

if [ "$#" -lt 2 ]; then
  echo "usage: ppt2txt INPUT.ppt OUTPUT.txt" >&2
  exit 2
fi

/usr/bin/ppthtml "$1" | html2text -nobs > "$2"
MS Excel 97-2003 Importer
create file /usr/local/bin/xls2txt and make it executable with chmod +x /usr/local/bin/xls2txt
jxl.jar must be in the same folder as the script.
#!/bin/sh
# xls2txt - convert an MS Excel 97-2003 (.xls) file to CSV text using JExcel.
#
# Usage:  xls2txt INPUT.xls OUTPUT.txt
#
# jxl.jar is expected in the same folder as this script.

if [ "$#" -lt 2 ]; then
  echo "usage: xls2txt INPUT.xls OUTPUT.txt" >&2
  exit 2
fi

# Look for jxl.jar next to the script instead of in the caller's current
# working directory, which is usually somewhere else when the indexer runs
# the converter. Fall back to the bare name so setups that relied on the
# working directory keep working.
script_dir=$(dirname "$0")
jar="$script_dir/jxl.jar"
[ -f "$jar" ] || jar=jxl.jar

java -jar "$jar" -csv "$1" > "$2"
OpenOffice Writer Importer
create file /usr/local/bin/odt2txt and make it executable with chmod +x /usr/local/bin/odt2txt
#!/bin/sh
# odt2txt - extract the plain text of an OpenOffice Writer (.odt) file.
#
# Usage:  odt2txt FILE.odt
# Output: the extracted text is written to FILE.odt.plain
#
# An .odt file is a zip archive; this script reads content.xml from it.

# Without an input file there is nothing to do (silent no-op, as before).
# "$1" is quoted so that a path containing spaces no longer breaks the test.
[ -n "$1" ] || exit 0

workdir=$(mktemp -d) || exit 1
# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf -- "$workdir"' EXIT   # Delete excess (formatting) folders
trap 'exit 1' HUP INT TERM

unzip -oq "$1" -d "$workdir" || exit 1   # Extract the file

# Break the XML at every "<" so each line starts with a tag name, keep the
# lines whose tag starts with "text", then print everything after the tag's
# closing ">". The field list is "2-": the former "2," ended in a dangling
# comma, which cut implementations may reject.
tr '<' '\012' < "$workdir/content.xml" \
  | grep '^text' \
  | cut -d'>' -f2- \
  | uniq > "$1.plain"
OpenOffice Impress Importer
create file /usr/local/bin/odp2txt and make it executable with chmod +x /usr/local/bin/odp2txt
#!/bin/sh
# odp2txt - extract the plain text of an OpenOffice Impress (.odp) file.
#
# Usage:  odp2txt FILE.odp
# Output: the extracted text is written to FILE.odp.plain
#
# An .odp file is a zip archive; this script reads content.xml from it.

# Without an input file there is nothing to do (silent no-op, as before).
# "$1" is quoted so that a path containing spaces no longer breaks the test.
[ -n "$1" ] || exit 0

workdir=$(mktemp -d) || exit 1
# Remove the scratch directory on every exit path, including failures.
trap 'rm -rf -- "$workdir"' EXIT   # Delete excess (formatting) folders
trap 'exit 1' HUP INT TERM

unzip -oq "$1" -d "$workdir" || exit 1   # Extract the file

# Break the XML at every "<" so each line starts with a tag name, then keep
# lines that start with "text:p" or contain "text:span" anywhere (the "^"
# anchor applies to the first alternative only), and print everything after
# the tag's closing ">". grep -E replaces the deprecated egrep. The field
# list is "2-": the former "2," ended in a dangling comma, which cut
# implementations may reject.
tr '<' '\012' < "$workdir/content.xml" \
  | grep -E '^text:p|text:span' \
  | cut -d'>' -f2- \
  | uniq > "$1.plain"
# Credit: http://opengarden.org/dekiwiki/confi...to_search_them (many thanx to Glenn Pipe)
http://opengarden.org/dekiwiki/confi..._formats_added (many thanx to rrusso2@gmail.com)
Feel free to correct this page and add other extensions. Greetings, wikiwurm
You can find most of the filters (scripts that convert binary content to plain text) in the /var/www/dekiwiki/bin/filters directory.