Motif iterator

Installation

    # install apache arrow libraries; based on https://arrow.apache.org/install/
    sudo apt-get update
    sudo apt-get install -y -V ca-certificates lsb-release wget
    wget https://apache.jfrog.io/artifactory/arrow/$(lsb_release --id --short | tr 'A-Z' 'a-z')/apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
    sudo apt-get install -y -V ./apache-arrow-apt-source-latest-$(lsb_release --codename --short).deb
    sudo apt-get update
    sudo apt-get install -y -V libarrow-dev libarrow-glib-dev libparquet-dev libparquet-glib-dev

    # clone git repo if not done yet
    git clone https://dries_decap@bitbucket.org/dries_decap/suffixtree-motif-speller.git
    cd suffixtree-motif-speller/motifIterator
    mkdir build
    cd build
    cmake ../
    make

Execution

Run the motif iterator like this:

    input="/data/bls/wheat/wheat_12/GENE_FAMILY_0001"
    output="/tmp/gene_family_0001.parquet"
    bls_thresholds="0.1,0.5,0.6,0.7,0.8" # comma separated list of bls thresholds
    length=8
    degen=3
    ./motifIterator discovery ${input} --bls ${bls_thresholds} --length ${length} --fullIupac --degen ${degen} --parquet --output ${output}

    # or in counted mode if multiple gene families are in a single file
    ./motifIterator discovery ${input} --bls ${bls_thresholds} --length ${length} --fullIupac --degen ${degen} --parquet --output ${output} --count

    # or read data from stdin to process multiple files
    input="/data/bls/wheat/wheat_12/GENE_FAMILY_000*"
    cat ${input} | ./motifIterator discovery - --bls ${bls_thresholds} --length ${length} --fullIupac --degen ${degen} --parquet --output ${output} --count