Motif iterator
Installation
# Install the Apache Arrow and Parquet development libraries from the Apache
# APT repository; based on https://arrow.apache.org/install/
# NOTE(review): the download host below is kept as originally documented;
# confirm it against the current Arrow install page before relying on it.
sudo apt-get update
sudo apt-get install -y -V ca-certificates lsb-release wget

# Resolve the distribution id (lower-cased) and codename once, so the download
# URL and the local package path are guaranteed to refer to the same file.
distro_id="$(lsb_release --id --short | tr 'A-Z' 'a-z')"
distro_codename="$(lsb_release --codename --short)"
arrow_apt_source="apache-arrow-apt-source-latest-${distro_codename}.deb"

wget "https://apache.jfrog.io/artifactory/arrow/${distro_id}/${arrow_apt_source}"
sudo apt-get install -y -V "./${arrow_apt_source}"
# Refresh the package index so the newly registered Arrow repository is visible.
sudo apt-get update
sudo apt-get install -y -V libarrow-dev libarrow-glib-dev libparquet-dev libparquet-glib-dev

# Clone the git repo if not done yet (skipped when the checkout already exists).
if [ ! -d suffixtree-motif-speller ]; then
  git clone https://dries_decap@bitbucket.org/dries_decap/suffixtree-motif-speller.git
fi

# Out-of-source CMake build; -p keeps the step re-runnable when build/ exists.
cd suffixtree-motif-speller/motifIterator
mkdir -p build
cd build
cmake ../
make
Execution
Run the motif iterator like this:
# Example invocations of ./motifIterator in discovery mode.
# Every expansion is quoted so paths containing spaces or glob characters are
# passed to the program as single, unmodified arguments.
input="/data/bls/wheat/wheat_12/GENE_FAMILY_0001"
output="/tmp/gene_family_0001.parquet"
bls_thresholds="0.1,0.5,0.6,0.7,0.8" # comma separated list of bls thresholds
length=8
degen=3

./motifIterator discovery "${input}" \
  --bls "${bls_thresholds}" \
  --length "${length}" \
  --fullIupac \
  --degen "${degen}" \
  --parquet \
  --output "${output}"

# or in counted mode if multiple gene families are in a single file
./motifIterator discovery "${input}" \
  --bls "${bls_thresholds}" \
  --length "${length}" \
  --fullIupac \
  --degen "${degen}" \
  --parquet \
  --output "${output}" \
  --count

# or read data from stdin to process multiple files
# The glob is expanded into a bash array at assignment time, so each matching
# file becomes exactly one argument to cat; "-" is given in place of the input
# path so the data is taken from the pipe.
inputs=(/data/bls/wheat/wheat_12/GENE_FAMILY_000*)
cat "${inputs[@]}" \
  | ./motifIterator discovery - \
    --bls "${bls_thresholds}" \
    --length "${length}" \
    --fullIupac \
    --degen "${degen}" \
    --parquet \
    --output "${output}" \
    --count