#!/usr/bin/perl

# Driver script for the ApiComplexa analysis pipeline.
#
# Usage: <this script> <propertiesFile>
#
# Reads the pipeline properties file given as the first command-line
# argument, builds a GUS::Pipeline::Manager around it, creates the task
# directories, and then runs the pipeline steps (all implemented in
# ApiComplexa::DataLoad::Steps) in the order listed below.

use strict;
use warnings;

use lib "$ENV{GUS_HOME}/lib/perl";
use GUS::Pipeline::Manager;
use GUS::Pipeline::MakeTaskDirs;
use GUS::Pipeline::NfsCluster;
use GUS::Pipeline::SshCluster;
use CBIL::Util::PropertySet;
use CBIL::Util::GenomeDir;
use File::Basename;

# the "Steps" module contains all the pipeline-step routines
# that are called below.
use ApiComplexa::DataLoad::Steps;

$| = 1;       # unbuffered STDOUT
umask 002;    # files created by the pipeline are group-writable

my $propertiesFile = $ARGV[0];

# Guard against a missing argument as well as a missing file, so that -e is
# never applied to undef.
&usage() unless defined $propertiesFile && -e $propertiesFile;

# [name, default (or null if reqd), comment]
my @properties = (
    # properties required by Pipeline API - incomplete, waiting for steps
    ["blastmatrix.taskSize",           "",      "number of seqs to process per blastmatrix subtask"],
    ["blastsimilarity.params",         "",      "paramters for blastsimilarity"],
    ["blastsimilarity.taskSize",       "",      "number of seqs to process per blastsimilarity subtask"],
    ["buildDir",                       "",      "root of the build's directory tree"],
    ["cap4Dir",                        "",      "directory containing executable cap4"],
    ["clusterServer",                  "",      "full name of cluster server"],
    ["contigDbName",                   "",      "sres.externaldatabase.name for contigs"],
    ["contigDbRlsVer",                 "",      "sres.externaldatabaserelease.version for contigs"],
    ["externalDbDir",                  "",      "directory for downloading data from external databases"],
    ["fileOfRepeats",                  "",      "repeat file for use by repeatmasker"],
    ["gb_db_rel_id",                   "",      "genbank external_database_release_id"],
    ["genusNickname",                  "",      "e.g Crypto"],
    # gusConfigFile was declared twice; the two entries are merged here.
    ["gusConfigFile",                  "",      "gus configuration file"],
    ["includeSubspecies",              "",      "yes or no, get a list of taxon_ids for the subspecies of the given taxon_id(ncbi_tax_id)"],
    ["intermedCluster.length",         "",      "min length cut-off for clustering algorithm"],
    ["intermedCluster.percent",        "",      "min percent identity cutoff for clustering algorithm"],
    ["intermedCluster.logbase",        "",      ""],
    ["intermedCluster.consistentEnds", "",      "requirement for consistent ends, yes or no"],
    ["intermedCluster.cliqueSzArray",  "",      ""],
    ["intermedCluster.logbaseArray",   "",      ""],
    ["loadcontigs-nrdbRestart",        "",      "algorithm_invocation_id(s) of previous starts, for restart of BLAST similarity loading plugin"],
    ["nrdbDbName",                     "",      "the externaldatabase.name for nrdb"],
    ["nrdbDbRlsVer",                   "",      "the externaldatabaserelease.version for the current nrdb"],
    ["nodePath",                       "",      "full path of scratch dir on cluster node"],
    ["phrapDir",                       "",      "full path of directory containing phrap software"],
    ["projectId",                      "",      "project_id from core.projectInfo"],
    ["release",                        "",      "build release number"],
    ["repeatmask.dangleMax",           "",      "option for trimDangling in repeatMasker"],
    ["repeatmask.options",             "",      "options to pass to repeatmasker"],
    ["repeatmask.path",                "",      "path on node of repeatmask executable"],
    ["repeatmask.taskSize",            "",      "number of seqs to process per repeatmask subtask"],
    ["serverPath",                     "",      "full path of update dir on cluster server"],
    ["speciesNickname",                "",      "eg cparvum"],
    ["speciesFullname",                "",      "eg Cryptosporidium parvum"],
    ["stopBefore",                     "none",  "the step to stop before. uses the signal name"],
    ["taskSize",                       "",      "number of seqs to process per subtask"],
    ["ncbiTaxId",                      "",      "tax_id from NCBI taxonomy db"],
    ["testNextPlugin",                 "false", "'true' to run in no-commit mode the first plugin not yet done and then exit"],
    ["wuBlastBinPathCluster",          "",      "path of wu blast bin dir on Liniac"],
);

# NOTE(review): 'clusterUser' and 'reassemble' are read below but are not
# declared above; presumably the third constructor argument (1) lets the
# property file supply undeclared properties -- confirm against
# CBIL::Util::PropertySet.
my $propertySet = CBIL::Util::PropertySet->new($propertiesFile, \@properties, 1);

my $buildDir      = $propertySet->getProp('buildDir');
my $release       = $propertySet->getProp('release');
my $genusNickname = $propertySet->getProp('genusNickname');

my $buildName   = "$release/analysis_pipeline/$genusNickname";
my $pipelineDir = "$buildDir/$buildName";

########################## The Pipeline ##########################

# A clusterServer value of "none" selects the NFS cluster implementation;
# any other value is used as the host for an SSH cluster.
my $cluster;
if ($propertySet->getProp('clusterServer') ne "none") {
    $cluster = GUS::Pipeline::SshCluster->new(
        $propertySet->getProp('clusterServer'),
        $propertySet->getProp('clusterUser'),
    );
}
else {
    $cluster = GUS::Pipeline::NfsCluster->new();
}

my $mgr = GUS::Pipeline::Manager->new(
    $pipelineDir,
    $propertySet,
    $propertiesFile,
    $cluster,
    $propertySet->getProp('testNextPlugin'),
);

# Stash the values the step routines read back off the manager.
$mgr->{buildName}      = $buildName;
$mgr->{pipelineDir}    = $pipelineDir;
$mgr->{propertiesFile} = $propertiesFile;
$mgr->{propertySet}    = $propertySet;

my $taxonId = &getTaxonIdFromTaxId($mgr);
$mgr->{taxonId} = $taxonId;

# ---- directory set-up ------------------------------------------------

&createPipelineDir($mgr);

# The unaligned-transcripts dataset was spelled four different ways across
# this script; it is "unalignedTranscripts" everywhere now so that the
# directories created here match the ones the later steps refer to.
&createBlastMatrixDir($mgr, "intermedTranscriptCons", "intermedTranscriptCons");
&createBlastMatrixDir($mgr, "intermedTranscriptCons", "unalignedTranscripts");
&createBlastMatrixDir($mgr, "unalignedTranscripts",   "unalignedTranscripts");

# The regex argument is single-quoted: inside double quotes "\d" is not a
# recognised escape and the string would silently become "(d+)".
&createSimilarityDir($mgr, "contigs", "nrdb", '(\d+)',
    "matrix=BLOSUM62 V=100 B=1000000 -hspmax=1000000 W=4 T=18 -gi E=1e-3 -wordmask=seg -hspsepQmax=50000",
    "blastx");
&createSimilarityDir($mgr, "annotatedProteins", "nrdb", '(\d+)',
    "matrix=BLOSUM62 -topcomboN=1 V=10 B=10 W=4 T=18 -span1 -gi E=1e-3 -wordmask=seg",
    "blastp");

&createRepeatMaskDir($mgr, "newTranscripts");
&createRepeatMaskDir($mgr, "intermedTranscriptCons");
&createRepeatMaskDir($mgr, "unalignedTranscripts");
&createRepeatMaskDir($mgr, "finalTranscriptCons");

&createGenomeDir($mgr, "newTranscripts",      "contigs");
&createGenomeDir($mgr, "finalTranscriptCons", "contigs");

&copyPipelineDirToComputeCluster($mgr);

# ---- contigs vs nrdb -------------------------------------------------

&extractContigs($mgr);
&copyFilesToComputeCluster($mgr, "contigs");    # signal contigsExtract

&extractNRDB($mgr);
&copyFilesToComputeCluster($mgr, "nrdb");

# NOTE(review): this call passes "nrdb" whereas the annotated-proteins call
# further down passes the query name ("annotatedProteins") -- confirm which
# one the step routine expects.
&startProteinBlastOnComputeCluster($mgr, "nrdb");
&copyFilesFromComputeCluster($mgr, "contigs-nrdb", "similarity");
&loadProteinBlast($mgr, "contigs-nrdb", "DoTS::ExternalAASequence");

# ---- annotated proteins vs nrdb --------------------------------------

&extractAnnotatedProteins($mgr);
&copyFilesToComputeCluster($mgr, "annotatedProteins");
&startProteinBlastOnComputeCluster($mgr, "annotatedProteins");
&copyFilesFromComputeCluster($mgr, "annotatedProteins-nrdb", "similarity");
&loadProteinBlast($mgr, "annotatedProteins-nrdb", "DoTS::TranslatedAASequence");

# ---- initial transcript clustering and assembly ----------------------

&makeTranscriptSeqs($mgr);
&extractTranscriptSeqs($mgr, "newTranscripts");
&copyFilesToComputeCluster($mgr, "newTranscripts");
&startTranscriptAlignToContigs($mgr, "newTranscripts");
&copyFilesFromComputeCluster($mgr, "newTranscripts", "repeatmask");
&copyFilesFromComputeCluster($mgr, "newTranscripts-contigs", "genome");
&loadContigAlignments($mgr, "newTranscripts", "contigs");

&clusterByContigAlign($mgr, "initial");
&splitCluster("initial", $mgr);
&assembleTranscripts("", $propertySet->getProp('reassemble'), "initial", $mgr);
&reassembleTranscripts("initial", $mgr);
&deleteAssembliesWithNoTranscripts($mgr, "initial");

# ---- intermediate clustering and assembly ----------------------------

&extractAssemblies($mgr, "intermedTranscriptCons");
&copyFilesToComputeCluster($mgr, "intermedTranscriptCons");
&extractTranscriptSeqs($mgr, "unalignedTranscripts");
&copyFilesToComputeCluster($mgr, "unalignedTranscripts");
&startTranscriptMatrixOnComputeCluster($mgr, "intermed");
&copyFilesFromComputeCluster($mgr, "intermedTranscriptCons", "matrix");

&clusterByBlastSim($mgr, "intermed", "intermed-intermed", "unaligned-unaligned", "intermed-unaligned");

# NOTE(review): argument order made consistent with the "initial" call above
# and with assembleTranscripts/reassembleTranscripts (name first, manager
# last) -- confirm against the splitCluster definition in Steps.
&splitCluster("intermed", $mgr);
&assembleTranscripts("--assemble_old", "no", "intermed", $mgr);
&reassembleTranscripts("intermed", $mgr);
&deleteAssembliesWithNoTranscripts($mgr, "intermed");

# ---- final consensus alignment ---------------------------------------

&extractAssemblies($mgr, "finalTranscriptCons");
&copyFilesToComputeCluster($mgr, "finalTranscriptCons");
&startTranscriptAlignToContigs($mgr, "finalTranscriptCons");
&copyFilesFromComputeCluster($mgr, "finalTranscriptCons-contigs", "genome");
# Uses the same dataset name as createGenomeDir and the copy-back step above.
&loadContigAlignments($mgr, "finalTranscriptCons", "contigs");

$mgr->goodbye("Pipeline complete!\n");