gwAlign-Unify 42KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055
#!/usr/bin/env python
import argparse
# 'SEDMATCH*' placeholders are substituted at install time (sed) -- TODO confirm
version='SEDMATCHGITVERSION'
year=2018
author='Julien Fouret'
contact='julien@fouret.me'
##parse argument
parser = argparse.ArgumentParser(description=
"""
Genome-Wide Alignments - Unify
From Exons to well-aligned and filtered CDS
---------------
Steps automatized on HPC infrastructure with upype python package
For more informations:
https://fouret.me/gitea/jfouret/upype
---------------
-------------------------------------------------------------------
mode:create First create a file architecture to extract CDS
from a database of MSA (Multiple Sequence
Alignment) at exon-level formatted like UCSC.
-------------------------------------------------------------------
mode:getCDS Then reassemble all exons into one CDS avoiding
reproducing mistakes in the parent file.
-------------------------------------------------------------------
mode:annotate Optionally get the corresponding name (HUGO) of
the gene with the accession ids (UCSC, NCBI and
Uniprot) and add kegg, go and uniprot
annotations based on a UCSC-like mySQL
database.
-------------------------------------------------------------------
mode:align re-align all CDS with a coding-aware method
(macse).
-------------------------------------------------------------------
mode:compute_TCS Compute TCS (Transitive consistency score) with
t_coffee for each alignment
-------------------------------------------------------------------
mode:blast_search Search in Trembl and sp the sequence from the
reference species to associate an id.
-------------------------------------------------------------------
mode:stat_filter Use TCS score to filter MSA and to produce
statistics on sequence qualities by species.
-------------------------------------------------------------------
mode:report Build an html report with quality assesments
-------------------------------------------------------------------
"""
,epilog="Version : "+str(version)+"\n"+str(year)+"\nAuthor : "+author+" for more informations or enquiries please contact "+contact,formatter_class=argparse.RawTextHelpFormatter)
# The bracketed tag in each help string lists the mode(s) using the option.
parser.add_argument('-mode', metavar='keyword', required=True, help="Only one by one")
parser.add_argument('-out', metavar='/path', required=True, help="[all] folder in which to write all computed alignments ")
parser.add_argument('-v',action='store_true', help="[all] verbose")
parser.add_argument('-debug',action='store_true', help="[all] more verbose")
parser.add_argument('-al', metavar='/path', required=False, help="[create/getCDS] alignment file in fasta format ")
parser.add_argument('-spec', metavar='/path' , required=False, help="""[getCDS] tab file with assemblies info (name:path)
(with .dict and .fai)""")
parser.add_argument('-ref', metavar='name', required=False, help="""[create/align/compute_TCS/stat_filter] reference species for multiple alignment
mode annotate hase been design and tested only for hg19
(default='hg19')""",default='hg19')
parser.add_argument('-queue', metavar='queue', required=False, help="""[getCDS/align/compute_TCS] queue for PBS
(default='SEDMATCHQUEUE')""",default='SEDMATCHQUEUE')
parser.add_argument('-batchLim', metavar='N', default='10', help="""[getCDS/align/compute_TCS] Number of operation per batch
(default='10')""")
parser.add_argument('-mems', metavar='lm,rm,hm', default='10,3,256', help="""[align] Precise memory config in GB for alignments:
>lm is the basic parameter to use first for alignments
>rs is what jobs will reserve
>hm is an increased amount of memory for failures
default='10,3,256'""")
parser.add_argument('-keep',action='store_true', help="[align/compute_TCS] Only files with missing target (subsequent errors)")
parser.add_argument('-name', metavar='name' , required=False, help="""[stat_filter/report] sub-folder with fitered data
(default='default_filter')""",default='default_filter')
parser.add_argument('-host', metavar='IP' , required=False, help="""[annotate] mysql host for ucsc database
(default='10.0.0.200')""",default='10.0.0.200')
parser.add_argument('-port', metavar='PORT' , required=False, help="""[annotate]mysql port for ucsc database
(default='3306')""",default='3306')
parser.add_argument('-blast_db', metavar='path', required=False, help="""[blast_search] Blast database
with Trembl and Sprot sequences from ref species
(default=None)""",default=None)
parser.add_argument('-cpu', metavar='N', default='1', help="""[stat_filter] Number of cpu to be used
(default='1')""")
parser.add_argument('-phase_filter', metavar='N,N', required=False, help="""[stat_filter] Min TCS score with:
1: consensus sequence
2: each sequences
(default='5,3')""",default='5,3')
parser.add_argument('-ratio', metavar='N', required=False, help="""[stat_filter] Min ratio species conserved in MSA
min ratio of species conseserve/all species considered
(default='0.6')""",default='0.6')
parser.add_argument('-focus', metavar='name,name', required=False, help="""[stat_filter] sub list of species that matters
the ratio threshold is re-applied on this sub-list.
comma-separated
(default=None)""",default=None)
parser.add_argument('-picard', metavar='/path', required=False, help="""[getCDS] picard jar path
(default='SEDMATCHPICARD')""",default='SEDMATCHPICARD')
parser.add_argument('-macse', metavar='/path', required=False, help="""[align] macse jar path
(default='SEDMATCHMACSE')""",default='SEDMATCHMACSE')
args=parser.parse_args()
  94. import os
  95. import sys
  96. import mysql.connector
  97. from Bio import SeqIO
  98. from Bio.Seq import Seq
  99. import re
  100. from upype import *
  101. ##define function
  102. def writedb(query,file_name,header):
  103. global cnx
  104. global args
  105. if args.v:
  106. print query
  107. print("\n")
  108. cursor=cnx.cursor()
  109. cursor.execute(query)
  110. rows=cursor.fetchall()
  111. tab_file=open(file_name,"w")
  112. tab_file.write("\t".join(header))
  113. for row in rows:
  114. tab_file.write("\n")
  115. tab_file.write("\t".join(map(str,row)))
  116. tab_file.close()
  117. cursor.close()
def Get_annotation(ID):
    """
    @summary: Take a kgID as input to search the ucsc database for name, cross reference ids and functional information for KEGG and GO annotation spaces. All information are written in other files.
    @param ID: a kgID, kg stand for knowGene
    @type ID: str
    @return: the deduplicated, filesystem-safe gene symbol chosen for this kgID
    """
    global args
    global iterNoName
    global allSymbol
    field='kgID'
    #global ID_type
    # Shared WHERE clause for all queries below.
    # NOTE(review): ID is spliced into the SQL without escaping -- safe only
    # while kgIDs come from trusted UCSC input; confirm upstream sanitization.
    query_where='WHERE (ref.'+field+'="'+ID+'");'
    # Cross-reference query: HGNC symbol plus UCSC/Uniprot/RefSeq accessions.
    query_name=['SELECT hgnc.symbol,ref.kgID,ref.spID,ref.refseq,ref.geneSymbol']
    query_name.append('FROM kgXref ref')
    query_name.append('LEFT JOIN proteinDB.hgncXref hgnc ON ( hgnc.uniProt=ref.spID ) AND (hgnc.refSeq=ref.refseq)')
    query_name.append(query_where)
    query_alias=['SELECT ref.alias']
    query_alias.append('FROM kgAlias ref')
    query_alias.append(query_where)
    # Uniprot feature annotations (sites, domains...) for the protein.
    query_uniprot=['SELECT ref.geneSymbol,feats.start,feats.end,class.val AS `class`,type.val AS `type`,feats.softEndBits AS `OutOfRange`']
    query_uniprot.append('FROM uniProt.feature feats')
    query_uniprot.append('LEFT JOIN kgXref ref ON (ref.spID=feats.acc)')
    query_uniprot.append('LEFT JOIN uniProt.featureClass class ON (class.id=feats.featureClass)')
    query_uniprot.append('LEFT JOIN uniProt.featureType type ON (type.id=feats.featureType)')
    query_uniprot.append(query_where)
    query_kegg=['SELECT ref.geneSymbol,kegg.mapID,des.description']
    query_kegg.append('FROM keggPathway kegg')
    query_kegg.append('LEFT JOIN kgXref ref ON (ref.kgID=kegg.kgID)')
    query_kegg.append('LEFT JOIN keggMapDesc des ON (kegg.mapID=des.mapID)')
    query_kegg.append(query_where)
    query_go=['SELECT ref.geneSymbol, goa.goId, goterm.name, goterm.term_type']
    query_go.append('FROM go.goaPart goa')
    query_go.append('LEFT JOIN kgXref ref ON (ref.spID=goa.dbObjectId)')
    query_go.append('LEFT JOIN go.term goterm ON (goa.goId=goterm.acc)')
    query_go.append(query_where)
    annotPath=rootedDir.results+"/"+ID+"/annotation/"
    mkdirp(annotPath)
    writedb(" ".join(query_name),annotPath+"/name.tab",['geneSymbol','kgID','uniprot','refSeq','kgName'])
    name_file=open(annotPath+"/name.tab","r")
    # Only the first data row (index 1, after the header line) is used.
    name_file_line1=name_file.readlines()[1]
    gene_symbol=name_file_line1.split("\t")[0]
    hgnc=name_file_line1.split("\t")[0]
    spID=name_file_line1.split("\t")[2]
    refseq=name_file_line1.split("\t")[3]
    kgname=name_file_line1.split("\t")[4]
    gene_symbol=hgnc
    # Fall back to other accessions when identifiers are missing.
    # NOTE(review): these elif branches also fire when hgnc IS present, so a
    # valid HGNC symbol gets replaced whenever kgname/spID/refseq is empty --
    # confirm this precedence is intended.
    if (hgnc=='None' or hgnc=='') or (spID=='' and refseq=='') :
        gene_symbol='kg_'+kgname
    elif kgname=='None' or kgname=='':
        gene_symbol='sp_'+spID
    elif spID=='None' or spID=='':
        gene_symbol='rs_'+refseq
    elif refseq=='None' or refseq=='':
        # No usable accession at all: synthesize a unique placeholder name.
        gene_symbol='NoID_'+str(iterNoName)
        iterNoName+=1
    gene_symbol=renameFileName(gene_symbol)
    # Deduplicate symbols across calls: later duplicates get a 'dupN_' prefix.
    if (gene_symbol in allSymbol.keys()):
        allSymbol[gene_symbol]+=1
        gene_symbol='dup'+str(allSymbol[gene_symbol])+'_'+gene_symbol
    else:
        allSymbol[gene_symbol]=0
    name_file.close()
    with open(annotPath+"/consName.txt",'w') as consNameFIle:
        consNameFIle.write(gene_symbol)
    #os.system('mv '+rootedDir.results+"/"+ID+".name.tab"+' '+rootedDir.results+"/"+gene_symbol+'-'+ID+".name.tab")
    writedb(" ".join(query_uniprot),annotPath+"/uniprot.tab",['geneSymbol','start','end','class','type','OutOfRange'])
    writedb(" ".join(query_kegg),annotPath+"/kegg.tab",['geneSymbol','mapID','description'])
    writedb(" ".join(query_go),annotPath+"/go.tab",['geneSymbol','goId','name','type'])
    writedb(" ".join(query_alias),annotPath+"/alias.txt",['alias'])
    return gene_symbol
#start program
#parse args
modeList=args.mode.split(',')
### Create directories and listing...
# kgPathDict maps each kgID (UCSC known-gene accession) to its result folder.
kgPathDict=dict()
if 'create' in modeList:
    alnFileName=os.path.abspath(args.al)
    rootedDir=RootDir(args.out,batch=True)
    rootedDir.logs.writeArgs(args)
    # iter and create and print ref.fa
    # Match reference-species first-exon fasta headers; groups 1+2 rebuild the
    # kgID (accession + version suffix).
    rehg19Exon1=re.compile('^>(uc\w*)(\.\d*)_'+args.ref+'_1_(\d*) (\d*) (\d*) (\d*)\s?(\w+:\d+-\d+[+-]{1};?)*$')
    exons_file=open(alnFileName,'r')
    # NOTE(review): exons_file is never closed -- harmless for a script that
    # exits right after, but a 'with' block would be cleaner.
    for line in exons_file.readlines():
        m=rehg19Exon1.match(line.rstrip())
        if m:
            if args.v:
                out_logger('create - Create_dir '+line)
            kgID=m.group(1)+m.group(2)
            path=rootedDir.results+"/"+kgID
            kgPathDict[kgID]=path
            mkdirp(path)
    # Persist the kgID -> path mapping so later modes can reload it.
    with open(rootedDir.reports+'/kgPathDictFile.tab','w') as kgPathDictFile:
        for key in kgPathDict.keys():
            kgPathDictFile.write(key+"\t"+kgPathDict[key]+"\n")
    saveRoot(rootedDir)
else:
    # Any other mode: reload the directory structure produced by 'create'.
    if args.debug:
        rootedDir=RootDir(args.out,batch=True)
        rootedDir.logs.writeArgs(args)
    else:
        rootedDir=loadRoot(args.out)
    kgPathDict=dict()
    with open(rootedDir.reports+'/kgPathDictFile.tab','r') as kgPathDictFile:
        for line in kgPathDictFile.readlines():
            key,value=line.rstrip().split("\t")
            kgPathDict[key]=value
# get CDS
#########################
###### Explanation ######
# All this workflow is based on a file from ucsc which contains all sequences matching any hg19 knowngene. \
# However this file has a major problem: no gaps have been considered in hg19 sequences. Meaning that \
# only deletions are considered in other species and insertions are not reported in this file.
# Fortunately there is one fasta entry per exon per species and the entry name gives the genomic location \
# of each exon in the corresponding species' genome.
# The next lines of code focus on parsing this file for genomic coordinates, aiming at using Picard \
# to extract sequences from the genomes.
#
# In addition to this first issue, 5 cases are possible with those coordinates:
# 1- The coordinates are complete, fragmented or not, and consistent, meaning that start<end; thus \
# those locations are appended to a bed file that will finally be used to produce a cds.fa file.
# 2- The coordinates are missing; then a folder is created with a '---' sequence in a cds.fa file.
# 3- The coordinates are not consistent (start>end) ... well I suppose the outer sequences should be \
# considered. Due to lack of time this feature has not been implemented yet and original sequences are \
# added in a cds.fa file (with potential insertions missing...)
# 4- The species is the reference (hg19); thus original sequences from the input file have been used.
# 5- In place of the genomic location the mention "mapped" is present. This means that exons have been \
# assembled using mapping then platypus; thus original sequences have been used.
if 'getCDS' in modeList:
    alnFileName=os.path.abspath(args.al)
    # specDict maps species name -> reference assembly fasta path (-spec file).
    specDict=dict()
    with open(args.spec,'r') as species_file:
        for line in species_file.readlines():
            if line.rstrip()!='':
                if args.v:
                    out_logger('getCDS-Pasing -spec argument : line :'+line.rstrip())
                specie,assembly=line.rstrip().split("\t")
                specDict[specie]=assembly
    # Register java/picard wrappers (with version-probe commands) in rootedDir.
    rootedDir.addCommand('java','java -XX:ParallelGCThreads=1','java -version 2>&1 | xargs')
    picard_cmd=rootedDir.getCommand("java",options={'-jar':args.picard})
    rootedDir.addCommand("picard",picard_cmd,picard_cmd+' CheckFingerprint --version 2>&1 | sed \'s/(.*$//g\'')
    # Header regex: kgID, species, exon number, then optional region list
    # (group 9) -- absent for missing exons, or the literal 'mapped'.
    reID=re.compile('^(uc\w*)(\.\d*)_(\w*)_(\d*)_(\d*) (\d*) (\d*) (\d*)\s*(\w+.*)?$')
    fasta_sequences = SeqIO.parse(open(alnFileName),'fasta')
    cmdList=list()
    iterBatch=1
    Nlim=int(args.batchLim)
    serialBatch=rootedDir.addSerializer("gwAlign_getCDS",Nlim,queue=args.queue,ncpus='1',mem="6gb",workdir=rootedDir.results,verbose=args.v)
    init=True
    # step 1 - iteration over exon file
    # NOTE(review): this re-parses the whole fasta a second time just to count
    # entries -- expensive on large inputs.
    total_entry=len(SeqIO.to_dict(SeqIO.parse(open(alnFileName),'fasta')).keys())
    if args.v : out_logger("getCDS - "+str(total_entry)+" fasta entry to parse.")
    count=0
    def getCDS(cmdList):
        """Flush the accumulated commands for the gene in module-global
        oldkgID: build per-species picard ExtractSequences jobs from any
        pending region.list.tmp files and hand the batch to the serializer."""
        global args
        global kgPathDict
        global specDict
        global rootedDir
        if args.v:
            out_logger("getCDS - Getting CDS of "+oldkgID)
        oldkgPath=kgPathDict[oldkgID]
        for key in specDict.keys():
            oldRegFileName=oldkgPath+'/'+key+'/region.list'
            #TODO si regionlist exist
            if args.v:
                out_logger("getCDS - Getting CDS of "+oldkgID+" --- Focus on species: "+key)
            if os.path.isfile(oldRegFileName+'.tmp'):
                getRefOpt={
                    'R':specDict[key],
                    'O':oldkgPath+'/'+key+'/cds.fa',
                    'INTERVAL_LIST':oldRegFileName,
                    'VERBOSITY':'ERROR',
                    'QUIET':'true'
                }
                # Prepend the assembly .dict header to make a valid interval
                # list, run picard, then clean up the interval file.
                cmdList.append('cat '+specDict[key].rstrip('fa').rstrip('fasta').rstrip('fna')+'dict '+oldRegFileName+'.tmp'+' > '+oldRegFileName+' ; rm '+oldRegFileName+'.tmp')
                cmdList.append(rootedDir.getCommand("picard",options=getRefOpt,subprogram='ExtractSequences',sep='='))
                cmdList.append('rm '+oldRegFileName)
        if args.v: out_logger("getCDS - Getting CDS of "+oldkgID+" --> serialize")
        rootedDir.serialize("gwAlign_getCDS",cmdList)
    # Entries are assumed grouped by kgID: flush whenever the gene changes.
    for fasta in fasta_sequences:
        count+=1
        # step 1.2 - get name and sequence
        name, sequence = fasta.description, str(fasta.seq)
        m=reID.match(name)
        if m and (m.group(3) in specDict.keys()):
            spec=m.group(3)
            kgID=m.group(1)+m.group(2)
            if args.v :
                out_logger("getCDS parsing input fasta file for id="+kgID+"\tspecies="+spec+"\texon="+m.group(4))
            if init:
                oldkgID=kgID
                init=False
            if kgID!=oldkgID:
                getCDS(cmdList)
                cmdList=list()
                oldkgID=kgID
            mkdirp(kgPathDict[kgID]+'/'+spec)
            if m.group(9)==None:
                # case 2: no coordinates -> placeholder '---' sequence
                mkdirp(kgPathDict[kgID]+'/'+spec+'/missingExon'+m.group(4))
                cmdList.append('echo ">'+'exon_'+m.group(4)+"\n"+'---" > '+kgPathDict[kgID]+'/'+spec+'/missingExon'+m.group(4)+'/cds.fa')
            elif m.group(9)=='mapped':
                # case 5: exon assembled by mapping -> keep original sequence
                cmdList.append('echo ">'+'exon_'+m.group(4)+"\n"+sequence+'" >>'+kgPathDict[kgID]+'/'+spec+'/cds.fa')
            else:
                regionList=m.group(9).split(';')
                regFileName=kgPathDict[kgID]+'/'+spec+'/region.list'
                numExt=0
                for region in regionList:
                    numExt+=1
                    chromosome,positions=region.split(':')
                    start,end=positions[:-1].split('-')
                    strand=positions[-1]
                    if int(start)>int(end): # In case of complex regions no time is available for me to deal corretly with that so for now the sequence from ucsc is trusted... meaning that some insertion might be missed !
                        #TODO deal better with those complex regions...
                        mkdirp(kgPathDict[kgID]+'/'+spec+'/rearrangedExon'+m.group(4)+'_ext'+str(numExt))
                        cmdList.append('echo ">'+'exon_'+m.group(4)+'_ext'+str(numExt)+"\n"+sequence+'" >>'+kgPathDict[kgID]+'/'+spec+'/rearrangedExon'+m.group(4)+'_ext'+str(numExt)+'/cds.fa')
                    else:
                        # case 1: consistent coordinates -> append to interval file
                        with open(regFileName+'.tmp','a') as regFile:
                            regFile.write("\t".join([chromosome,start,end,strand,'exon_'+m.group(4)+'_ext'+str(numExt)])+"\n")
    # Flush the commands accumulated for the last gene.
    if len(cmdList)!=0:
        getCDS(cmdList)
    rootedDir.finishSerializer("gwAlign_getCDS")
elif 'align' in modeList : # no align just after getCDS. need time to finish
    # -mems: lm = macse start memory, rm = PBS reservation, hm = retry memory.
    low_mem,res_mem,high_mem=args.mems.split(",")
    Nlim=int(args.batchLim)
    rootedDir.addCommand("alignCDS",'alignCDS.py')
    cmdList=list()
    serialBatch=rootedDir.addSerializer("gwAlign_alignCDS",Nlim,queue=args.queue,ncpus='1',mem=res_mem+"g",workdir=rootedDir.results)
    for key in kgPathDict.keys():
        # -keep skips genes whose macse output already exists (resume support).
        if not (args.keep and os.path.isfile(kgPathDict[key]+'/ref_macse_AA.fasta')):
            if args.v : out_logger("align : missing alignment for "+key)
            optAlign={
                '-gene_dir':kgPathDict[key],
                '-ref':args.ref,
                '-virt_mem':low_mem,
                '-macse':args.macse,
                '-boost_mem':high_mem
            }
            cmdList.append(rootedDir.getCommand("alignCDS",kgPathDict[key],optAlign))
            rootedDir.serialize("gwAlign_alignCDS",cmdList)
            cmdList=list()
    rootedDir.finishSerializer("gwAlign_alignCDS")
# add elif to re-alignBigCDS with reporting ...
elif 'compute_TCS' in modeList:
    rootedDir.addCommand("t_coffee","t_coffee","t_coffee -version | sed 's/PROGRAM: T-COFFEE //g'")
    Nlim=int(args.batchLim)
    rootedDir.addSerializer("gwAlign_compute_TCS",Nlim,queue=args.queue,ncpus='2',mem="2gb",workdir=rootedDir.results)
    for key in kgPathDict.keys():
        mkdirp(kgPathDict[key]+'/tcs_eval')
        # -keep skips genes whose TCS output already exists (resume support).
        if not (args.keep and os.path.isfile(kgPathDict[key]+'/tcs_eval/translated_dna.score_ascii')):
            # Symlink the macse NT alignment, translate it, then score it.
            cmdList=["ln -sf "+kgPathDict[key]+"/ref_macse_NT.fasta "+kgPathDict[key]+'/tcs_eval/dna.fa']
            optTranslate={
                '-action':'+translate',
                '-in':'dna.fa',
                '-output':"fasta_aln"
            }
            posTranslate=["> translated_dna.fa"]
            cmdList.append(rootedDir.getCommand("t_coffee",options=optTranslate,subprogram="-other_pg seq_reformat",positionals=posTranslate,wd=kgPathDict[key]+'/tcs_eval'))
            # TCS evaluation of the translated alignment (score_ascii output).
            optTCS={
                '-infile':'translated_dna.fa',
                '-evaluate':'',
                '-output':"score_ascii",
                '-master':args.ref,
                '-n_core':'2'
            }
            cmdList.append(rootedDir.getCommand("t_coffee",options=optTCS,wd=kgPathDict[key]+'/tcs_eval'))
            rootedDir.serialize("gwAlign_compute_TCS",cmdList)
    rootedDir.finishSerializer("gwAlign_compute_TCS")
elif 'blast_search' in modeList:
    if args.v : out_logger(" Entering mode blast_search")
    # -blast_db is mandatory for this mode only, so it is checked here.
    if args.blast_db == None:
        err_logger(" blast_search - argument -blast_db ")
        sys.exit(1)
    blast_db=os.path.abspath(args.blast_db)
    rootedDir.addCommand("blastp","blastp","blastp -version | grep 'blastp: ' | sed 's/blastp: //g'")
    rootedDir.addCommand("t_coffee","t_coffee","t_coffee -version | sed 's/PROGRAM: T-COFFEE //g'")
    Nlim=int(args.batchLim)
    rootedDir.addSerializer("gwAlign_blast_search",Nlim,queue=args.queue,ncpus='1',mem="2gb",workdir=rootedDir.results)
    for key in kgPathDict.keys():
        cmdList=list()
        # Translate the reference CDS to protein before blasting.
        optTranslate={
            '-action':'+translate',
            '-in':'ref.fa',
            '-output':"fasta_aln"
        }
        posTranslate=["> ref_tr.fa"]
        cmdList.append(rootedDir.getCommand("t_coffee",options=optTranslate,subprogram="-other_pg seq_reformat",positionals=posTranslate,wd=kgPathDict[key]))
        # blastp against Trembl+Sprot; XML (outfmt 5) parsed later by NCBIXML.
        optBlast={
            '-query':'ref_tr.fa',
            '-db':blast_db,
            '-use_sw_tback':'',
            '-seg':'no',
            '-outfmt':'5',
            '-max_hsps_per_subject':'1',
            '-max_target_seqs':'5'
        }
        posBlast=["> ref_blast.xml"]
        cmdList.append(rootedDir.getCommand("blastp",options=optBlast,positionals=posBlast,wd=kgPathDict[key]))
        rootedDir.serialize("gwAlign_blast_search",cmdList)
    rootedDir.finishSerializer("gwAlign_blast_search")
elif 'stat_filter' in modeList:
    if args.v : out_logger(" Entering mode stat_filter")
    # -phase_filter holds two TCS thresholds: consensus min, per-sequence min.
    tcs_min_cons,tcs_min_each=args.phase_filter.split(',')
    cpu=int(args.cpu)
    # Optional comma-separated subset of species on which the conservation
    # ratio is re-applied (None = no focus list).
    if args.focus!=None:
        focusList=args.focus.split(',')
    else:
        focusList=None
    selectRatio=float(args.ratio)
  424. def intscore(char):
  425. if char=='-':
  426. value=-1 # assign -1 score for gaps
  427. elif char=='#':
  428. value=-1
  429. else:
  430. value=int(char)
  431. return(value)
  432. class score:
  433. """
  434. @summary: Parse t_coffee score files (ascii format)
  435. @ivar means : dictionary of mean score (ints) with seqname as keys
  436. @type means : dict(<string:int>)
  437. @ivar seqs : dictionary of seq score (ints) with seqname as keys
  438. @type seqs : dict(<string:int Array>)
  439. @ivar mean_cons: mean score for consensus
  440. @type mean_cons: int
  441. @ivar mean_seq: seq score for consensus
  442. @type mean_seq: int array
  443. """
  444. def __init__(self,filename):
  445. import re
  446. re_score=re.compile('^SCORE=([\d]+)$')
  447. re_species=re.compile('^([\S]+)[\s]+:[\s]+([\d]+)$')
  448. re_seq=re.compile('^([\S]+)[\s]+([\-\d]+)$')
  449. self.means=dict() # dictionary of mean score (ints)
  450. self.seqs =dict() # dictionary of score ascii seq (int arrays)
  451. with open(filename,"r") as filescore:
  452. for line in filescore:
  453. line=line.rstrip()
  454. m=re_score.match(line)
  455. if m:
  456. self.score=intscore(m.group(1))
  457. else:
  458. m=re_species.match(line)
  459. if m:
  460. if m.group(1)!="cons":
  461. self.means[m.group(1)]=intscore(m.group(2))
  462. self.seqs[m.group(1)]=[]
  463. else:
  464. self.mean_cons=intscore(m.group(2))
  465. self.seq_cons=[]
  466. else:
  467. m=re_seq.match(line)
  468. if m:
  469. if m.group(1)!="cons":
  470. self.seqs[m.group(1)]+=[intscore(x) for x in list(m.group(2))]
  471. else:
  472. self.seq_cons+=[intscore(x) for x in list(m.group(2))]
  473. def tcs_filter_and_stats(in_gene_dir,in_ref=args.ref,in_name=args.name,in_tcs_min_cons=tcs_min_cons,in_tcs_min_each=tcs_min_each,in_out_filtered="tcs_filtered_aln.fa",in_out_conserved_bed="codon_conserved.bed",in_out_exon="nucl_exon.bed",in_tcs_score="./tcs_eval/translated_dna.score_ascii",in_dna="./tcs_eval/dna.fa",in_focusList=focusList,in_ratio=selectRatio):
  474. """
  475. parser.add_argument('-gene_dir', metavar='/path', required=False,default='.', help="gene ID directory")
  476. parser.add_argument('-ref', metavar='ref_species' , required=False, help="name of the reference species",default='hg19')
  477. parser.add_argument('-name', metavar='name', required=False,default="filtered_default", help="name of phasing-filtering")
  478. parser.add_argument('-tcs_min_cons', metavar='N', required=False,default=5, help="minimum of TCS quality 1-9 for consensus seq")
  479. parser.add_argument('-tcs_min_each', metavar='N', required=False,default=3, help="minimum of TCS quality 1-9 for each seq")
  480. parser.add_argument('-out_stat', metavar='name', required=False,default="stat.txt", help="name for stat file (one in parent and another in filtered directory)")
  481. parser.add_argument('-out_filtered', metavar='name', required=False,default="tcs_filtered_aln.fa", help="filtered fasta file")
  482. parser.add_argument('-out_conserved_bed', metavar='name', required=False,default="codon_conserved.bed", help="bed file for conserved regions")
  483. parser.add_argument('-out_exon', metavar='name', required=False,default="nucl_exon.bed", help="bed file for exonic intervall on ref sequence")
  484. parser.add_argument('-tcs_score', metavar='filename', required=False,default="./tcs_eval/translated_dna.score_ascii", help="score_ascii TCS file (based on translated dna) (t_coffee)")
  485. parser.add_argument('-dna', metavar='filename', required=False,default="./tcs_eval/dna.fa", help="score_ascii TCS file (t_coffee)")
  486. parser.add_argument('-v',required=False,action='store_true',help="verbose")
  487. """
  488. import itertools
  489. import os
  490. import re
  491. import sys
  492. import traceback
  493. from Bio import SeqIO
  494. from Bio.Seq import Seq
  495. from upype import mkdirp
  496. from Bio.SubsMat import MatrixInfo
  497. allowedAA=['C','S','T','P','A','G','N','D','E','Q','H','R','K','M','I','L','V','F','Y','W']
  498. def score_matrix(pair):
  499. if pair not in MatrixInfo.blosum62:
  500. return MatrixInfo.blosum62[(tuple(reversed(pair)))]
  501. else:
  502. return MatrixInfo.blosum62[pair]
  503. def score_spec(nucl_ref,nucl_spec): # compute blossum62 score
  504. nucl_ref=nucl_ref.upper()
  505. nucl_spec=nucl_spec.upper()
  506. if not nucl_spec in allowedAA:
  507. minScore=100
  508. for AA in allowedAA:
  509. if score_matrix((nucl_ref,AA))<minScore:
  510. minScore=score_matrix((nucl_ref,AA))
  511. return(minScore)
  512. else:
  513. return(score_matrix((nucl_ref,nucl_spec)))
  514. def getAAFromCodingAlignedString(alignedString,posFirstBase):
  515. codon_spec=str(alignedString)[posFirstBase:posFirstBase+3]
  516. fs_gap=codon_spec.count("!")+codon_spec.count("-")
  517. if fs_gap > 0:
  518. aa_spec='-'
  519. else:
  520. aa_spec=Seq(codon_spec).translate().__str__().upper()
  521. return(aa_spec)
  522. os.chdir(in_gene_dir)
  523. re_title=re.compile('.*(tr|sp)\|(.*)\|(.*)_[\S]+ ([^=]*)( [^=]+=*)')
  524. err=""
  525. war=""
  526. msg=""
  527. maxscore=None
  528. from Bio.Blast import NCBIXML
  529. result=open("ref_blast.xml","r")
  530. records= NCBIXML.parse(result)
  531. item=next(records)
  532. q_len=float(item.query_length)
  533. subject_id=None
  534. for alignment in item.alignments:
  535. s_len=float(alignment.length)
  536. for hsp in alignment.hsps:
  537. q_cov=float(hsp.align_length)/q_len
  538. s_cov=float(hsp.align_length)/s_len
  539. if q_cov>0.9 and s_cov>0.9:
  540. m=re_title.match(alignment.title)
  541. if not m:
  542. err="A blast database entry title {"+alignment.title+"} is not recognized for: "+in_gene_dir.rstrip('/').split('/')[-1]
  543. return({"out":msg,"err":err})
  544. #gene_name=m.group(3)
  545. #long_name=m.group(4)
  546. if hsp.score>maxscore or maxscore==None:
  547. subject_id=m.group(2)
  548. query_aln=hsp.query
  549. subject_aln=hsp.sbjct
  550. maxscore=hsp.score
  551. subject_start=hsp.sbjct_start # all next 1-based
  552. subject_end=hsp.sbjct_end
  553. query_start=hsp.query_start
  554. query_end=hsp.query_end
  555. elif hsp.score==maxscore:
  556. db_type=m.group(1) # tr or sp
  557. if db_type=='sp':
  558. subject_id=m.group(2)
  559. query_aln=hsp.query
  560. subject_aln=hsp.sbjct
  561. subject_start=hsp.sbjct_start # all next 1-based
  562. subject_end=hsp.sbjct_end
  563. query_start=hsp.query_start
  564. query_end=hsp.query_end
  565. outFiles=dict()
  566. try:
  567. feat_dict={
  568. "INIT_MET":[],
  569. "DISULFID":[],
  570. "CROSSLNK":[],
  571. "ACT_SITE":[],
  572. "METAL":[],
  573. "BINDING":[]
  574. }
  575. if subject_id!=None:
  576. blast_res=subject_id+"\t"+str(maxscore)
  577. i=query_start # sould be first ref pos in the hit
  578. prot_uniprot2ref=[0]*subject_start # 1-based with a subject start set
  579. for j in range(0,len(query_aln)):
  580. aa_query=query_aln[j]
  581. aa_subject=subject_aln[j]
  582. if ((aa_subject!='-')and(aa_subject!='!')and(aa_subject!='?')):
  583. if ((aa_query!='-')and(aa_query!='!')and(aa_query!='?')):
  584. prot_uniprot2ref.append(i)
  585. i+=1
  586. else:
  587. prot_uniprot2ref.append(-1) # joker to ignore
  588. elif ((aa_query!='-')and(aa_query!='!')and(aa_query!='?')) :
  589. i+=1
  590. import requests,json
  591. from requests.exceptions import SSLError
  592. requestURL = "https://www.ebi.ac.uk/proteins/api/features/"+subject_id+"?types=INIT_MET%2CDISULFID%2CCROSSLNK%2CACT_SITE%2CMETAL%2CBINDING"
  593. ssl_issue=True
  594. while ssl_issue:
  595. try:
  596. r = requests.get(requestURL, headers={ "Accept" : "application/json"})
  597. ssl_issue=False
  598. except SSLError as e:
  599. ssl_issue=True
  600. except Exception as e:
  601. ssl_issue=False
  602. raise e
  603. if not r.ok:
  604. err=" EBI API request failed for: "+in_gene_dir.rstrip('/').split('/')[-1]+"; please check the following url: "+requestURL
  605. return({"out":msg,"err":err})
  606. uniprot_dict=json.loads(r.text)
  607. features = uniprot_dict["features"]
  608. uniprot_seq=uniprot_dict["sequence"] # to check wether aa is well aligned in human without fs
  609. #DISULFID COORD
  610. # begin of disulfid is the first cystein in a 1-based
  611. # end of disulfid is the second cystein in a 1-based
  612. #CROSSLNK COORD
  613. # IDEM for intrachain
  614. # start=end in 1-based for inter-chain
  615. #Site COORD are 1-based
  616. for feat in features:
  617. try:
  618. if int(feat["begin"]) >= subject_start and int(feat["begin"]) <= subject_end:
  619. feat_dict[feat["type"]].append(int(feat["begin"]))
  620. if feat["begin"]!=feat["end"] :
  621. if int(feat["end"]) >= subject_start and int(feat["end"]) <= subject_end :
  622. feat_dict[feat["type"]].append(int(feat["end"]))
  623. except Exception, e:
  624. pass
  625. else:
  626. blast_res="None\t-1"
  627. out_filtered=in_gene_dir+"/"+in_name+"/"+in_out_filtered
  628. out_conserved_bed=in_gene_dir+"/"+in_name+"/"+in_out_conserved_bed
  629. out_exon=in_gene_dir+"/"+in_name+"/"+in_out_exon
  630. out_stat=in_gene_dir+"/"+in_name+"/stat_raw.txt"
  631. out_stat_filtered=in_gene_dir+"/"+in_name+"/stat_filtered.txt"
  632. discard_path=in_gene_dir+"/"+in_name+"/discard.txt"
  633. blast_hit_path=in_gene_dir+"/"+in_name+"/blast.txt"
  634. outFiles[blast_hit_path]=blast_res+"\n"
  635. alnScore=score(in_tcs_score)
  636. record_dict = SeqIO.to_dict(SeqIO.parse(in_dna, "fasta"))
  637. conserved_dict=dict()
  638. tcs_min_cons=int(in_tcs_min_cons)
  639. tcs_min_each=int(in_tcs_min_each)
  640. dna_ref2aln=[0] # 1-based
  641. i=1
  642. for nucl in str(record_dict[in_ref].seq):
  643. if ((nucl!='-')and(nucl!='!')and(nucl!='?')):
  644. dna_ref2aln.append(i)
  645. i+=1
  646. conservedArray=[]
  647. for i in range(0,len(alnScore.seq_cons)):
  648. if alnScore.seq_cons[i]>=tcs_min_cons:
  649. conservedArray+=[i]
  650. if alnScore.score==0:
  651. msg="stat_filter DISCARD see : "+os.path.abspath(discard_path)
  652. outFiles[discard_path]="MSA have been discarded because the computed TCS score is 0. It could mean that only the reference is there."+"\n"
  653. elif float(len(alnScore.means.keys()))/float(len(record_dict.keys())) < in_ratio:
  654. msg="stat_filter DISCARD see : "+os.path.abspath(discard_path)
  655. outFiles[discard_path]="MSA have been discarded because the ratio between conserved species (at least partially) and considered species is inferior to the specified ratio threshold : "+str(in_ratio)+"\n"
  656. elif len(conservedArray)==0:
  657. msg="stat_filter DISCARD see : "+os.path.abspath(discard_path)
  658. outFiles[discard_path]="MSA have been discarded because no columns were above the TCS threshold :"+str(tcs_min_cons)+"\n"
  659. elif in_focusList!=None:
  660. total=float(len(in_focusList))
  661. conserved=0
  662. for species in in_focusList:
  663. if not species in record_dict.keys():
  664. raise ValueError('One of species name specified in -focus argument does not exists')
  665. elif species in alnScore.means.keys():
  666. conserved+=1
  667. if float(conserved)/total<in_ratio:
  668. msg="stat_filter DISCARD see : "+os.path.abspath(discard_path)
  669. outFiles[discard_path]="MSA have been discarded because the ratio between conserved species (at least partially) and considered species specified in -focus argument is inferior to the specified ratio threshold : "+str(in_ratio)+"\n"
  670. if not discard_path in outFiles.keys() :
  671. outFiles[out_stat]=""
  672. outFiles[out_stat_filtered]=""
  673. outFiles[out_filtered]=""
  674. raw_scores={
  675. "INIT_MET":dict(),
  676. "DISULFID":dict(),
  677. "CROSSLNK":dict(),
  678. "ACT_SITE":dict(),
  679. "METAL":dict(),
  680. "BINDING":dict()
  681. }
  682. filtered_scores={
  683. "INIT_MET":dict(),
  684. "DISULFID":dict(),
  685. "CROSSLNK":dict(),
  686. "ACT_SITE":dict(),
  687. "METAL":dict(),
  688. "BINDING":dict()
  689. }
  690. header_feats="\t".join(raw_scores.keys())
  691. outFiles[out_stat_filtered]+="\t".join(["species","N","gap","fs"])+"\t"+header_feats+"\n"
  692. outFiles[out_stat]+="\t".join(["species","N","gap","fs","end_stop","before_end_stop","mean_score"])+"\t"+header_feats+"\n"
  693. for species in record_dict.keys():
  694. if not species in alnScore.means.keys():
  695. alnScore.means[species]=-1
  696. alnScore.seqs[species]=[-1]*len(alnScore.seq_cons)
  697. else:
  698. cod_seq=[str(record_dict[species].seq)[3*x:3*x+3] for x in range(0,len(str(record_dict[species].seq))/3)]
  699. conserved_dict[species]=[]
  700. outFiles[out_filtered]+=">"+species+"\n"
  701. limLine=0
  702. N=str(record_dict[species].seq).count("N")
  703. N+=str(record_dict[species].seq).count("n")
  704. fs=str(record_dict[species].seq).count("!")
  705. gap=str(record_dict[species].seq).count("-")
  706. m_qual=alnScore.means[species]
  707. # TODO Compute feature scores !
  708. feat_line=""
  709. for f_type in feat_dict.keys():
  710. # initiate score to 0
  711. raw_scores[f_type][species]=0
  712. for aa_pos_uniprot_ref in sorted(set(feat_dict[f_type])):
  713. # dna_ref2aln prot_uniprot2ref
  714. aa_uniprot=uniprot_seq[aa_pos_uniprot_ref-1].upper()
  715. aa_pos_ref=prot_uniprot2ref[aa_pos_uniprot_ref]
  716. if aa_pos_ref!=-1:
  717. aln_cod_start=dna_ref2aln[(aa_pos_ref)*3] -3 # to switch from 1-based to 0-based !
  718. aa_ref=getAAFromCodingAlignedString(str(record_dict[in_ref].seq),aln_cod_start)
  719. if aa_ref==aa_uniprot:
  720. aa_spec=getAAFromCodingAlignedString(str(record_dict[species].seq),aln_cod_start)
  721. if aa_spec!='X':
  722. raw_scores[f_type][species]+=score_spec(aa_ref,aa_spec)
  723. ### VERY USEFUL CTRL LINES
  724. if f_type=="DISULFID" and species==in_ref and str(record_dict[in_ref].seq).count('!')==0:
  725. tmp_war=""
  726. # Check if it is C ok ?
  727. if aa_uniprot!='C':
  728. tmp_war+="Uniprot sequence is not a C but a "+aa_uniprot+"\n"
  729. if aa_ref!='C' and aa_ref!='-' :
  730. tmp_war+="Ref sequence is not a C but a "+aa_ref+"\n"
  731. if tmp_war!="":
  732. tmp_war=" WARNING - DISULFID CHECKPOINT (on raw CDS) (uniprot id: "+subject_id+" [hit start="+str(subject_start)+"] at pos:"+str(aa_pos_uniprot_ref)+") failed:\n"+tmp_war
  733. war+=tmp_war
  734. ###
  735. feat_line+="\t"+str(raw_scores[f_type][species])
  736. if re.match('TAG|TAA|TGA',list(reversed(list(itertools.dropwhile(lambda x: x == '---', reversed(cod_seq)))))[-1],re.IGNORECASE):
  737. end_stop=1
  738. else:
  739. end_stop=0
  740. before_end_stop=0
  741. oldcodon=None
  742. for codon in list(reversed(list(itertools.dropwhile(lambda x: x == '---', reversed(cod_seq)))))[0:-1]:
  743. if oldcodon==None:
  744. oldcodon=codon
  745. elif ( (oldcodon+codon).count('!')==3 and re.match('TAG|TAA|TGA',re.sub('!','',oldcodon+codon),re.IGNORECASE)):
  746. before_end_stop+=1
  747. oldcodon=codon
  748. outFiles[out_stat]+="\t".join([species,str(N),str(gap),str(fs),str(end_stop),str(before_end_stop),str(m_qual)])+feat_line+"\n"
  749. for f_type in feat_dict.keys():
  750. filtered_scores[f_type][species]=0
  751. for i in range(0,len(cod_seq)):
  752. if i in conservedArray: # if the consensus is not of a good quality ==> clipping
  753. limLine+=1
  754. if alnScore.seqs[species][i]>=tcs_min_each or alnScore.seqs[species][i]==-1:
  755. outFiles[out_filtered]+=re.sub('!','-',cod_seq[i])
  756. conserved_dict[species]+=[cod_seq[i]]
  757. # warning i is aln_cod_start
  758. aln_cod_start=i*3
  759. if (aln_cod_start+1) in dna_ref2aln: # else there is a !/-/? in ref seq
  760. aa_pos_ref=dna_ref2aln.index(aln_cod_start+1)/3
  761. if subject_id!=None:
  762. if aa_pos_ref in prot_uniprot2ref:
  763. aa_pos_uniprot_ref=1+prot_uniprot2ref.index(aa_pos_ref)
  764. for f_type in feat_dict.keys():
  765. if aa_pos_uniprot_ref in feat_dict[f_type]:
  766. # dna_ref2aln prot_uniprot2ref
  767. aa_uniprot=uniprot_seq[aa_pos_uniprot_ref-1].upper()
  768. aa_ref=getAAFromCodingAlignedString(str(record_dict[in_ref].seq),aln_cod_start)
  769. if aa_ref==aa_uniprot:
  770. aa_spec=getAAFromCodingAlignedString(str(record_dict[species].seq),aln_cod_start)
  771. if aa_spec!='X':
  772. filtered_scores[f_type][species]+=score_spec(aa_ref,aa_spec)
  773. ### VERY USEFUL CTRL LINES
  774. if f_type=="DISULFID" and species==in_ref and str(record_dict[in_ref].seq).count('!')==0:
  775. tmp_war=""
  776. # Check if it is C ok ?
  777. if aa_uniprot!='C':
  778. tmp_war+="Uniprot sequence is not a C but a "+aa_uniprot+"\n"
  779. if aa_ref!='C' and aa_ref!='-' :
  780. tmp_war+="Ref sequence is not a C but a "+aa_ref+"\n"
  781. if tmp_war!="":
  782. tmp_war=" WARNING - DISULFID CHECKPOINT (on filtered CDS) (uniprot id: "+subject_id+" [hit start="+str(subject_start)+"] at pos:"+str(aa_pos_uniprot_ref)+") failed:\n"+tmp_war
  783. war+=tmp_war
  784. ###
  785. else :
  786. outFiles[out_filtered]+='NNN'
  787. conserved_dict[species]+=['NNN'] # hardmasking for badly aligned sequences
  788. filtered_feat_line=""
  789. for f_type in feat_dict.keys():
  790. filtered_feat_line+="\t"+str(filtered_scores[f_type][species])
  791. if limLine==20:
  792. outFiles[out_filtered]+="\n"
  793. limLine=0
  794. if limLine!=0 : outFiles[out_filtered]+="\n"
  795. N=''.join(conserved_dict[species]).count("N")
  796. N+=''.join(conserved_dict[species]).count("n")
  797. fs=''.join(conserved_dict[species]).count("!")
  798. gap=''.join(conserved_dict[species]).count("-")
  799. outFiles[out_stat_filtered]+="\t".join([species,str(N),str(gap),str(fs)])+filtered_feat_line+"\n"
  800. msg="stat_filter FINISHED for :"+in_gene_dir.rstrip('/').split('/')[-1]
  801. start=conservedArray[0]
  802. line="# Conserved codons (in codon/amino acid index) with aligned sequence as reference (counting gaps)"
  803. outFiles[out_conserved_bed]=""
  804. old_i=start
  805. for i in conservedArray[1:]:
  806. if i-1!=old_i:
  807. line=in_ref+"\t"+str(start)+"\t"+str(old_i+1) # 0-based IN OUT
  808. outFiles[out_conserved_bed]+=line+"\n"
  809. start=i
  810. old_i=i
  811. line=in_ref+"\t"+str(start)+"\t"+str(old_i+1) # 0-based IN OUT
  812. outFiles[out_conserved_bed]+=line+"\n"
  813. i=0
  814. line="# exons (in nucleotid index) with unaligned sequence as reference (not counting gaps)"
  815. newInterval=True
  816. start=0
  817. outFiles[out_exon]=""
  818. with open("exons.pos") as exonFile:
  819. for exonline in exonFile.readlines():
  820. exonline=exonline.rstrip()
  821. if newInterval:
  822. outFiles[out_exon]+=line+"\n"
  823. old_i=i
  824. numExon=exonline
  825. newInterval=False
  826. else:
  827. if exonline!=numExon:
  828. newInterval=True
  829. line=in_ref+"\t"+str(start)+"\t"+str(old_i+1) # 0-based IN OUT
  830. start=i
  831. else:
  832. old_i=i
  833. i+=1
  834. if not newInterval:
  835. line=in_ref+"\t"+str(start)+"\t"+str(old_i+1) # 0-based IN OUT
  836. outFiles[out_exon]+=line+"\n"
  837. except Exception, e:
  838. exc_type, exc_value, exc_tb = sys.exc_info()
  839. err="\n".join(traceback.format_exception(exc_type, exc_value, exc_tb))
  840. if err!="":
  841. err="stat_filter: ERROR failed job for :"+in_gene_dir.rstrip('/').split('/')[-1]+"\n"+err
  842. if war!="":
  843. war="stat_filter: WARNING handled by job for :"+in_gene_dir.rstrip('/').split('/')[-1]+"\n"+war
  844. return({"msg":msg,"outFiles":outFiles,"err":err,"dir":in_gene_dir+"/"+in_name,"war":war})
  845. from multiprocessing import Pool
  846. pool=Pool(processes=cpu)
  847. jobs=[]
  848. for kgID in kgPathDict.keys():
  849. if args.v:
  850. out_logger("stat_filter: initiate job for "+kgID+" in "+kgPathDict[kgID])
  851. jobs.append(pool.apply_async(tcs_filter_and_stats,args=(kgPathDict[kgID],)))
  852. pool.close()
  853. while len(jobs)!=0:
  854. for i in range(0,min(cpu+1,len(jobs))):
  855. if jobs[i].ready():
  856. outjob=jobs[i].get(5)
  857. if outjob["err"]=="":
  858. if args.v : out_logger(outjob["msg"])
  859. if outjob["war"] !="" : out_logger(outjob["war"])
  860. job_work_dir=outjob["dir"]
  861. if os.path.isdir(job_work_dir):
  862. import shutil
  863. shutil.rmtree(job_work_dir)
  864. if args.v : out_logger(" stat_filter - manager - creating folder "+job_work_dir)
  865. os.makedirs(job_work_dir)
  866. for file_path in outjob["outFiles"].keys():
  867. if args.v : out_logger(" stat_filter - manager - writing "+file_path)
  868. with open(file_path,"w") as con:
  869. con.write(outjob["outFiles"][file_path])
  870. else:
  871. err_logger(outjob["err"])
  872. #sys.exit(1)
  873. del jobs[i]
  874. if args.v : out_logger("stat_filter -- "+str(len(jobs))+" jobs left")
  875. break
  876. pool.join()
  877. elif 'report' in modeList:
  878. os.chdir(rootedDir.reports)
  879. rmdFile=search_lib('gwAlign_Unify_report.rmd')
  880. if os.path.isfile(rootedDir.reports+"/report.rmd"):
  881. os.remove(rootedDir.reports+"/report.rmd")
  882. from shutil import copyfile
  883. copyfile(rmdFile, rootedDir.reports+"/report.rmd")
  884. if os.path.isfile(rootedDir.reports+"/report.html"):
  885. os.remove(rootedDir.reports+"/report.html")
  886. submit('Rscript -e "rmarkdown::render(\''+rootedDir.reports+"/report.rmd"+'\',params = list(name = \''+args.name+'\') )"')
  887. ### Annotation
  888. if 'annotate' in modeList:
  889. allSymbol=dict()
  890. iterNoName=1
  891. cnx = mysql.connector.connect(user='genome',host=args.host,port=args.port,database='hg19')
  892. for key in kgPathDict.keys():
  893. gene_name=Get_annotation(key)
  894. cnx.close()
  895. with open(rootedDir.reports+'/all_withCounts.txt','w') as logFile:
  896. for key in allSymbol:
  897. logFile.write(key+"\t"+str(allSymbol[key])+"\n")
  898. with open(rootedDir.reports+'/duplicate.txt','w') as logFile:
  899. for key in allSymbol:
  900. if allSymbol[key]>0:
  901. logFile.write(key+"\n")
  902. submit('cd '+rootedDir.path+' ; for kgID in $(ls results) ; do echo -e "$kgID\t$(cat results/$kgID/annotation/consName.txt)" ; done > reports/consName.tab')
  903. submit('cd '+rootedDir.reports+' ; for path in $(awk \'{ print $2 }\' kgPathDictFile.tab); do cat $path/annotation/consName.txt | sed -r "s/$/\t$(echo $path | sed \'s/\//\\\//g\')\n/g" ; done > consNameDict.tab')
  904. dirList=os.listdir(rootedDir.results)
  905. goDict=dict()
  906. goList=list()
  907. headGO=['goId','name','type','genes']
  908. keggDict=dict()
  909. keggList=list()
  910. headKegg=['mapId','name','genes']
  911. keggFileName=rootedDir.reports+'/kegg.tab'
  912. goFileName=rootedDir.reports+'/go.tab'
  913. for dirName in dirList:
  914. with open(rootedDir.results+'/'+dirName+'/annotation/consName.txt') as consNameFile:
  915. gene=consNameFile.readline().rstrip()
  916. File=open(rootedDir.results+'/'+dirName+'/annotation/go.tab','r')
  917. File.readline()
  918. for line in File.readlines():
  919. line=line.rstrip()
  920. lineList=line.split("\t")
  921. goId=lineList[1]
  922. name=lineList[2]
  923. gotype=lineList[3]
  924. if goId in goDict.keys():
  925. goDict[goId].append(gene)
  926. else:
  927. goDict[goId]=[gene]
  928. goList.append([goId,name,gotype])
  929. File.close()
  930. File=open(rootedDir.results+'/'+dirName+'/annotation/kegg.tab','r')
  931. File.readline()
  932. for line in File.readlines():
  933. line=line.rstrip()
  934. lineList=line.split("\t")
  935. mapId=lineList[1]
  936. name=lineList[2]
  937. if mapId in keggDict.keys():
  938. keggDict[mapId].append(gene)
  939. else:
  940. keggList.append([mapId,name])
  941. keggDict[mapId]=[gene]
  942. File.close()
  943. keggFile=open(keggFileName,'w')
  944. keggFile.write("\t".join(headKegg)+"\n")
  945. goFile=open(goFileName,'w')
  946. goFile.write("\t".join(headGO)+"\n")
  947. for lineList in keggList:
  948. mapId=lineList[0]
  949. keggFile.write("\t".join(lineList)+"\t"+','.join(keggDict[mapId])+"\n")
  950. for lineList in goList:
  951. goId=lineList[0]
  952. goFile.write("\t".join(lineList)+"\t"+','.join(goDict[goId])+"\n")
  953. goFile.close()
  954. keggFile.close()
  955. ## end of annotation
  956. saveRoot(rootedDir)
  957. sys.exit(0)