pBRIT candidate gene prioritization

contact

Documentation: Batch Processing

pBRIT can be accessed using an API. Using the script below, it is possible to submit multiple gene sets to pBRIT at once and to fetch the results automatically when they are available. As such, pBRIT can be incorporated into third-party tools and pipelines. Right-click to download the script. An example dataset, with example commands for using the API, is available for download here.

Code
#!/usr/bin/python
# default modules
import sys
import json
import urllib
import urllib2
import getopt 
import os.path
from time import strftime,sleep
# non-default packages (install with "pip install ")
import requests
import time
import shutil

start_time = time.time()
## SET LOCAL INSTALLATIONS HERE:
pbrit_url = "http://143.169.238.105/pbrit/api/"



#######################
## MAIN PROGRAM FLOW ##
#######################
def main() :
	# parse commandline
	optlist,args = getArgs(sys.argv[1:])
	
	# check if train_genes is available
	try:
		os.path.isfile(optlist['T'])
	except:
		print("Provided Training_Genes file does not exist")
		Usage()	

	# check if test_genes is available
	test_genes = []
	if 't' in optlist:
		try:
			os.path.isfile(optlist['t'])
		except:
			print("Provided Test_Genes file does not exist")
			Usage()		
		test_genes.append(optlist['t'])
	else:
		try:
			os.path.isfile(optlist['I'])
		except:
			print("Provided Batch_Input file does not exist")
			Usage()		
		# go over files to test and check all of them.
		error = 0
		with open(optlist['I']) as f:
			for line in f:
				line = line.rstrip()
				if not os.path.isfile(line):
					print("Provided Test_Genes file does not exist: "+str(line))
					error += 1
					continue
				test_genes.append(line)
				
		if error > 0:
			print "\nSome Test_Genes files did not exist. Exiting."
			Usage()
	
	# submit jobs.
	print "Submitting",len(test_genes),"jobs";
	job_list = {}
	for test_file in test_genes:
		# sumbit the job.
		#print "test file Name :", test_file
		name = optlist['N']+'.'+os.path.basename(test_file)
		answer = requests.post(pbrit_url+"Submit", data={'method':optlist['m'], 'email':optlist['e'], \
		    'job_name':name, 'db_version':optlist['d'],'use_pheno':optlist['p']}, \
		    files={'train_file': open(optlist['T'], 'r'), 'test_file': open(test_file,'r')})
		answer = json.loads(answer.text)
		if 'ERROR' in answer:
			print "Submission failed for test-set : ",test_file
			print "  Reason : ",answer['ERROR']
			continue
		# store for follow up.
		#print answer,"\t",type(answer)
		#for keys in answer:
		#	print keys,"\t",answer[keys]
		#print answer['job_key']
		job_list[answer['job_key']] = {}	
		job_list[answer['job_key']]['status'] = 0
		job_list[answer['job_key']]['comments'] = answer['comments']
		job_list[answer['job_key']]['test_file'] = test_file 
		# sleep 1 second, because job_id == timestamp...
		sleep(1)

	print "All Jobs submitted. Waiting for jobs to finish.";
	## wait for jobs to finish.
	done = 0
	nr_submitted = len(job_list)
	nr_finished = 0
	while done == 0:
		done = 1
		for job_key in job_list:
			if job_list[job_key]['status'] != 0:
				continue
			answer = fetch_json_page(pbrit_url + 'GetStatus/'+job_key)
			if answer['status'] == 'Finished':
				nr_finished += 1
				print str(nr_finished)+"/"+str(nr_submitted),":",job_key, "finished"
				job_list[job_key]['status'] = 1
			elif answer['status'] == 'Failed':
				print job_key, "failed"
				nr_finished += 1
				job_list[job_key]['status'] = -1
			else:
				done = 0
	
	
	for job_key in job_list:
		rank_file =  job_list[job_key]['test_file']
		wh = open(rank_file+"_OUTPUT.txt","w")
		answer = fetch_json_page(pbrit_url + 'GetResults/'+job_key)
		rank_hash = answer['ranking']
	
		for keys in sorted(rank_hash,key=int):
			print >>wh,keys,"\t",rank_hash[keys]['Symbol'],"\t",rank_hash[keys]['Score']
	
		response = requests.get(pbrit_url + 'GetGlobalHeatMap/'+job_key,stream=True)
		with open(rank_file+'_OUTPUT.png', 'wb') as out_file:
			shutil.copyfileobj(response.raw, out_file)

		response = requests.get(pbrit_url + 'GetModel/'+job_key,stream=True)
                with open(rank_file+'_fm_OBJECT.dat', 'wb') as out_file:
                        shutil.copyfileobj(response.raw, out_file)




		wh.close()

			
		
def getArgs(args):  
	## arguments
	# -h : print help 
	# -T : training genes (mandatory)
	# -t : test genes for single sample
	## -I : batch input file for test genes
	# -m : method : TFIDF / TFIDF_SVD (default)
	# -O : Output file Name
	# -e : email (get notified of results by mail)
	# -N : job name prefix. this name will be suffixed by the test_file name
	# -p : Include Phenotype annotations of test genes: 1 (yes, default) or 0 (no)
	# -d : database version : defaults to newest.
	# -l : list available database versions. 
	opts, args = getopt.getopt(args, 'T:t:I:hm:e:N:p:d:l')
	optlist = dict()
	for opt, arg in opts:
		optlist[opt[1:]] = arg
	
	if 'h' in optlist:
		Usage()
	if 'l' in optlist:
		ListDbVersions()
	

	notes = ''
	errors = ''
	if 't' in optlist and 'I' in optlist:
		errors +=  "ERROR: Arguments '-t' (single sample) and '-I' (batch) are mutually exclusive\n"
	if 't' not in optlist and 'I' not in optlist:
		errors += "ERROR: Either '-t' or '-I' is mandatory\n"
	if 'T' not in optlist:
		errors += "ERROR: Argument -T is mandatory\n";
	if 'm' not in optlist:
		notes += "NOTE: Applying TFIDF_SVD datafusion (default)"
		optlist['m'] = 'TFIDF_SVD'
	if 'e' not in optlist:
		optlist['e'] = '';

	if 'N' not in optlist:
		optlist['N'] = strftime("%Y-%m-%d %H:%M:%S") 
		notes += "NOTE: No job_name provided, using default prefix: "+optlist['N']+"\n"
	
	if 'd' not in optlist :
		answer = fetch_json_page(pbrit_url + 'GetDbVersions')
		optlist['d'] = answer[0]
		notes += "NOTE: No annotation release provided. Set to most recent version: "+answer[0]+"\n"

	if 'p' not in optlist or optlist['p'] not in ['0', '1']:
		optlist['p'] = 1
		notes += "NOTE: Including test-gene phenotype annotations in regression (default)"+"\n"

	if not notes == '':
		print notes
		sleep(3)
	if not errors == '':
		print errors
		Usage()
	return(optlist,args)


def ListDbVersions():
	answer = fetch_json_page(pbrit_url + 'GetDbVersions')
	print "\nAvailable annotation releases: "
	for rel in answer:
		print "  -",rel
	print "\n"
	sys.exit();

def Usage():
	# print help
	print "\n\nUsage: python Submit_by_API.py "
	print " -h : print this help"
	print " -T : file with (T)raining genes. one HUGO/Ensembl entry per line"
	print " -t : file with (t)est genes for one sample. one HUGO/Ensembl entry per line"
	print " -I : batch input: path to test-files (-t) : one file_path per line"
	print " -m : method : TFIDF / TFIDF-SVD (default)"
	print " -e : email : optional, get notified of results"
	print " -N : job name prefix (defaults to timestamp). Will be suffixed by test_file."
	print " -p : Include Phenotype annotations of test genes: 1 (yes, default) or 0 (no)"
	print " -d : database version : defaults to newest."
	print " -l : list available database versions. "
	print "  \n  => -t and -I are mutually exclusive !"
	print " \nOutput:"
	print "    - ranked genes are printed to .ranked";
	print "    - job details are printed to .settings";
	print "\n\n"
	sys.exit(0)

def fetch_json_page(url):
	"""GET *url* and return the decoded JSON payload.

	Retries up to 10 times on any fetch/decode error (the API may be
	momentarily unavailable); exits the program when every attempt fails.
	"""
	max_tries = 10
	for attempt in range(max_tries):
		try:
			data = urllib2.urlopen(url)
			# returning here on success fixes the original bug where a
			# successful 10th attempt was still reported as a failure
			# (try_nr == 10 after the loop).
			return json.load(data)
		except Exception:
			# transient network / decode error: try again.
			continue
	print('Fetching api response failed for following url:')
	print(url)
	sys.exit(2)


if __name__ == "__main__":
	main()
	print("--- %s seconds ---" % (time.time() - start_time))

The following example command submits a set of 10 gene-sets for ranking based on a single set of training genes. All parameters are explained in the help function of the script above.

Code
 python Submit_by_API.py -T Training_set_ext_30.txt -I BatchFile.txt -m TFIDF -e my.name@email.com -p 1 -d Jan2015