Commit f0100c48 authored by Venkat Malladi's avatar Venkat Malladi

Merge branch '23-ref.to.datahub' into 'develop'

Resolve "Move references to GUDMAP/RBK"

Closes #23

See merge request !45
parents 60452a9c bfc0f444
Pipeline #8156 failed with stages
in 3 minutes and 54 seconds
before_script:
- module add python/3.6.4-anaconda
- pip install --user pytest-pythonpath==0.7.1 pytest-cov==2.5.1
- module load python/3.6.4-anaconda
- pip install --user attrs==19.1.0 pytest-pythonpath==0.7.1 pytest-cov==2.5.1 deriva==1.3.0
- module load singularity/3.5.3
- module load nextflow/20.01.0
- ln -sfn /project/BICF/BICF_Core/shared/gudmap/test_data/* ./test_data/
......@@ -9,6 +9,7 @@ before_script:
stages:
- unit
- reference
- integration
- consistency
......@@ -74,19 +75,6 @@ inferMetadata:
if [[ ${ended} == "" ]]; then exit 1; fi
- pytest -m inferMetadata
getRef:
stage: unit
only:
- push
- tags
except:
- merge_requests
script:
- mkdir -p hu
- mkdir -p mo
- cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/
- cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/
trimData:
stage: unit
only:
......@@ -211,6 +199,81 @@ outputBag:
- pytest -m outputBag
# Reference-stage job: checks that the GRCh38.p12.v31 hisat2 reference can be
# copied from the BICF_Core shared location into a local ./hu/ directory.
# Runs on pushes and tags; excluded for merge requests.
humanBioHPC:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
- mkdir -p hu
# A failing cp (e.g. missing source directory) fails the job.
- cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./hu/
# Reference-stage job: copies a hisat2 reference directory from the BICF_Core
# shared location into a local ./mo/ directory.
# Runs on pushes and tags; excluded for merge requests.
mouseBioHPC:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
- mkdir -p mo
# NOTE(review): the source path below is GRCh38.p12.v31 (the human reference),
# although the destination is ./mo/. A mouse reference (presumably
# GRCm38.p6.vM22) looks intended - confirm the directory name before changing.
- cp -R /project/BICF/BICF_Core/shared/gudmap/references/GRCh38.p12.v31/hisat2 ./mo/
# Reference-stage job: looks up the GRCh38.p12.v31 reference record on
# dev.gudmap.org and checks that the file it points to is listed in hatrac.
# Runs on pushes and tags; excluded for merge requests.
humanDataHub:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
# Point the bdbag/deriva cookie file at the test credentials.
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=dev.gudmap.org
- refName=GRCh
- refHuVersion=38.p12.v31
- references=$(echo ${referenceBase}/${refName}${refHuVersion})
# Split "GRCh38.p12.v31" on '.' into GRCv=GRCh38, GRCp=p12, GENCODE=v31.
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
# ERMrest entity query filtered on Reference_Version and Annotation_Version.
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
# extractRefData.py reads refQuery.json from the working directory.
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL)
- loc=$(dirname ${refURL})
# NOTE(review): the right-hand side is quoted, so this is a literal string
# comparison against "/hatrac/*" (no glob matching); the branch only fires when
# loc is exactly that text. Confirm what condition was intended.
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
# Keep the part of the URL basename that precedes its last ':'.
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
# List the hatrac directory and fail if the expected file name is not in it.
- test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
# Reference-stage job: looks up the GRCm38.p6.vM22 reference record on
# dev.gudmap.org and checks that the file it points to is listed in hatrac.
# Runs on pushes and tags; excluded for merge requests.
# NOTE(review): the job name "mousenDataHub" looks like a typo for
# "mouseDataHub" - confirm nothing references the current name before renaming.
mousenDataHub:
stage: reference
only:
- push
- tags
except:
- merge_requests
script:
# Point the bdbag/deriva cookie file at the test credentials.
- ln -sfn `readlink -e ./test_data/auth/cookies.txt` ~/.bdbag/deriva-cookies.txt
- referenceBase=dev.gudmap.org
- refName=GRCm
# Despite the "Hu" in its name, this variable holds the mouse version here.
- refHuVersion=38.p6.vM22
- references=$(echo ${referenceBase}/${refName}${refHuVersion})
# Split "GRCm38.p6.vM22" on '.' into GRCv=GRCm38, GRCp=p6, GENCODE=vM22.
- GRCv=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f1)
- GRCp=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f2)
- GENCODE=$(echo ${references} | grep -o ${refName}.* | cut -d '.' -f3)
# ERMrest entity query filtered on Reference_Version and Annotation_Version.
- query=$(echo 'https://'${referenceBase}'/ermrest/catalog/2/entity/RNASeq:Reference_Genome/Reference_Version='${GRCv}'.'${GRCp}'/Annotation_Version=GENCODE%20'${GENCODE})
# extractRefData.py reads refQuery.json from the working directory.
- curl --request GET ${query} > refQuery.json
- refURL=$(python ./workflow/scripts/extractRefData.py --returnParam URL)
- loc=$(dirname ${refURL})
# NOTE(review): the right-hand side is quoted, so this is a literal string
# comparison against "/hatrac/*" (no glob matching); the branch only fires when
# loc is exactly that text. Confirm what condition was intended.
- if [ "${loc}" = "/hatrac/*" ]; then echo "reference not present in hatrac"; exit 1; fi
# Keep the part of the URL basename that precedes its last ':'.
- filename=$(echo $(basename ${refURL}) | grep -oP '.*(?=:)')
# List the hatrac directory and fail if the expected file name is not in it.
- test=$(singularity run 'docker://gudmaprbk/deriva1.3:1.0.0' deriva-hatrac-cli --host ${referenceBase} ls ${loc}/)
- test=$(echo ${test} | grep -o ${filename})
- if [ "${test}" == "" ]; then echo "reference file not present"; exit 1; fi
integration_se:
stage: integration
only: [merge_requests]
......
# v0.0.4 (in development)
**User Facing**
*
* Add option to pull references from datahub
**Background**
*
* Remove (comment out) option to pull references from S3
* Make pull references from BioHPC default (including in biohpc.config)
* Start using new gudmaprbk dockerhub (images autobuilt)
*Known Bugs*
* Datahub reference pull uses dev.gudmap.org as source until references are placed on production
* outputBag does not contain fetch for processed data
* Does not include automatic data upload
* Override params (inputBag, fastq, species) aren't checked for integrity
......
......@@ -49,6 +49,9 @@ To Run:
* *current mouse* **38.p6.vM22** = GRCm38.p6 with GENCODE annotation release M22
* *current human* **38.p12.v31** = GRCh38.p12 with GENCODE annotation release 31
* ***Optional*** input overrides
* `--refSource` source for pulling references
* **biohpc** = source references from BICF_Core gudmap reference local location (workflow must be run on BioHPC system)
* **datahub** = source references from GUDMAP/RBK reference_table location (currently uses dev.gudmap.org)
* `--inputBagForce` utilizes a local replicate inputBag instead of downloading from the data-hub (still requires accurate repRID input)
* eg: `--inputBagForce test_data/bag/Replicate_Q-Y5F6.zip` (must be the expected bag structure)
* `--fastqsForce` utilizes local fastq's instead of downloading from the data-hub (still requires accurate repRID input)
......
docs/dag.png

693 KB | W: | H:

docs/dag.png

769 KB | W: | H:

docs/dag.png
docs/dag.png
docs/dag.png
docs/dag.png
  • 2-up
  • Swipe
  • Onion skin
params {
refSource = "aws"
}
workDir = 's3://gudmap-rbk.output/work'
aws.client.storageEncryption = 'AES256'
aws {
......
params {
refSource = "biohpc"
}
process {
executor = 'slurm'
queue = 'super'
......
......@@ -26,13 +26,13 @@ process {
container = 'bicf/gudmaprbkfilexfer:2.0.1_indev'
}
withName: parseMetadata {
container = 'bicf/python3:2.0.1_indev'
container = 'gudmaprbk/python3:1.0.0'
}
withName: trimData {
container = 'bicf/trimgalore:1.1'
}
withName: getRefInfer {
container = 'bicf/awscli:1.1'
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName: downsampleData {
container = 'bicf/seqtk:2.0.1_indev'
......@@ -44,7 +44,7 @@ process {
container = 'bicf/rseqc3.0:2.0.1_indev'
}
withName: getRef {
container = 'bicf/awscli:1.1'
container = 'gudmaprbk/deriva1.3:1.0.0'
}
withName: alignData {
container = 'bicf/gudmaprbkaligner:2.0.1_indev'
......
This diff is collapsed.
#!/usr/bin/env python3
import argparse
import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
    """Parse the command-line arguments.

    Returns:
        argparse.Namespace with a single attribute, ``returnParam``: the
        name of the reference field to print (``URL``, ``fName`` or ``MD5``).
    """
    parser = argparse.ArgumentParser()
    # Restrict to the values main() knows how to handle, so a mistyped option
    # fails with a usage error instead of silently producing no output.
    parser.add_argument('-r', '--returnParam',
                        choices=['URL', 'fName', 'MD5'],
                        help="The parameter to return (URL, fName or MD5).",
                        required=True)
    args = parser.parse_args()
    return args
def main():
    """Print one field of the single reference record in ``refQuery.json``.

    Reads ``refQuery.json`` from the current working directory and prints
    the value selected by ``--returnParam``: ``URL`` -> ``File_URL``,
    ``fName`` -> ``File_Name``, ``MD5`` -> ``File_MD5``.

    Raises:
        Exception: if the query result holds no reference, holds more than
            one reference, or ``--returnParam`` is not a supported value.
    """
    args = get_args()

    # Column that holds the value for each supported --returnParam option.
    columns = {"URL": "File_URL", "fName": "File_Name", "MD5": "File_MD5"}

    refQuery = pd.read_json("refQuery.json")

    # An empty query result has no columns at all, so guard the lookup
    # instead of failing with a KeyError.
    if "File_URL" in refQuery.columns:
        found = refQuery["File_URL"].count()
    else:
        found = 0

    if found == 0:
        raise Exception("No references found")
    if found > 1:
        raise Exception("Multiple references found: \n%s" %
                        refQuery["RID"])

    # Fail loudly on an unsupported option rather than printing nothing,
    # which would leave callers capturing stdout with an empty value.
    if args.returnParam not in columns:
        raise Exception("Unsupported returnParam: %s" % args.returnParam)

    print(refQuery[columns[args.returnParam]].values[0])
if __name__ == '__main__':
main()
......@@ -5,52 +5,61 @@ import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
parser.add_argument('-m', '--metaFile',help="The metadata file to extract.",required=True)
parser.add_argument('-p', '--parameter',help="The parameter to extract.",required=True)
parser.add_argument(
'-r', '--repRID', help="The replicate RID.", required=True)
parser.add_argument('-m', '--metaFile',
help="The metadata file to extract.", required=True)
parser.add_argument('-p', '--parameter',
help="The parameter to extract.", required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
metaFile = pd.read_csv(args.metaFile,sep=",",header=0)
metaFile = pd.read_csv(args.metaFile, sep=",", header=0)
# Check replicate RID metadata from 'File.csv'
if (args.parameter == "repRID"):
if (len(metaFile.Replicate_RID.unique()) > 1):
print("There are multiple replicate RID's in the metadata: " + " ".join(metaFile.Replicate_RID.unique()))
print("There are multiple replicate RID's in the metadata: " +
" ".join(metaFile.Replicate_RID.unique()))
exit(1)
if not (metaFile.Replicate_RID.unique() == args.repRID):
print("Replicate RID in metadata does not match run parameters: " + metaFile.Replicate_RID.unique() + " vs " + args.repRID)
print("Replicate RID in metadata does not match run parameters: " +
metaFile.Replicate_RID.unique() + " vs " + args.repRID)
exit(1)
else:
rep=metaFile["Replicate_RID"].unique()[0]
rep = metaFile["Replicate_RID"].unique()[0]
print(rep)
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) > 2):
print("There are more then 2 fastq's in the metadata: " + " ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
print("There are more then 2 fastq's in the metadata: " +
" ".join(metaFile[metaFile["File_Type"] == "FastQ"].RID))
exit(1)
# Check experiment RID metadata from 'Experiment.csv'
if (args.parameter == "expRID"):
if (len(metaFile.Experiment_RID.unique()) > 1):
print("There are multiple experoment RID's in the metadata: " + " ".join(metaFile.Experiment_RID.unique()))
print("There are multiple experoment RID's in the metadata: " +
" ".join(metaFile.Experiment_RID.unique()))
exit(1)
else:
exp=metaFile["Experiment_RID"].unique()[0]
exp = metaFile["Experiment_RID"].unique()[0]
print(exp)
# Check study RID metadata from 'Experiment.csv'
if (args.parameter == "studyRID"):
if (len(metaFile.Study_RID.unique()) > 1):
print("There are multiple study RID's in the metadata: " + " ".join(metaFile.Study_RID.unique()))
print("There are multiple study RID's in the metadata: " +
" ".join(metaFile.Study_RID.unique()))
exit(1)
else:
study=metaFile["Study_RID"].unique()[0]
study = metaFile["Study_RID"].unique()[0]
print(study)
# Get endedness metadata from 'Experiment Settings.csv'
if (args.parameter == "endsMeta"):
if (metaFile.Paired_End.unique() == "Single End"):
......@@ -60,7 +69,7 @@ def main():
else:
endsMeta = "uk"
print(endsMeta)
# Manually get endness count from 'File.csv'
if (args.parameter == "endsManual"):
if (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 1):
......@@ -68,7 +77,7 @@ def main():
elif (len(metaFile[metaFile["File_Type"] == "FastQ"]) == 2):
endsManual = "pe"
print(endsManual)
# Get strandedness metadata from 'Experiment Settings.csv'
if (args.parameter == "stranded"):
if (metaFile.Has_Strand_Specific_Information.unique() == "yes"):
......@@ -76,10 +85,11 @@ def main():
elif (metaFile.Has_Strand_Specific_Information.unique() == "no"):
stranded = "unstranded"
else:
print("Stranded metadata not match expected options: " + metaFile.Has_Strand_Specific_Information.unique())
print("Stranded metadata not match expected options: " +
metaFile.Has_Strand_Specific_Information.unique())
exit(1)
print(stranded)
# Get spike-in metadata from 'Experiment Settings.csv'
if (args.parameter == "spike"):
if (metaFile.Used_Spike_Ins.unique() == "yes"):
......@@ -87,7 +97,8 @@ def main():
elif (metaFile.Used_Spike_Ins.unique() == "no"):
spike = "no"
else:
print("Spike-ins metadata not match expected options: " + metaFile.Used_Spike_Ins.unique())
print("Spike-ins metadata not match expected options: " +
metaFile.Used_Spike_Ins.unique())
exit(1)
print(spike)
......@@ -98,7 +109,8 @@ def main():
elif (metaFile.Species.unique() == "Homo sapiens"):
species = "Homo sapiens"
else:
print("Species metadata not match expected options: " + metaFile.Species.unique())
print("Species metadata not match expected options: " +
metaFile.Species.unique())
exit(1)
print(species)
......@@ -107,5 +119,6 @@ def main():
readLength = metaFile.Read_Length.unique()
print(str(readLength).strip('[]'))
if __name__ == '__main__':
main()
......@@ -5,20 +5,25 @@ import pandas as pd
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-s', '--studyRID',help="The study RID.",required=True)
parser.add_argument('-s', '--studyRID',
help="The study RID.", required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
studyRID=pd.read_json(args.studyRID+"_studyRID.json")
studyRID = pd.read_json(args.studyRID+"_studyRID.json")
if studyRID["RID"].count() > 0:
studyRID["RID"].to_csv(args.studyRID+"_studyRID.csv",header=False,index=False)
studyRID["RID"].to_csv(
args.studyRID+"_studyRID.csv", header=False, index=False)
else:
raise Exception("No associated replicates found: %s" %
studyRID)
studyRID)
if __name__ == '__main__':
main()
......@@ -6,38 +6,47 @@ import numpy as np
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('-r', '--repRID',help="The replicate RID.",required=True)
parser.add_argument(
'-r', '--repRID', help="The replicate RID.", required=True)
args = parser.parse_args()
return args
def main():
args = get_args()
tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls',sep="\t",header=0)
hist = pd.cut(tin['TIN'],bins=pd.interval_range(start=0,freq=10,end=100,closed='right')).value_counts(sort=False)
tin = pd.read_csv(args.repRID + '.sorted.deduped.tin.xls',
sep="\t", header=0)
hist = pd.cut(tin['TIN'], bins=pd.interval_range(
start=0, freq=10, end=100, closed='right')).value_counts(sort=False)
labels = ["{0} - {1}".format(i, i + 9) for i in range(1, 100, 10)]
#labels[0] = '0 - 10'
binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'],range(0,105,10),labels=labels,include_lowest=False,right=True))
binned['chrom'] = binned['chrom'] = binned['chrom'].replace('chr1','chr01')
binned['chrom'] = binned['chrom'].replace('chr2','chr02')
binned['chrom'] = binned['chrom'].replace('chr3','chr03')
binned['chrom'] = binned['chrom'].replace('chr4','chr04')
binned['chrom'] = binned['chrom'].replace('chr5','chr05')
binned['chrom'] = binned['chrom'].replace('chr6','chr06')
binned['chrom'] = binned['chrom'].replace('chr7','chr07')
binned['chrom'] = binned['chrom'].replace('chr8','chr08')
binned['chrom'] = binned['chrom'].replace('chr9','chr09')
hist = pd.pivot_table(binned, values='geneID', index = 'Bins', columns = 'chrom', aggfunc=np.size)
binned = tin.assign(Bins=lambda x: pd.cut(tin['TIN'], range(
0, 105, 10), labels=labels, include_lowest=False, right=True))
binned['chrom'] = binned['chrom'] = binned['chrom'].replace(
'chr1', 'chr01')
binned['chrom'] = binned['chrom'].replace('chr2', 'chr02')
binned['chrom'] = binned['chrom'].replace('chr3', 'chr03')
binned['chrom'] = binned['chrom'].replace('chr4', 'chr04')
binned['chrom'] = binned['chrom'].replace('chr5', 'chr05')
binned['chrom'] = binned['chrom'].replace('chr6', 'chr06')
binned['chrom'] = binned['chrom'].replace('chr7', 'chr07')
binned['chrom'] = binned['chrom'].replace('chr8', 'chr08')
binned['chrom'] = binned['chrom'].replace('chr9', 'chr09')
hist = pd.pivot_table(binned, values='geneID',
index='Bins', columns='chrom', aggfunc=np.size)
hist['TOTAL'] = hist.sum(axis=1)
hist = hist[['TOTAL'] + [ i for i in hist.columns if i != 'TOTAL']]
hist = hist[['TOTAL'] + [i for i in hist.columns if i != 'TOTAL']]
hist = hist.T.fillna(0.0).astype(int)
#hist = hist.apply(lambda x: x/x.sum()*100, axis=1)
hist.to_csv(args.repRID + '.tin.hist.tsv',sep='\t')
medFile = open(args.repRID + '.tin.med.csv',"w")
medFile.write(str(round(tin['TIN'][(tin['TIN']!=0)].median(),2)))
hist.to_csv(args.repRID + '.tin.hist.tsv', sep='\t')
medFile = open(args.repRID + '.tin.med.csv', "w")
medFile.write(str(round(tin['TIN'][(tin['TIN'] != 0)].median(), 2)))
medFile.close()
if __name__ == '__main__':
main()
#!/usr/bin/env python3
#
# * --------------------------------------------------------------------------
# * Licensed under MIT (https://git.biohpc.swmed.edu/BICF/Astrocyte/chipseq_analysis/LICENSE.md)
# * --------------------------------------------------------------------------
#
'''General utilities.'''
import shlex
import logging
import subprocess
......@@ -32,7 +23,8 @@ def run_pipe(steps, outfile=None):
if n == first_step_n:
if n == last_step_n and outfile: # one-step pipeline with outfile
with open(outfile, 'w') as fh:
print("one step shlex: %s to file: %s" % (shlex.split(step), outfile))
print("one step shlex: %s to file: %s" %
(shlex.split(step), outfile))
p = Popen(shlex.split(step), stdout=fh)
break
print("first step shlex to stdout: %s" % (shlex.split(step)))
......@@ -40,12 +32,14 @@ def run_pipe(steps, outfile=None):
p = Popen(shlex.split(step), stdout=PIPE)
elif n == last_step_n and outfile: # only treat the last step specially if you're sending stdout to a file
with open(outfile, 'w') as fh:
print("last step shlex: %s to file: %s" % (shlex.split(step), outfile))
print("last step shlex: %s to file: %s" %
(shlex.split(step), outfile))
p_last = Popen(shlex.split(step), stdin=p.stdout, stdout=fh)
p.stdout.close()
p = p_last
else: # handles intermediate steps and, in the case of a pipe to stdout, the last step
print("intermediate step %d shlex to stdout: %s" % (n, shlex.split(step)))
print("intermediate step %d shlex to stdout: %s" %
(n, shlex.split(step)))
p_next = Popen(shlex.split(step), stdin=p.stdout, stdout=PIPE)
p.stdout.close()
p = p_next
......@@ -54,7 +48,8 @@ def run_pipe(steps, outfile=None):
def block_on(command):
process = subprocess.Popen(shlex.split(command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
process = subprocess.Popen(shlex.split(
command), stderr=subprocess.STDOUT, stdout=subprocess.PIPE)
for line in iter(process.stdout.readline, b''):
sys.stdout.write(line.decode('utf-8'))
process.communicate()
......@@ -77,7 +72,7 @@ def count_lines(filename):
"compress",
"bzip2",
"gzip"
]
]
mime_type = mimetypes.guess_type(filename)[1]
if mime_type in compressed_mimetypes:
catcommand = 'gzip -dc'
......@@ -86,7 +81,7 @@ def count_lines(filename):
out, err = run_pipe([
'%s %s' % (catcommand, filename),
'wc -l'
])
])
return int(out)
......
......@@ -6,18 +6,24 @@ import os
import utils
data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
'/../../'
@pytest.mark.alignData
def test_alignData_se():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.unal.gz'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai'))
assert os.path.exists(os.path.join(
data_output_path, 'Q-Y5F6_1M.se.unal.gz'))
assert os.path.exists(os.path.join(
data_output_path, 'Q-Y5F6_1M.se.sorted.bam'))
assert os.path.exists(os.path.join(
data_output_path, 'Q-Y5F6_1M.se.sorted.bam.bai'))
@pytest.mark.alignData
def test_alignData_pe():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.unal.gz'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai'))
assert os.path.exists(os.path.join(
data_output_path, 'Q-Y5F6_1M.pe.unal.gz'))
assert os.path.exists(os.path.join(
data_output_path, 'Q-Y5F6_1M.pe.sorted.bam'))
assert os.path.exists(os.path.join(
data_output_path, 'Q-Y5F6_1M.pe.sorted.bam.bai'))
......@@ -6,19 +6,24 @@ from io import StringIO
import os
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
'/../../'
@pytest.mark.consistencySE
def test_consistencySE():
assert os.path.exists(os.path.join(test_output_path, 'SE_multiqc_data.json'))
assert readAssigned("assignedSE.txt","assignedExpectSE.txt")
assert os.path.exists(os.path.join(
test_output_path, 'SE_multiqc_data.json'))
assert readAssigned("assignedSE.txt", "assignedExpectSE.txt")
@pytest.mark.consistencyPE
def test_consistencyPE():
assert os.path.exists(os.path.join(test_output_path, 'PE_multiqc_data.json'))
assert readAssigned("assignedPE.txt","assignedExpectPE.txt")
assert os.path.exists(os.path.join(
test_output_path, 'PE_multiqc_data.json'))
assert readAssigned("assignedPE.txt", "assignedExpectPE.txt")
def readAssigned(fileAssigned,fileExpectAssigned):
def readAssigned(fileAssigned, fileExpectAssigned):
data = False
assigned = open(fileAssigned, "r")
expect = open(fileExpectAssigned, "r")
......
......@@ -6,12 +6,16 @@ from io import StringIO
import os
test_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
'/../../'
@pytest.mark.dataQC
def test_dataQC():
assert os.path.exists(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
assert countLines(os.path.join(test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
assert os.path.exists(os.path.join(
test_output_path, 'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
assert countLines(os.path.join(test_output_path,
'Q-Y5F6_1M.se.sorted.deduped.tin.xls'))
def countLines(fileName):
data = False
......
......@@ -6,16 +6,24 @@ import os
import utils
data_output_path = os.path.dirname(os.path.abspath(__file__)) + \
'/../../'
'/../../'
@pytest.mark.dedupData
def test_dedupData():
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.bam.bai'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr8.bam.bai'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chr4.bam.bai'))
assert os.path.exists(os.path.join(data_output_path, 'Q-Y5F6_1M.se.sorted.deduped.chrY.bam'))