Unverified Commit bae42edd authored by Alokito's avatar Alokito Committed by GitHub
Browse files

Merge pull request #17 from Novartis/annotations

Support for Annotations
parents f41db75a cc09ac3b
......@@ -33,12 +33,12 @@ class BackendCache:
contents = self.entry_list
return [c.port for c in contents]
def check_entry(self, dataset):
def check_entry(self, key):
contents = self.entry_list
matches = [
c
for c in contents
if c.dataset == dataset and c.status != "terminated"
if c.key.dataset == key.dataset and c.key.annotation_file == key.annotation_file and c.status != "terminated"
]
if len(matches) == 0:
......@@ -51,13 +51,14 @@ class BackendCache:
"Found " + str(len(matches)) + " for " + dataset,
)
def create_entry(self, dataset, file_path, scripts):
def create_entry(self, key, scripts):
port = 8000
existing_ports = self.get_ports()
while (port in existing_ports) or is_port_in_use(port):
port += 1
entry = CacheEntry.for_dataset(dataset, file_path, port)
entry = CacheEntry.for_key(key, port)
background_thread = Thread(
target=process_backend.launch,
......
......@@ -8,20 +8,21 @@
# the specific language governing permissions and limitations under the License.
import psutil
import logging
import datetime
from flask import make_response, request
from flask import make_response, request, render_template
from requests import get, post, put
from cellxgene_gateway import env
from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.util import current_time_stamp
from cellxgene_gateway.flask_util import querystring
class CacheEntry:
def __init__(
self,
pid,
dataset,
file_path,
key,
port,
launchtime,
timestamp,
......@@ -32,8 +33,7 @@ class CacheEntry:
http_status,
):
self.pid = pid
self.dataset = dataset
self.file_path = file_path
self.key = key
self.port = port
self.launchtime = launchtime
self.timestamp = timestamp
......@@ -44,11 +44,11 @@ class CacheEntry:
self.http_status = http_status
@classmethod
def for_dataset(cls, dataset, file_path, port):
def for_key(cls, key, port):
return cls(
None,
dataset,
file_path,
key,
port,
current_time_stamp(),
current_time_stamp(),
......@@ -93,45 +93,62 @@ class CacheEntry:
self.status = "terminated"
def serve_content(self, path):
dataset = self.dataset
gateway_basepath = (
f"{env.external_protocol}://{env.external_host}/view/{dataset}/"
f"{env.external_protocol}://{env.external_host}/view/{self.key.pathpart}/"
)
subpath = path[len(dataset) :] # noqa: E203
subpath = path[len(self.key.pathpart) :] # noqa: E203
if len(subpath) == 0:
r = make_response(f"Redirect to {gateway_basepath}\n", 301)
r.headers["location"] = gateway_basepath
r.headers["location"] = gateway_basepath+querystring()
return r
elif self.status == "loading":
launch_time = datetime.datetime.fromtimestamp(self.launchtime)
return render_template(
"loading.html", launchtime=launch_time, all_output=self.all_output
)
port = self.port
cellxgene_basepath = f"http://127.0.0.1:{port}"
headers = {}
if "accept" in request.headers:
headers["accept"] = request.headers["accept"]
if "user-agent" in request.headers:
headers["user-agent"] = request.headers["user-agent"]
if "content-type" in request.headers:
headers["content-type"] = request.headers["content-type"]
copy_headers = [
'accept',
'accept-encoding',
'accept-language',
'cache-control',
'connection',
'content-length',
'content-type',
'cookie',
'host',
'origin',
'pragma',
'referer',
'sec-fetch-mode',
'sec-fetch-site',
'user-agent'
]
for h in copy_headers:
if h in request.headers:
headers[h] = request.headers[h]
full_path = cellxgene_basepath + subpath + querystring()
if request.method in ["GET", "HEAD", "OPTIONS"]:
cellxgene_response = get(
cellxgene_basepath + subpath, headers=headers
full_path, headers=headers
)
elif request.method == "PUT":
cellxgene_response = put(
cellxgene_basepath + subpath,
full_path,
headers=headers,
data=request.data.decode(),
data=request.data,
)
elif request.method == "POST":
cellxgene_response = post(
cellxgene_basepath + subpath,
full_path,
headers=headers,
data=request.data.decode(),
data=request.data,
)
else:
raise CellxgeneException(
......@@ -146,10 +163,15 @@ class CacheEntry:
else:
gateway_content = cellxgene_response.content
resp_headers = {}
for h in copy_headers:
if h in cellxgene_response.headers:
resp_headers[h] = cellxgene_response.headers[h]
gateway_response = make_response(
gateway_content,
cellxgene_response.status_code,
{"Content-Type": content_type},
resp_headers,
)
return gateway_response
# Copyright 2019 Novartis Institutes for BioMedical Research Inc. Licensed
# under the Apache License, Version 2.0 (the "License"); you may not use
# this file except in compliance with the License. You may obtain a copy
# of the License at http://www.apache.org/licenses/LICENSE-2.0. Unless
# required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
import os
from flask_api import status
from cellxgene_gateway import env
from cellxgene_gateway.cellxgene_exception import CellxgeneException
# There are three kinds of CacheKey:
# 1) somedir/dataset.h5ad: a dataset
# in this case, pathpart == dataset == 'somedir/dataset.h5ad'
# 2) somedir/dataset_annotations/saldaal1-T5HMVBNV.csv: an actual annotations file.
# in this case, pathpart == 'somedir/dataset_annotations/saldaal1-T5HMVBNV.csv', dataset == 'somedir/dataset.h5ad'
# 3) somedir/dataset_annotations: an annotation directory. The corresponding h5ad must exist, but the directory may not.
# in this case, pathpart == 'somedir/dataset_annotations', dataset == 'somedir/dataset.h5ad'
class CacheKey:
    """Plain holder for the three values that identify a requested view.

    Attributes:
        pathpart: the portion of the request path that selected this key.
        dataset: path of the ``.h5ad`` dataset the key refers to.
        annotation_file: annotation file associated with the dataset, as
            supplied by the caller.
    """

    def __init__(self, pathpart, dataset, annotation_file):
        # Values are stored exactly as given; nothing is validated here.
        self.pathpart, self.dataset, self.annotation_file = (
            pathpart,
            dataset,
            annotation_file,
        )
......@@ -51,45 +51,8 @@ def create_dir(parent_path, dir_name):
else:
os.mkdir(full_path)
def recurse_dir(path):
if not os.path.exists(path):
raise CellxgeneException(
"The given path does not exist.", status.HTTP_400_BAD_REQUEST
)
def make_entry(el):
full_path = os.path.join(path, el)
if os.path.isfile(full_path):
return {
"path": full_path.replace(env.cellxgene_data, ""),
"name": el,
"type": "file",
}
elif os.path.isdir(full_path):
return {
"path": full_path.replace(env.cellxgene_data, ""),
"name": el,
"type": "directory",
"children": recurse_dir(full_path),
}
else:
raise CellxgeneException(
"Given path is neither file nor directory.",
status.HTTP_400_BAD_REQUEST,
)
return [make_entry(x) for x in os.listdir(path)]
def render_entries(entries):
return "<ul>" + "\n".join([render_entry(e) for e in entries]) + "</ul>"
def render_entry(entry):
if entry["type"] == "file":
url = f"/view/{entry['path'].lstrip('/')}"
return f"<li> <a href='{ url}'>{entry['name']}</a></li>"
elif entry["type"] == "directory":
url = f"/filecrawl/{entry['path'].lstrip('/')}"
return f"<li><a href='{url}'>{entry['name']}</a>{render_entries(entry['children'])}</li>"
# Suffix that turns a dataset stem into the name of its annotations directory.
annotations_suffix = '_annotations'


def make_h5ad(el):
    """Return the dataset file name that corresponds to annotations name ``el``.

    The trailing ``len(annotations_suffix)`` characters of ``el`` are dropped
    and ``.h5ad`` is appended.  The function does not check that ``el``
    actually ends with the suffix; callers are expected to do so.
    """
    suffix_length = len(annotations_suffix)
    stem = el[:-suffix_length]
    return stem + '.h5ad'


def make_annotations(el):
    """Return the annotations directory name for dataset file name ``el``.

    The last five characters of ``el`` (the length of ``.h5ad``) are replaced
    by ``annotations_suffix``; the extension itself is not verified.
    """
    stem = el[:-5]
    return stem + annotations_suffix
......@@ -20,6 +20,7 @@ ip = os.environ.get("GATEWAY_IP")
extra_scripts = os.environ.get("GATEWAY_EXTRA_SCRIPTS")
ttl = os.environ.get("GATEWAY_TTL")
enable_upload = os.environ.get("GATEWAY_ENABLE_UPLOAD", "").lower() in ['true', '1']
enable_annotations = os.environ.get("GATEWAY_ENABLE_ANNOTATIONS", "").lower() in ['true', '1']
env_vars = {
"CELLXGENE_LOCATION": cellxgene_location,
......@@ -34,6 +35,7 @@ optional_env_vars = {
"GATEWAY_EXTRA_SCRIPTS": extra_scripts,
"GATEWAY_TTL": ttl,
"GATEWAY_ENABLE_UPLOAD": enable_upload,
"GATEWAY_ENABLE_ANNOTATIONS": enable_annotations,
}
def validate():
......
import html
import os

from flask_api import status

from cellxgene_gateway import env
from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.dir_util import make_h5ad, make_annotations, annotations_suffix
def recurse_dir(path):
    """Build a nested listing of ``path`` for the file-crawl page.

    Each directory entry becomes a dict:

    * ``.h5ad`` files  -> ``type == "file"`` plus an ``annotations`` list.
    * sub-directories that are not the annotations directory of a sibling
      ``.h5ad`` file -> ``type == "directory"`` plus recursive ``children``.
    * everything else (including annotations directories themselves)
      -> ``type == "neither"``.

    Args:
        path: directory to list.

    Returns:
        A list with one dict per name returned by ``os.listdir(path)``.

    Raises:
        CellxgeneException: (HTTP 400) when ``path`` does not exist.
    """
    if not os.path.exists(path):
        raise CellxgeneException(
            "The given path does not exist.", status.HTTP_400_BAD_REQUEST
        )

    # List the directory exactly once so that the classification sets below
    # and the returned entries are derived from the same snapshot.
    all_entries = os.listdir(path)

    def is_h5ad(el):
        return el.endswith('.h5ad') and os.path.isfile(os.path.join(path, el))

    # Sets give O(1) membership tests in make_entry.
    h5ad_entries = {x for x in all_entries if is_h5ad(x)}
    annotation_dir_entries = {
        x
        for x in all_entries
        if x.endswith(annotations_suffix) and make_h5ad(x) in h5ad_entries
    }

    def annotation_name(file_name):
        # Drop a 13-character trailer when it starts with '-' or '_'
        # (presumably the '-XXXXXXXX.csv' id suffix, e.g.
        # 'saldaal1-T5HMVBNV.csv' -> 'saldaal1'); otherwise drop '.csv'.
        if len(file_name) > 13 and file_name[-13] in ['-', '_']:
            return file_name[:-13]
        return file_name[:-4] if file_name.endswith('.csv') else file_name

    def list_annotations(el):
        full_path = os.path.join(path, el)
        if not os.path.isdir(full_path):
            # The annotations directory may not have been created yet.
            entries = []
        else:
            entries = [
                {
                    "name": annotation_name(x),
                    "path": os.path.join(full_path, x).replace(env.cellxgene_data, ""),
                }
                for x in os.listdir(full_path)
                if x.endswith('.csv') and os.path.isfile(os.path.join(full_path, x))
            ]
        # The leading 'new' pseudo-entry links to the annotations directory
        # itself rather than to an existing csv file.
        new_entry = {
            "name": 'new',
            "class": 'new',
            "path": full_path.replace(env.cellxgene_data, ""),
        }
        return [new_entry] + entries

    def make_entry(el):
        full_path = os.path.join(path, el)
        if el in h5ad_entries:
            return {
                "path": full_path.replace(env.cellxgene_data, ""),
                "name": el,
                "type": "file",
                "annotations": list_annotations(make_annotations(el)),
            }
        elif os.path.isdir(full_path) and el not in annotation_dir_entries:
            return {
                "path": full_path.replace(env.cellxgene_data, ""),
                "name": el,
                "type": "directory",
                "children": recurse_dir(full_path),
            }
        else:
            return {
                "path": full_path,
                "name": el,
                "type": "neither",
            }

    return [make_entry(x) for x in all_entries]
def render_entries(entries):
    """Render ``entries`` as an HTML ``<ul>``; items are joined by newlines."""
    items = "\n".join(render_entry(entry) for entry in entries)
    return "<ul>" + items + "</ul>"
def get_url(entry):
    """Return the ``/view/...`` URL for ``entry``, based on its ``path`` key."""
    # Leading slashes are dropped so the result never contains '/view//'.
    relative_path = entry['path'].lstrip('/')
    return "/view/" + relative_path
def get_class(entry):
    """Return a `` class='...'`` attribute string, or ``''`` if no class is set."""
    if 'class' not in entry:
        return ''
    return " class='{}'".format(entry['class'])
def render_annotations(entry):
    """Render the annotation links of a file entry as an HTML fragment.

    Args:
        entry: dict with an ``annotations`` list; each item provides
            ``name`` and ``path`` and optionally ``class``.

    Returns:
        ``' | annotations: '`` followed by comma-separated ``<a>`` elements,
        or ``''`` when there are no annotations.
    """
    annotations = entry['annotations']
    if not annotations:
        return ''
    # Names and paths come from the file system, so escape them before
    # placing them in a single-quoted attribute or in element text.
    links = [
        f"<a href='{html.escape(get_url(a), quote=True)}'{get_class(a)}>"
        f"{html.escape(a['name'])}</a>"
        for a in annotations
    ]
    return ' | annotations: ' + ", ".join(links)
def render_entry(entry):
    """Render one listing entry as an HTML ``<li>`` element.

    Args:
        entry: dict with a ``type`` key.  ``"file"`` entries link to the
            view URL and append their annotation links; ``"directory"``
            entries link to ``/filecrawl/...`` and nest their ``children``.

    Returns:
        The HTML fragment, or ``""`` for any other entry type.
    """
    kind = entry["type"]
    if kind == "file":
        # File-system derived text is escaped so that quotes or angle
        # brackets in a name cannot break out of the attribute or element.
        url = html.escape(get_url(entry), quote=True)
        name = html.escape(entry['name'])
        return f"<li> <a href='{url}'>{name}</a> {render_annotations(entry)}</li>"
    if kind == "directory":
        url = html.escape(f"/filecrawl/{entry['path'].lstrip('/')}", quote=True)
        name = html.escape(entry['name'])
        return f"<li><a href='{url}'>{name}</a>{render_entries(entry['children'])}</li>"
    # Entries of any other type are deliberately not shown.
    return ""
from flask import request
def querystring():
    """Return the current request's query string with a leading ``?``.

    Returns ``''`` when the request carries no query string.
    """
    raw = request.query_string.decode()
    if len(raw) > 0:
        return f'?{raw}'
    return ''
......@@ -8,7 +8,6 @@
# the specific language governing permissions and limitations under the License.
# import BaseHTTPServer
import datetime
import os
import logging
from threading import Thread, Lock
......@@ -17,6 +16,7 @@ import json
from flask import (
Flask,
redirect,
make_response,
render_template,
request,
send_from_directory,
......@@ -28,14 +28,23 @@ from werkzeug import secure_filename
from cellxgene_gateway import env
from cellxgene_gateway.backend_cache import BackendCache
from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.dir_util import create_dir, recurse_dir, render_entries, is_subdir
from cellxgene_gateway.dir_util import create_dir, is_subdir
from cellxgene_gateway.filecrawl import recurse_dir, render_entries
from cellxgene_gateway.extra_scripts import get_extra_scripts
from cellxgene_gateway.path_util import get_dataset, get_file_path
from cellxgene_gateway.process_exception import ProcessException
from cellxgene_gateway.prune_process_cache import PruneProcessCache
from cellxgene_gateway.util import current_time_stamp
from cellxgene_gateway.path_util import get_key
app = Flask(__name__)
def _force_https(app):
    """Wrap a WSGI callable so it always sees the configured URL scheme.

    Despite the name, the scheme written into the WSGI environ is
    ``env.external_protocol``, whatever that is set to.
    """

    def scheme_overriding_app(environ, start_response):
        # Overwrite the scheme before delegating to the wrapped application.
        environ['wsgi.url_scheme'] = env.external_protocol
        return app(environ, start_response)

    return scheme_overriding_app
app.wsgi_app = _force_https(app.wsgi_app)
cache = BackendCache()
location = f"{env.external_protocol}://{env.external_host}"
......@@ -73,7 +82,8 @@ def handle_invalid_process(error):
http_status=error.http_status,
stdout=error.stdout,
stderr=error.stderr,
dataset=error.dataset,
dataset=error.key.dataset,
annotation_file=error.key.annotation_file,
),
error.http_status,
)
......@@ -149,7 +159,7 @@ def upload_file():
"Invalid directory.", status.HTTP_400_BAD_REQUEST
)
return redirect(env.location, code=302)
return redirect(location, code=302)
if env.enable_upload:
......@@ -159,14 +169,18 @@ if env.enable_upload:
@app.route("/filecrawl.html")
def filecrawl():
entries = recurse_dir(env.cellxgene_data)
rendered_html = render_entries(entries)
return render_template(
resp = make_response(render_template(
"filecrawl.html",
extra_scripts=get_extra_scripts(),
rendered_html=rendered_html,
)
))
resp.headers["Cache-Control"] = "no-cache, no-store, must-revalidate"
resp.headers["Pragma"] = "no-cache"
resp.headers["Expires"] = "0"
resp.headers['Cache-Control'] = 'public, max-age=0'
return resp
@app.route("/filecrawl/<path:path>")
def do_filecrawl(path):
......@@ -187,23 +201,18 @@ def do_filecrawl(path):
entry_lock = Lock()
@app.route("/view/<path:path>", methods=["GET", "PUT", "POST"])
def do_view(path):
dataset = get_dataset(path)
file_path = get_file_path(dataset)
key = get_key(path)
print(f"view path={path}, dataset={key.dataset}, annotation_file= {key.annotation_file}, key={key.pathpart}")
with entry_lock:
match = cache.check_entry(dataset)
match = cache.check_entry(key)
if match is None:
uascripts = get_extra_scripts()
match = cache.create_entry(dataset, file_path, uascripts)
match = cache.create_entry(key, uascripts)
match.timestamp = current_time_stamp()
if match.status == "loaded":
if match.status == "loaded" or match.status == "loading":
return match.serve_content(path)
elif match.status == "loading":
launch_time = datetime.datetime.fromtimestamp(match.launchtime)
return render_template(
"loading.html", launchtime=launch_time, all_output=match.all_output
)
elif match.status == "error":
raise ProcessException.from_cache_entry(match)
......@@ -216,7 +225,8 @@ def do_GET_status():
def do_GET_status_json():
return json.dumps({'launchtime':app.launchtime,
'entry_list':[{
'dataset': entry.dataset,
'dataset': entry.key.dataset,
'annotation_file': entry.key.annotation_file,
'launchtime': entry.launchtime,
'last_access': entry.timestamp,
'status': entry.status
......@@ -224,16 +234,17 @@ def do_GET_status_json():
@app.route("/relaunch/<path:path>", methods=["GET"])
def do_relaunch(path):
dataset = get_dataset(path)
match = cache.check_entry(dataset)
key = get_key(path)
match = cache.check_entry(key)
if not match is None:
match.terminate()
return redirect(url_for("do_view", path=path), code=302)
qs = request.query_string.decode()
return redirect(url_for("do_view", path=path) + (f'?{qs}' if len(qs) > 0 else ''), code=302)
@app.route("/terminate/<path:path>", methods=["GET"])
def do_terminate(path):
dataset = get_dataset(path)
match = cache.check_entry(dataset)
key = get_key(path)
match = cache.check_entry(key)
if not match is None:
match.terminate()
return redirect(url_for("do_GET_status"), code=302)
......
......@@ -13,37 +13,83 @@ from flask_api import status
from cellxgene_gateway import env
from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.dir_util import make_h5ad
from cellxgene_gateway.cache_key import CacheKey
def get_dataset(path):
def get_key(path):
if path == "/" or path == "":
raise CellxgeneException(
"No matching dataset found.", status.HTTP_404_NOT_FOUND
)
trimmed = path[:-1] if path[-1] == "/" else path
try:
get_file_path(trimmed)
return trimmed
except CellxgeneException:
split = os.path.split(trimmed)
return get_dataset(split[0])
# valid paths come in three forms:
if trimmed.endswith('.h5ad') and data_file_exists(trimmed):
# 1) somedir/dataset.h5ad: a dataset
return CacheKey(trimmed, trimmed, None)
elif trimmed.endswith('.csv'):
# 2) somedir/dataset_annotations/saldaal1-T5HMVBNV.csv : an actual annotations file.
annotations_dir = os.path.split(trimmed)[0]
dataset = make_h5ad(annotations_dir)
if data_file_exists(dataset):
data_dir_ensure(annotations_dir)
return CacheKey(trimmed, dataset, trimmed)
elif trimmed.endswith('_annotations') and data_dir_exists(trimmed):
# 3) somedir/dataset_annotations: an annotation directory. The corresponding h5ad must exist, but the directory may not.
dataset = make_h5ad(trimmed)
if data_file_exists(dataset):
return CacheKey(trimmed, dataset, '')
except CellxgeneException:
pass
split = os.path.split(trimmed)
return get_key(split[0])
def validate_path(file_path):
def validate_exists(file_path):
    """Raise ``CellxgeneException`` (HTTP 400) unless ``file_path`` exists."""
    if os.path.exists(file_path):
        return
    raise CellxgeneException(
        "File does not exist: " + file_path, status.HTTP_400_BAD_REQUEST
    )
def validate_is_file(file_path):
    """Raise ``CellxgeneException`` (HTTP 400) unless ``file_path`` is a file.

    Existence is checked first, so a missing path reports
    "File does not exist" rather than "Path is not file".
    """
    validate_exists(file_path)
    if os.path.isfile(file_path):
        return
    raise CellxgeneException(
        "Path is not file: " + file_path, status.HTTP_400_BAD_REQUEST
    )
def validate_is_dir(file_path):
    """Raise ``CellxgeneException`` (HTTP 400) unless ``file_path`` is a directory.

    Existence is checked first, so a missing path reports
    "File does not exist" rather than "Path is not dir".
    """
    validate_exists(file_path)
    if os.path.isdir(file_path):
        return
    raise CellxgeneException(
        "Path is not dir: " + file_path, status.HTTP_400_BAD_REQUEST
    )
def data_file_exists(dataset):
    """Return ``True`` if ``dataset`` is a file under ``env.cellxgene_data``.

    This never returns ``False``: a missing path or a non-file raises
    ``CellxgeneException`` via ``validate_is_file`` instead.
    """
    validate_is_file(os.path.join(env.cellxgene_data, dataset))
    return True
def data_dir_exists(dataset):
    """Return ``True`` if ``dataset`` is a directory under ``env.cellxgene_data``.

    This never returns ``False``: a missing path or a non-directory raises
    ``CellxgeneException`` via ``validate_is_dir`` instead.
    """
    validate_is_dir(os.path.join(env.cellxgene_data, dataset))
    return True
def data_dir_ensure(dataset):
    """Create directory ``dataset`` under ``env.cellxgene_data`` if it is missing.

    Intermediate directories are created as needed.  Nothing is done when
    the path already exists, whatever kind of file-system object it is.
    """
    file_path = os.path.join(env.cellxgene_data, dataset)
    if not os.path.exists(file_path):
        # exist_ok tolerates another request creating the same directory
        # between the check above and this call.
        os.makedirs(file_path, exist_ok=True)
def get_file_path(dataset):
def get_file_path(key):
dataset = key.dataset
file_path = os.path.join(env.cellxgene_data, dataset)
validate_path(file_path)
validate_is_file(file_path)
return file_path
def get_annotation_file_path(key):
if key.annotation_file is None:
return None
if key.annotation_file == '':
return ''