Unverified Commit 638923fb authored by Alokito's avatar Alokito Committed by GitHub
Browse files

Merge pull request #44 from Novartis/itemsource

Itemsource
parents 5404a8b0 169da934
......@@ -22,7 +22,7 @@ jobs:
pip install black
- name: Run black
run: |
black -l 79 . --check
black . --check
# This job is copied over from `deploy.yaml`
run-tests:
runs-on: ubuntu-18.04
......
......@@ -59,7 +59,11 @@ cellxgene-gateway
Here's what the environment variables mean:
* `CELLXGENE_LOCATION` - the location of the cellxgene executable, e.g. `~/anaconda2/envs/cellxgene/bin/cellxgene`
At least one of the following is required:
* `CELLXGENE_DATA` - a directory that can contain subdirectories with `.h5ad` data files, *without* trailing slash, e.g. `/mnt/cellxgene_data`
* `CELLXGENE_BUCKET` - an s3 bucket that can contain keys with `.h5ad` data files, e.g. `my-cellxgene-data-bucket`
Cellxgene Gateway is designed to make it easy to add additional data sources, please see the source code for gateway.py and the ItemSource interface in items/item_source.py
Optional environment variables:
* `CELLXGENE_ARGS` - catch-all variable that can be used to pass additional command line args to cellxgene server
......@@ -68,7 +72,6 @@ Optional environment variables:
* `GATEWAY_IP` - ip addess of instance gateway is running on, mostly used to display SSH instructions. Defaults to `socket.gethostbyname(socket.gethostname())`
* `GATEWAY_PORT` - local port that the gateway should bind to, defaults to 5005
* `GATEWAY_EXTRA_SCRIPTS` - JSON array of script paths, will be embedded into each page and forwarded with `--scripts` to cellxgene server
* `GATEWAY_ENABLE_UPLOAD` - Set to `true` or `1` to enable HTTP uploads. This is not recommended for a public server.
* `GATEWAY_ENABLE_ANNOTATIONS` - Set to `true` or to `1` to enable cellxgene annotations.
* `GATEWAY_ENABLE_BACKED_MODE` - Set to `true` or to `1` to load AnnData in file-backed mode. This saves memory and speeds up launch time but may reduce overall performance.
......@@ -142,9 +145,8 @@ pre-commit install
pip install isort flake8 black
```bash
isort -rc .
flake8 .
black -l 79 .
isort -rc . # rc means recursive, and was deprecated in dev version of isort
black .
```
# Getting Help
......
......@@ -9,11 +9,13 @@
import time
from threading import Thread
from typing import List
from flask_api import status
from cellxgene_gateway import env
from cellxgene_gateway.cache_entry import CacheEntry, CacheEntryStatus
from cellxgene_gateway.cache_key import CacheKey
from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.subprocess_backend import SubprocessBackend
......@@ -35,13 +37,13 @@ class BackendCache:
contents = self.entry_list
return [c.port for c in contents]
def check_entry(self, key):
def check_path(self, source, path):
contents = self.entry_list
matches = [
c
for c in contents
if c.key.dataset == key.dataset
and c.key.annotation_file == key.annotation_file
if c.key.source.name == source.name
and path.startswith(c.key.descriptor)
and c.status != CacheEntryStatus.terminated
]
......@@ -52,10 +54,28 @@ class BackendCache:
else:
raise CellxgeneException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
"Found " + str(len(matches)) + " for " + dataset,
"Found " + str(len(matches)) + " for " + path,
)
def check_entry(self, key):
contents = self.entry_list
matches = [
c
for c in contents
if c.key.equals(key) and c.status != CacheEntryStatus.terminated
]
if len(matches) == 0:
return None
elif len(matches) == 1:
return matches[0]
else:
raise CellxgeneException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
"Found " + str(len(matches)) + " for " + key.dataset,
)
def create_entry(self, key, scripts):
def create_entry(self, key: CacheKey, scripts: List[str]):
port = 8000
existing_ports = self.get_ports()
......
......@@ -9,11 +9,11 @@
import datetime
import logging
import re
import urllib.parse
from enum import Enum
import psutil
from flask import make_response, render_template, request
from flask.helpers import url_for
from flask.wrappers import Response
from requests import get, post, put
......@@ -71,6 +71,10 @@ class CacheEntry:
None,
)
@property
def source_name(self):
return self.key.source_name
def set_loaded(self, pid):
self.pid = pid
self.status = CacheEntryStatus.loaded
......@@ -100,9 +104,13 @@ class CacheEntry:
for child in children:
child.terminate()
psutil.wait_procs(children, callback=on_terminate)
terminated.append(p.pid)
p.terminate()
psutil.wait_procs([p], callback=on_terminate)
# the parent process may automatically die once its children have --
try:
p.terminate()
psutil.wait_procs([p], callback=on_terminate)
except psutil.NoSuchProcess:
pass
logging.getLogger("cellxgene_gateway").info(f"terminated {terminated}")
self.status = CacheEntryStatus.terminated
......@@ -111,24 +119,20 @@ class CacheEntry:
gateway_content = (
re.sub(
'(="|\()/static/',
f"\\1{self.gateway_basepath()}static/",
f"\\1{self.key.gateway_basepath()}static/",
cellxgene_content,
)
.replace("http://fonts.gstatic.com", "https://fonts.gstatic.com")
.replace(self.cellxgene_basepath(), self.gateway_basepath())
.replace(self.cellxgene_basepath(), self.key.gateway_basepath())
)
return gateway_content
def gateway_basepath(self):
return url_for("do_view", path=self.key.pathpart) + "/"
def cellxgene_basepath(self):
return f"http://127.0.0.1:{self.port}"
def serve_content(self, path):
gateway_basepath = self.gateway_basepath()
subpath = path[len(self.key.pathpart) :] # noqa: E203
gateway_basepath = self.key.gateway_basepath()
subpath = path[len(self.key.descriptor) :] # noqa: E203
if len(subpath) == 0:
r = make_response(f"Redirect to {gateway_basepath}\n", 302)
r.headers["location"] = gateway_basepath + querystring()
......@@ -199,5 +203,4 @@ class CacheEntry:
cellxgene_response.status_code,
resp_headers,
)
return gateway_response
......@@ -9,15 +9,73 @@
# There are three kinds of CacheKey:
# 1) somedir/dataset.h5ad: a dataset
# in this case, pathpart == dataset == 'somedir/dataset.h5ad'
# 2) somedir/dataset_annotations/my_annotations.csv : an actual annotaitons file.
# in this case, pathpart == 'dataset_annotations/my_annotations.csv', dataset == 'somedir/dataset.h5ad'
# in this case, descriptor == dataset == 'somedir/dataset.h5ad'
# 2) somedir/dataset_annotations/my_annotations.csv : an actual annotations file.
# in this case, descriptor == 'somedir/dataset_annotations/my_annotations.csv', dataset == 'somedir/dataset.h5ad'
# 3) somedir/dataset_annotations: an annotation directory. The corresponding h5ad must exist, but the directory may not.
# in this case, pathpart == 'dataset_annotations', dataset == 'somedir/dataset.h5ad'
# in this case, descriptor == 'somedir/dataset_annotations', dataset == 'somedir/dataset.h5ad'
from cellxgene_gateway import flask_util
from cellxgene_gateway.items.item import Item
from cellxgene_gateway.items.item_source import ItemSource, LookupResult
class CacheKey:
def __init__(self, pathpart, dataset, annotation_file):
self.pathpart = pathpart
self.dataset = dataset
self.annotation_file = annotation_file
@property
def descriptor(self):
if self.annotation_item is None:
return self.h5ad_item.descriptor
else:
return self.annotation_item.descriptor
@property
def file_path(self):
return self.source.get_local_path(self.h5ad_item)
@property
def annotation_file_path(self):
if self.annotation_item is None:
return None
else:
return self.source.get_local_path(self.annotation_item)
def relaunch_url(self):
return flask_util.relaunch_url(self.descriptor, self.source_name)
def gateway_basepath(self):
return self.view_url + "/"
@property
def view_url(self):
return flask_util.view_url(self.descriptor, self.source_name)
@property
def source_name(self):
return self.source.name
@property
def annotation_descriptor(self):
if self.annotation_item is None:
return None
else:
return self.annotation_item.descriptor
def equals(self, other):
return (
(self.source.name == other.source.name)
and (self.h5ad_item.descriptor == other.h5ad_item.descriptor)
and (self.annotation_descriptor == other.annotation_descriptor)
)
def __init__(
self, h5ad_item: Item, source: ItemSource, annotation_item: Item = None
):
assert h5ad_item is not None
assert source is not None
self.h5ad_item = h5ad_item
self.annotation_item = annotation_item
self.source = source
@classmethod
def for_lookup(cls, source: ItemSource, lookup: LookupResult):
return CacheKey(lookup.h5ad_item, source, lookup.annotation_item)
......@@ -53,11 +53,17 @@ def create_dir(parent_path, dir_name):
annotations_suffix = "_annotations"
h5ad_suffix = ".h5ad"
def make_h5ad(el):
return el[: -len(annotations_suffix)] + ".h5ad"
return el[: -len(annotations_suffix)] + h5ad_suffix
def make_annotations(el):
return el[:-5] + annotations_suffix
def ensure_dir_exists(file_path):
if not os.path.exists(file_path):
os.makedirs(file_path)
......@@ -12,7 +12,7 @@ import os
import socket
cellxgene_location = os.environ.get("CELLXGENE_LOCATION")
cellxgene_data = os.environ.get("CELLXGENE_DATA")
cellxgene_data = os.environ.get("CELLXGENE_DATA", "")
cellxgene_args = os.environ.get("CELLXGENE_ARGS", None)
gateway_port = int(os.environ.get("GATEWAY_PORT", "5005"))
external_host = os.environ.get(
......@@ -22,13 +22,9 @@ external_host = os.environ.get(
external_protocol = os.environ.get(
"EXTERNAL_PROTOCOL", os.environ.get("GATEWAY_PROTOCOL", "http")
)
ip = os.environ.get("GATEWAY_IP", "127.0.0.1")
ip = os.environ.get("GATEWAY_IP")
extra_scripts = os.environ.get("GATEWAY_EXTRA_SCRIPTS")
ttl = os.environ.get("GATEWAY_TTL")
enable_upload = os.environ.get("GATEWAY_ENABLE_UPLOAD", "").lower() in [
"true",
"1",
]
enable_annotations = os.environ.get("GATEWAY_ENABLE_ANNOTATIONS", "").lower() in [
"true",
"1",
......@@ -40,7 +36,6 @@ enable_backed_mode = os.environ.get("GATEWAY_ENABLE_BACKED_MODE", "").lower() in
env_vars = {
"CELLXGENE_LOCATION": cellxgene_location,
"CELLXGENE_DATA": cellxgene_data,
}
proxy_fix_for = int(os.environ.get("PROXY_FIX_FOR", "0"))
......@@ -56,10 +51,10 @@ optional_env_vars = {
"GATEWAY_PORT": gateway_port,
"GATEWAY_EXTRA_SCRIPTS": extra_scripts,
"GATEWAY_TTL": ttl,
"GATEWAY_ENABLE_UPLOAD": enable_upload,
"GATEWAY_ENABLE_ANNOTATIONS": enable_annotations,
"GATEWAY_ENABLE_BACKED_MODE": enable_backed_mode,
"CELLXGENE_ARGS": cellxgene_args,
"CELLXGENE_DATA": cellxgene_data,
"PROXY_FIX_FOR": proxy_fix_for,
"PROXY_FIX_PROTO": proxy_fix_proto,
"PROXY_FIX_HOST": proxy_fix_host,
......
......@@ -8,111 +8,59 @@
# the specific language governing permissions and limitations under the License.
import os
import urllib.parse
from flask import url_for
from cellxgene_gateway import env
from cellxgene_gateway import env, flask_util
from cellxgene_gateway.cache_key import CacheKey
from cellxgene_gateway.dir_util import annotations_suffix, make_annotations, make_h5ad
def recurse_dir(path):
if not os.path.exists(path):
raise CellxgeneException(
"The given path does not exist.", status.HTTP_400_BAD_REQUEST
)
all_entries = sorted(os.listdir(path))
def is_h5ad(el):
return el.endswith(".h5ad") and os.path.isfile(os.path.join(path, el))
h5ad_entries = [x for x in all_entries if is_h5ad(x)]
annotation_dir_entries = [
x
for x in all_entries
if x.endswith(annotations_suffix) and make_h5ad(x) in h5ad_entries
]
def list_annotations(el):
full_path = os.path.join(path, el)
if not os.path.isdir(full_path):
entries = []
else:
entries = [
{
"name": x[:-13]
if (len(x) > 13 and x[-13] in ["-", "_"])
else (x[:-4] if x.endswith(".csv") else x),
"path": os.path.join(full_path, x + "/").replace(
env.cellxgene_data, ""
),
}
for x in sorted(os.listdir(full_path))
if x.endswith(".csv") and os.path.isfile(os.path.join(full_path, x))
]
return [
{
"name": "new",
"class": "new",
"path": full_path.replace(env.cellxgene_data, "").rstrip("/"),
}
] + entries
def make_entry(el):
full_path = os.path.join(path, el)
if el in h5ad_entries:
return {
"path": full_path.replace(env.cellxgene_data, ""),
"name": el,
"type": "file",
"annotations": list_annotations(make_annotations(el)),
}
elif os.path.isdir(full_path) and el not in annotation_dir_entries:
return {
"path": full_path.replace(env.cellxgene_data, ""),
"name": el,
"type": "directory",
"children": recurse_dir(full_path),
}
else:
return {
"path": full_path,
"name": el,
"type": "neither",
}
return [make_entry(x) for x in all_entries]
def render_entries(entries):
return "<ul>" + "\n".join([render_entry(e) for e in entries]) + "</ul>"
def get_url(entry):
return url_for("do_view", path=entry["path"].lstrip("/"))
def get_class(entry):
return f" class='{entry['class']}'" if "class" in entry else ""
def render_annotations(entry):
if len(entry["annotations"]) > 0:
return " | annotations: " + ", ".join(
def render_annotations(item, item_source):
url = flask_util.view_url(
item_source.get_annotations_subpath(item), item_source.name
)
new_annotation = f"<a class='new' href='{url}'>new</a>"
annotations = (
", ".join(
[
f"<a href='{get_url(a)}'{get_class(a)}>{a['name']}</a>"
for a in entry["annotations"]
f"<a href='{CacheKey(item, item_source, a).view_url}/'>{a.name}</a>"
for a in item.annotations
]
)
+ ", "
if item.annotations
else ""
)
return " | annotations: " + annotations + new_annotation
def render_item(item, item_source):
item_string = f"<li> <a href='{ CacheKey(item, item_source).view_url }/'>{item.name}</a> {render_annotations(item, item_source)}</li>"
return item_string
def render_item_tree(item_tree, item_source):
items = (
"\n".join([render_item(i, item_source) for i in item_tree.items])
if item_tree.items
else ""
)
branches = (
"\n".join([render_item_tree(b, item_source) for b in item_tree.branches])
if item_tree.branches
else ""
)
html = "<ul>" + items + branches + "</ul>"
if item_tree.descriptor:
descriptor = item_tree.descriptor.lstrip("/")
url = f"/filecrawl/{descriptor}?source={item_source.name}"
name = descriptor.rsplit("/")[1] if descriptor.find("/") >= 0 else descriptor
return f"<li><a href='{url}'>{name}</a>{html}</li>"
else:
return ""
return html
def render_entry(entry):
if entry["type"] == "file":
return f"<li> <a href='{ get_url(entry) }/'>{entry['name']}</a> {render_annotations(entry)}</li>"
elif entry["type"] == "directory":
url = f"/filecrawl/{entry['path'].lstrip('/')}"
return f"<li><a href='{url}'>{entry['name']}</a>{render_entries(entry['children'])}</li>"
else:
return ""
def render_item_source(item_source, filter=None):
item_tree = item_source.list_items(filter)
heading = f"<h6><a href='/filecrawl.html?source={urllib.parse.quote_plus(item_source.name)}'>{item_source.name}</a></h6>"
return heading + render_item_tree(item_tree, item_source)
......@@ -7,9 +7,27 @@
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
from flask import request
from flask import request, url_for
def querystring():
qs = request.query_string.decode()
return f"?{qs}" if len(qs) > 0 else ""
include_source_in_url = False
def url(endpoint, descriptor, source_name):
if include_source_in_url:
return url_for(endpoint, source_name=source_name, path=descriptor)
else:
return url_for(endpoint, path=descriptor)
def view_url(descriptor, source_name):
return url("do_view", descriptor, source_name)
def relaunch_url(descriptor, source_name):
return url("do_relaunch", descriptor, source_name)
......@@ -6,10 +6,11 @@
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. See the License for
# the specific language governing permissions and limitations under the License.
# import BaseHTTPServer
import json
import logging
import os
import urllib.parse
from threading import Lock, Thread
from flask import (
......@@ -25,20 +26,23 @@ from flask_api import status
from werkzeug.middleware.proxy_fix import ProxyFix
from werkzeug.utils import secure_filename
from cellxgene_gateway import env
from cellxgene_gateway import env, flask_util
from cellxgene_gateway.backend_cache import BackendCache
from cellxgene_gateway.cache_entry import CacheEntryStatus
from cellxgene_gateway.cache_key import CacheKey
from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.dir_util import create_dir, is_subdir
from cellxgene_gateway.extra_scripts import get_extra_scripts
from cellxgene_gateway.filecrawl import recurse_dir, render_entries
from cellxgene_gateway.path_util import get_key
from cellxgene_gateway.filecrawl import render_item_source
from cellxgene_gateway.process_exception import ProcessException
from cellxgene_gateway.prune_process_cache import PruneProcessCache
from cellxgene_gateway.util import current_time_stamp
app = Flask(__name__)
item_sources = []
default_item_source = None
def _force_https(app):
def wrapper(environ, start_response):
......@@ -101,8 +105,8 @@ def handle_invalid_process(error):
http_status=error.http_status,
stdout=error.stdout,
stderr=error.stderr,
dataset=error.key.dataset,
annotation_file=error.key.annotation_file,
relaunch_url=error.key.relaunch_url(),
annotation_file=error.key.annotation_descriptor,
),
error.http_status,
)
......@@ -119,127 +123,85 @@ def favicon():
@app.route("/")
def index():
users = [
name
for name in os.listdir(env.cellxgene_data)
if os.path.isdir(os.path.join(env.cellxgene_data, name))
]
return render_template(
"index.html",
ip=env.ip,
cellxgene_data=env.cellxgene_data,
extra_scripts=get_extra_scripts(),
users=users,
enable_upload=env.enable_upload,
)
def make_user():
dir_name = request.form["directory"]
create_dir(env.cellxgene_data, dir_name)
return redirect(url_for("index"), code=302)
def make_subdir():
parent_path = os.path.join(env.cellxgene_data, request.form["usernames"])
dir_name = request.form["directory"]
create_dir(parent_path, dir_name)
return redirect(url_for("index"), code=302)
def upload_file():
upload_dir = request.form["path"]