Unverified Commit 638923fb authored by Alokito's avatar Alokito Committed by GitHub
Browse files

Merge pull request #44 from Novartis/itemsource

Itemsource
parents 5404a8b0 169da934
...@@ -22,7 +22,7 @@ jobs: ...@@ -22,7 +22,7 @@ jobs:
pip install black pip install black
- name: Run black - name: Run black
run: | run: |
black -l 79 . --check black . --check
# This job is copied over from `deploy.yaml` # This job is copied over from `deploy.yaml`
run-tests: run-tests:
runs-on: ubuntu-18.04 runs-on: ubuntu-18.04
......
...@@ -59,7 +59,11 @@ cellxgene-gateway ...@@ -59,7 +59,11 @@ cellxgene-gateway
Here's what the environment variables mean: Here's what the environment variables mean:
* `CELLXGENE_LOCATION` - the location of the cellxgene executable, e.g. `~/anaconda2/envs/cellxgene/bin/cellxgene` * `CELLXGENE_LOCATION` - the location of the cellxgene executable, e.g. `~/anaconda2/envs/cellxgene/bin/cellxgene`
At least one of the following is required:
* `CELLXGENE_DATA` - a directory that can contain subdirectories with `.h5ad` data files, *without* trailing slash, e.g. `/mnt/cellxgene_data` * `CELLXGENE_DATA` - a directory that can contain subdirectories with `.h5ad` data files, *without* trailing slash, e.g. `/mnt/cellxgene_data`
* `CELLXGENE_BUCKET` - an s3 bucket that can contain keys with `.h5ad` data files, e.g. `my-cellxgene-data-bucket`
Cellxgene Gateway is designed to make it easy to add additional data sources, please see the source code for gateway.py and the ItemSource interface in items/item_source.py
Optional environment variables: Optional environment variables:
* `CELLXGENE_ARGS` - catch-all variable that can be used to pass additional command line args to cellxgene server * `CELLXGENE_ARGS` - catch-all variable that can be used to pass additional command line args to cellxgene server
...@@ -68,7 +72,6 @@ Optional environment variables: ...@@ -68,7 +72,6 @@ Optional environment variables:
* `GATEWAY_IP` - ip addess of instance gateway is running on, mostly used to display SSH instructions. Defaults to `socket.gethostbyname(socket.gethostname())` * `GATEWAY_IP` - ip addess of instance gateway is running on, mostly used to display SSH instructions. Defaults to `socket.gethostbyname(socket.gethostname())`
* `GATEWAY_PORT` - local port that the gateway should bind to, defaults to 5005 * `GATEWAY_PORT` - local port that the gateway should bind to, defaults to 5005
* `GATEWAY_EXTRA_SCRIPTS` - JSON array of script paths, will be embedded into each page and forwarded with `--scripts` to cellxgene server * `GATEWAY_EXTRA_SCRIPTS` - JSON array of script paths, will be embedded into each page and forwarded with `--scripts` to cellxgene server
* `GATEWAY_ENABLE_UPLOAD` - Set to `true` or `1` to enable HTTP uploads. This is not recommended for a public server.
* `GATEWAY_ENABLE_ANNOTATIONS` - Set to `true` or to `1` to enable cellxgene annotations. * `GATEWAY_ENABLE_ANNOTATIONS` - Set to `true` or to `1` to enable cellxgene annotations.
* `GATEWAY_ENABLE_BACKED_MODE` - Set to `true` or to `1` to load AnnData in file-backed mode. This saves memory and speeds up launch time but may reduce overall performance. * `GATEWAY_ENABLE_BACKED_MODE` - Set to `true` or to `1` to load AnnData in file-backed mode. This saves memory and speeds up launch time but may reduce overall performance.
...@@ -142,9 +145,8 @@ pre-commit install ...@@ -142,9 +145,8 @@ pre-commit install
pip install isort flake8 black pip install isort flake8 black
```bash ```bash
isort -rc . isort -rc . # rc means recursive, and was deprecated in dev version of isort
flake8 . black .
black -l 79 .
``` ```
# Getting Help # Getting Help
......
...@@ -9,11 +9,13 @@ ...@@ -9,11 +9,13 @@
import time import time
from threading import Thread from threading import Thread
from typing import List
from flask_api import status from flask_api import status
from cellxgene_gateway import env from cellxgene_gateway import env
from cellxgene_gateway.cache_entry import CacheEntry, CacheEntryStatus from cellxgene_gateway.cache_entry import CacheEntry, CacheEntryStatus
from cellxgene_gateway.cache_key import CacheKey
from cellxgene_gateway.cellxgene_exception import CellxgeneException from cellxgene_gateway.cellxgene_exception import CellxgeneException
from cellxgene_gateway.subprocess_backend import SubprocessBackend from cellxgene_gateway.subprocess_backend import SubprocessBackend
...@@ -35,13 +37,13 @@ class BackendCache: ...@@ -35,13 +37,13 @@ class BackendCache:
contents = self.entry_list contents = self.entry_list
return [c.port for c in contents] return [c.port for c in contents]
def check_entry(self, key): def check_path(self, source, path):
contents = self.entry_list contents = self.entry_list
matches = [ matches = [
c c
for c in contents for c in contents
if c.key.dataset == key.dataset if c.key.source.name == source.name
and c.key.annotation_file == key.annotation_file and path.startswith(c.key.descriptor)
and c.status != CacheEntryStatus.terminated and c.status != CacheEntryStatus.terminated
] ]
...@@ -52,10 +54,28 @@ class BackendCache: ...@@ -52,10 +54,28 @@ class BackendCache:
else: else:
raise CellxgeneException( raise CellxgeneException(
status.HTTP_500_INTERNAL_SERVER_ERROR, status.HTTP_500_INTERNAL_SERVER_ERROR,
"Found " + str(len(matches)) + " for " + dataset, "Found " + str(len(matches)) + " for " + path,
)
def check_entry(self, key):
contents = self.entry_list
matches = [
c
for c in contents
if c.key.equals(key) and c.status != CacheEntryStatus.terminated
]
if len(matches) == 0:
return None
elif len(matches) == 1:
return matches[0]
else:
raise CellxgeneException(
status.HTTP_500_INTERNAL_SERVER_ERROR,
"Found " + str(len(matches)) + " for " + key.dataset,
) )
def create_entry(self, key, scripts): def create_entry(self, key: CacheKey, scripts: List[str]):
port = 8000 port = 8000
existing_ports = self.get_ports() existing_ports = self.get_ports()
......
...@@ -9,11 +9,11 @@ ...@@ -9,11 +9,11 @@
import datetime import datetime
import logging import logging
import re import re
import urllib.parse
from enum import Enum from enum import Enum
import psutil import psutil
from flask import make_response, render_template, request from flask import make_response, render_template, request
from flask.helpers import url_for
from flask.wrappers import Response from flask.wrappers import Response
from requests import get, post, put from requests import get, post, put
...@@ -71,6 +71,10 @@ class CacheEntry: ...@@ -71,6 +71,10 @@ class CacheEntry:
None, None,
) )
@property
def source_name(self):
return self.key.source_name
def set_loaded(self, pid): def set_loaded(self, pid):
self.pid = pid self.pid = pid
self.status = CacheEntryStatus.loaded self.status = CacheEntryStatus.loaded
...@@ -100,9 +104,13 @@ class CacheEntry: ...@@ -100,9 +104,13 @@ class CacheEntry:
for child in children: for child in children:
child.terminate() child.terminate()
psutil.wait_procs(children, callback=on_terminate) psutil.wait_procs(children, callback=on_terminate)
terminated.append(p.pid) # the parent process may automatically die once its children have --
p.terminate() try:
psutil.wait_procs([p], callback=on_terminate) p.terminate()
psutil.wait_procs([p], callback=on_terminate)
except psutil.NoSuchProcess:
pass
logging.getLogger("cellxgene_gateway").info(f"terminated {terminated}") logging.getLogger("cellxgene_gateway").info(f"terminated {terminated}")
self.status = CacheEntryStatus.terminated self.status = CacheEntryStatus.terminated
...@@ -111,24 +119,20 @@ class CacheEntry: ...@@ -111,24 +119,20 @@ class CacheEntry:
gateway_content = ( gateway_content = (
re.sub( re.sub(
'(="|\()/static/', '(="|\()/static/',
f"\\1{self.gateway_basepath()}static/", f"\\1{self.key.gateway_basepath()}static/",
cellxgene_content, cellxgene_content,
) )
.replace("http://fonts.gstatic.com", "https://fonts.gstatic.com") .replace("http://fonts.gstatic.com", "https://fonts.gstatic.com")
.replace(self.cellxgene_basepath(), self.gateway_basepath()) .replace(self.cellxgene_basepath(), self.key.gateway_basepath())
) )
return gateway_content return gateway_content
def gateway_basepath(self):
return url_for("do_view", path=self.key.pathpart) + "/"
def cellxgene_basepath(self): def cellxgene_basepath(self):
return f"http://127.0.0.1:{self.port}" return f"http://127.0.0.1:{self.port}"
def serve_content(self, path): def serve_content(self, path):
gateway_basepath = self.gateway_basepath() gateway_basepath = self.key.gateway_basepath()
subpath = path[len(self.key.pathpart) :] # noqa: E203 subpath = path[len(self.key.descriptor) :] # noqa: E203
if len(subpath) == 0: if len(subpath) == 0:
r = make_response(f"Redirect to {gateway_basepath}\n", 302) r = make_response(f"Redirect to {gateway_basepath}\n", 302)
r.headers["location"] = gateway_basepath + querystring() r.headers["location"] = gateway_basepath + querystring()
...@@ -199,5 +203,4 @@ class CacheEntry: ...@@ -199,5 +203,4 @@ class CacheEntry:
cellxgene_response.status_code, cellxgene_response.status_code,
resp_headers, resp_headers,
) )
return gateway_response return gateway_response
...@@ -9,15 +9,73 @@ ...@@ -9,15 +9,73 @@
# There are three kinds of CacheKey: # There are three kinds of CacheKey:
# 1) somedir/dataset.h5ad: a dataset # 1) somedir/dataset.h5ad: a dataset
# in this case, pathpart == dataset == 'somedir/dataset.h5ad' # in this case, descriptor == dataset == 'somedir/dataset.h5ad'
# 2) somedir/dataset_annotations/my_annotations.csv : an actual annotaitons file. # 2) somedir/dataset_annotations/my_annotations.csv : an actual annotations file.
# in this case, pathpart == 'dataset_annotations/my_annotations.csv', dataset == 'somedir/dataset.h5ad' # in this case, descriptor == 'somedir/dataset_annotations/my_annotations.csv', dataset == 'somedir/dataset.h5ad'
# 3) somedir/dataset_annotations: an annotation directory. The corresponding h5ad must exist, but the directory may not. # 3) somedir/dataset_annotations: an annotation directory. The corresponding h5ad must exist, but the directory may not.
# in this case, pathpart == 'dataset_annotations', dataset == 'somedir/dataset.h5ad' # in this case, descriptor == 'somedir/dataset_annotations', dataset == 'somedir/dataset.h5ad'
from cellxgene_gateway import flask_util
from cellxgene_gateway.items.item import Item
from cellxgene_gateway.items.item_source import ItemSource, LookupResult
class CacheKey: class CacheKey:
def __init__(self, pathpart, dataset, annotation_file): @property
self.pathpart = pathpart def descriptor(self):
self.dataset = dataset if self.annotation_item is None:
self.annotation_file = annotation_file return self.h5ad_item.descriptor
else:
return self.annotation_item.descriptor
@property
def file_path(self):
return self.source.get_local_path(self.h5ad_item)
@property
def annotation_file_path(self):
if self.annotation_item is None:
return None
else:
return self.source.get_local_path(self.annotation_item)
def relaunch_url(self):
return flask_util.relaunch_url(self.descriptor, self.source_name)
def gateway_basepath(self):
return self.view_url + "/"
@property
def view_url(self):
return flask_util.view_url(self.descriptor, self.source_name)
@property
def source_name(self):
return self.source.name
@property
def annotation_descriptor(self):
if self.annotation_item is None:
return None
else:
return self.annotation_item.descriptor
def equals(self, other):
return (
(self.source.name == other.source.name)
and (self.h5ad_item.descriptor == other.h5ad_item.descriptor)
and (self.annotation_descriptor == other.annotation_descriptor)
)
def __init__(
self, h5ad_item: Item, source: ItemSource, annotation_item: Item = None
):
assert h5ad_item is not None
assert source is not None
self.h5ad_item = h5ad_item
self.annotation_item = annotation_item
self.source = source
@classmethod
def for_lookup(cls, source: ItemSource, lookup: LookupResult):
return CacheKey(lookup.h5ad_item, source, lookup.annotation_item)
...@@ -53,11 +53,17 @@ def create_dir(parent_path, dir_name): ...@@ -53,11 +53,17 @@ def create_dir(parent_path, dir_name):
annotations_suffix = "_annotations" annotations_suffix = "_annotations"
h5ad_suffix = ".h5ad"
def make_h5ad(el): def make_h5ad(el):
return el[: -len(annotations_suffix)] + ".h5ad" return el[: -len(annotations_suffix)] + h5ad_suffix
def make_annotations(el): def make_annotations(el):
return el[:-5] + annotations_suffix return el[:-5] + annotations_suffix
def ensure_dir_exists(file_path):
if not os.path.exists(file_path):
os.makedirs(file_path)
...@@ -12,7 +12,7 @@ import os ...@@ -12,7 +12,7 @@ import os
import socket import socket
cellxgene_location = os.environ.get("CELLXGENE_LOCATION") cellxgene_location = os.environ.get("CELLXGENE_LOCATION")
cellxgene_data = os.environ.get("CELLXGENE_DATA") cellxgene_data = os.environ.get("CELLXGENE_DATA", "")
cellxgene_args = os.environ.get("CELLXGENE_ARGS", None) cellxgene_args = os.environ.get("CELLXGENE_ARGS", None)
gateway_port = int(os.environ.get("GATEWAY_PORT", "5005")) gateway_port = int(os.environ.get("GATEWAY_PORT", "5005"))
external_host = os.environ.get( external_host = os.environ.get(
...@@ -22,13 +22,9 @@ external_host = os.environ.get( ...@@ -22,13 +22,9 @@ external_host = os.environ.get(
external_protocol = os.environ.get( external_protocol = os.environ.get(
"EXTERNAL_PROTOCOL", os.environ.get("GATEWAY_PROTOCOL", "http") "EXTERNAL_PROTOCOL", os.environ.get("GATEWAY_PROTOCOL", "http")
) )
ip = os.environ.get("GATEWAY_IP", "127.0.0.1") ip = os.environ.get("GATEWAY_IP")
extra_scripts = os.environ.get("GATEWAY_EXTRA_SCRIPTS") extra_scripts = os.environ.get("GATEWAY_EXTRA_SCRIPTS")
ttl = os.environ.get("GATEWAY_TTL") ttl = os.environ.get("GATEWAY_TTL")
enable_upload = os.environ.get("GATEWAY_ENABLE_UPLOAD", "").lower() in [
"true",
"1",
]
enable_annotations = os.environ.get("GATEWAY_ENABLE_ANNOTATIONS", "").lower() in [ enable_annotations = os.environ.get("GATEWAY_ENABLE_ANNOTATIONS", "").lower() in [
"true", "true",
"1", "1",
...@@ -40,7 +36,6 @@ enable_backed_mode = os.environ.get("GATEWAY_ENABLE_BACKED_MODE", "").lower() in ...@@ -40,7 +36,6 @@ enable_backed_mode = os.environ.get("GATEWAY_ENABLE_BACKED_MODE", "").lower() in
env_vars = { env_vars = {
"CELLXGENE_LOCATION": cellxgene_location, "CELLXGENE_LOCATION": cellxgene_location,
"CELLXGENE_DATA": cellxgene_data,
} }
proxy_fix_for = int(os.environ.get("PROXY_FIX_FOR", "0")) proxy_fix_for = int(os.environ.get("PROXY_FIX_FOR", "0"))
...@@ -56,10 +51,10 @@ optional_env_vars = { ...@@ -56,10 +51,10 @@ optional_env_vars = {
"GATEWAY_PORT": gateway_port, "GATEWAY_PORT": gateway_port,
"GATEWAY_EXTRA_SCRIPTS": extra_scripts, "GATEWAY_EXTRA_SCRIPTS": extra_scripts,
"GATEWAY_TTL": ttl, "GATEWAY_TTL": ttl,
"GATEWAY_ENABLE_UPLOAD": enable_upload,
"GATEWAY_ENABLE_ANNOTATIONS": enable_annotations, "GATEWAY_ENABLE_ANNOTATIONS": enable_annotations,
"GATEWAY_ENABLE_BACKED_MODE": enable_backed_mode, "GATEWAY_ENABLE_BACKED_MODE": enable_backed_mode,
"CELLXGENE_ARGS": cellxgene_args, "CELLXGENE_ARGS": cellxgene_args,
"CELLXGENE_DATA": cellxgene_data,
"PROXY_FIX_FOR": proxy_fix_for, "PROXY_FIX_FOR": proxy_fix_for,
"PROXY_FIX_PROTO": proxy_fix_proto, "PROXY_FIX_PROTO": proxy_fix_proto,
"PROXY_FIX_HOST": proxy_fix_host, "PROXY_FIX_HOST": proxy_fix_host,
......
...@@ -8,111 +8,59 @@ ...@@ -8,111 +8,59 @@
# the specific language governing permissions and limitations under the License. # the specific language governing permissions and limitations under the License.
import os import os
import urllib.parse
from flask import url_for from cellxgene_gateway import env, flask_util
from cellxgene_gateway.cache_key import CacheKey
from cellxgene_gateway import env
from cellxgene_gateway.dir_util import annotations_suffix, make_annotations, make_h5ad from cellxgene_gateway.dir_util import annotations_suffix, make_annotations, make_h5ad
def recurse_dir(path): def render_annotations(item, item_source):
if not os.path.exists(path): url = flask_util.view_url(
raise CellxgeneException( item_source.get_annotations_subpath(item), item_source.name
"The given path does not exist.", status.HTTP_400_BAD_REQUEST )
) new_annotation = f"<a class='new' href='{url}'>new</a>"
annotations = (
all_entries = sorted(os.listdir(path)) ", ".join(
def is_h5ad(el):
return el.endswith(".h5ad") and os.path.isfile(os.path.join(path, el))
h5ad_entries = [x for x in all_entries if is_h5ad(x)]
annotation_dir_entries = [
x
for x in all_entries
if x.endswith(annotations_suffix) and make_h5ad(x) in h5ad_entries
]
def list_annotations(el):
full_path = os.path.join(path, el)
if not os.path.isdir(full_path):
entries = []
else:
entries = [
{
"name": x[:-13]
if (len(x) > 13 and x[-13] in ["-", "_"])
else (x[:-4] if x.endswith(".csv") else x),
"path": os.path.join(full_path, x + "/").replace(
env.cellxgene_data, ""
),
}
for x in sorted(os.listdir(full_path))
if x.endswith(".csv") and os.path.isfile(os.path.join(full_path, x))
]
return [
{
"name": "new",
"class": "new",
"path": full_path.replace(env.cellxgene_data, "").rstrip("/"),
}
] + entries
def make_entry(el):
full_path = os.path.join(path, el)
if el in h5ad_entries:
return {
"path": full_path.replace(env.cellxgene_data, ""),
"name": el,
"type": "file",
"annotations": list_annotations(make_annotations(el)),
}
elif os.path.isdir(full_path) and el not in annotation_dir_entries:
return {
"path": full_path.replace(env.cellxgene_data, ""),
"name": el,
"type": "directory",
"children": recurse_dir(full_path),
}
else:
return {
"path": full_path,
"name": el,
"type": "neither",
}
return [make_entry(x) for x in all_entries]
def render_entries(entries):
return "<ul>" + "\n".join([render_entry(e) for e in entries]) + "</ul>"
def get_url(entry):
return url_for("do_view", path=entry["path"].lstrip("/"))
def get_class(entry):
return f" class='{entry['class']}'" if "class" in entry else ""
def render_annotations(entry):
if len(entry["annotations"]) > 0:
return " | annotations: " + ", ".join(
[ [
f"<a href='{get_url(a)}'{get_class(a)}>{a['name']}</a>" f"<a href='{CacheKey(item, item_source, a).view_url}/'>{a.name}</a>"
for a in entry["annotations"] for a in item.annotations
] ]
) )
+ ", "
if item.annotations
else ""
)
return " | annotations: " + annotations + new_annotation
def render_item(item, item_source):
item_string = f"<li> <a href='{ CacheKey(item, item_source).view_url }/'>{item.name}</a> {render_annotations(item, item_source)}</li>"
return item_string
def render_item_tree(item_tree, item_source):
items = (
"\n".join([render_item(i, item_source) for i in item_tree.items])
if item_tree.items
else ""
)
branches = (
"\n".join([render_item_tree(b, item_source) for b in item_tree.branches])
if item_tree.branches
else ""
)
html = "<ul>" + items + branches + "</ul>"
if item_tree.descriptor:
descriptor = item_tree.descriptor.lstrip("/")
url = f"/filecrawl/{descriptor}?source={item_source.name}"
name = descriptor.rsplit("/")[1] if descriptor.find("/") >= 0 else descriptor
return f"<li><a href='{url}'>{name}</a>{html}</li>"
else: else:
return "" return html
def render_entry(entry): def render_item_source(item_source, filter=None):
if entry["type"] == "file": item_tree = item_source.list_items(filter)
return f"<li> <a href='{ get_url(entry) }/'>{entry['name']}</a> {render_annotations(entry)}</li>" heading = f"<h6><a href='/filecrawl.html?source={urllib.parse.quote_plus(item_source.name)}'>{item_source.name}</a></h6>"
elif entry["type"] == "directory": return heading + render_item_tree(item_tree, item_source)
url = f"/filecrawl/{entry['path'].lstrip('/')}"
return f"<li><a href='{url}'>{entry['name']}</a>{render_entries(entry['children'])}</li>"
else:
return ""