Skip to content
Snippets Groups Projects
Commit 0fa0daf5 authored by abaumann's avatar abaumann Committed by GitHub
Browse files

Merge pull request #5 from broadinstitute/ab_methods_repo_groups

Ab methods repo groups and tsv for large entities
parents aea7fd67 d4784b24
No related branches found
No related tags found
No related merge requests found
## Update the permissions on a given method or config from a file with list of emails
This script takes in a file of email addresses (one per line) and uses them to set permissions on a given method or config. All users in that file receive the same access level, so if different access levels are needed for different groups of users, run the script once per group with a separate file.
Run this as follows (from the main directory):
```./run.sh add_many_emails_to_method/add_many_emails_to_method.py -s <method/config namespace> -n <method/config name> -i <snapshot id> -t <type, either method or config> -r <access level, either READER or OWNER> -f <file with one email per line>```
\ No newline at end of file
##!/usr/bin/env python
from common import *
def main():
setup()
# The main argument parser
parser = DefaultArgsParser(description="Use a file with a list of emails to add permissions to a given method/config in the repo.")
# Core application arguments
parser.add_argument('-s', '--namespace', dest='namespace', action='store', required=True, help='Method/config namespace')
parser.add_argument('-n', '--name', dest='name', action='store', required=True, help='Method/config name')
parser.add_argument('-i', '--snapshot_id', dest='snapshot_id', action='store', required=True, help='Method/config snapshot id')
parser.add_argument('-t', '--type', dest='type', action='store', required=True, choices=['method', 'config'], help='Type of entity to modify: either method or config')
parser.add_argument('-r', '--access_level', dest='access_level', action='store', choices=['OWNER', 'READER'], required=True, help='Access level to give each user in the file.')
parser.add_argument('-f', '--input_file', dest='input_file', action='store', required=True, help='Input file containing one email address per line.')
args = parser.parse_args()
permissions = []
with open(args.input_file, "r") as emailFile:
for e in emailFile:
permissions.append({"user":"%s"%e.replace("\n", ""), "role":"%s"%args.access_level})
if args.type == 'method':
existing = firecloud_api.get_repository_method_acl(args.namespace, args.name, args.snapshot_id).json()
permissions.extend(existing)
response = firecloud_api.update_repository_method_acl(args.namespace, args.name, args.snapshot_id, permissions)
if response.status_code != 200:
fail("Unable to update permissions on %s/%s/%s" % (args.namespace, args.name, args.snapshot_id))
print "new permissions:", firecloud_api.get_repository_method_acl(args.namespace, args.name, args.snapshot_id).json()
elif args.type == 'config':
existing = firecloud_api.get_repository_config_acl(args.namespace, args.name, args.snapshot_id).json()
permissions.extend(existing)
response = firecloud_api.update_repository_config_acl(args.namespace, args.name, args.snapshot_id,
permissions)
if response.status_code != 200:
fail("Unable to update permissions on %s/%s/%s" % (args.namespace, args.name, args.snapshot_id))
print "new permissions:", firecloud_api.get_repository_config_acl(args.namespace, args.name,
args.snapshot_id).json()
else:
fail("Type was not config or method.")
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -2,7 +2,7 @@
from common import *
def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id = None):
def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id = None, show_all_calls=False):
print "Retrieving submissions in workspace..."
workspace_request = firecloud_api.get_workspace(ws_namespace, ws_name)
......@@ -12,6 +12,7 @@ def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id =
submissions_json = firecloud_api.list_submissions(ws_namespace, ws_name).json()
workflow_dict = {}
submission_dict = {}
for submission_json in submissions_json:
sub_id = submission_json["submissionId"]
......@@ -19,14 +20,16 @@ def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id =
continue;
sub_details_json = firecloud_api.get_submission(ws_namespace, ws_name, sub_id).json()
for wf in sub_details_json["workflows"]:
submission_dict[sub_id] = sub_details_json
for wf in (wf for wf in sub_details_json["workflows"] if "workflowId" in wf and "workflows" in sub_details_json):
wf_id = wf["workflowId"]
if query_workflow_id and wf_id not in query_workflow_id:
continue;
workflow_dict[wf_id] = {"submission_id":sub_id, "workflow":wf}
get_workflow_pricing(ws_namespace, ws_name, workflow_dict, query_workflow_id!=None and len(query_workflow_id) > 0)
get_workflow_pricing(ws_namespace, ws_name, workflow_dict, submission_dict, query_workflow_id!=None and len(query_workflow_id) > 0, show_all_calls)
class CostRow():
......@@ -75,7 +78,7 @@ def _fiss_access_headers_local(headers=None):
return fiss_headers
def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMode):
def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, submission_dict, singleWorkflowMode, show_all_calls):
firecloud_api._fiss_access_headers = _fiss_access_headers_local
if len(workflow_dict) == 0:
fail("No submissions or workflows matching the criteria were found.")
......@@ -168,7 +171,13 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
wf_count = 1
wf_total_count = len(workflow_dict)
for submission_id, workflows in submission_id_to_workflows.iteritems():
print ".--- Submission:", submission_id
if len(workflows) == 0:
print "No Workflows."
continue;
submitter = submission_dict[submission_id]["submitter"]
print ".--- Submission: %s (submitted by %s)" % (submission_id, submitter)
submission_total = 0.0
submission_pd_cost = 0.0
submission_cpu_cost = 0.0
......@@ -183,24 +192,37 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
workflow_ids_with_no_cost.add(wf_id)
continue
workflow_metadata_json = get_workflow_metadata(ws_namespace, ws_name, submission_id, wf_id)
if "calls" not in workflow_metadata_json:
#print ws_namespace, ws_name, submission_id, wf_id, workflow_metadata_json
continue
# print json.dumps(workflow_metadata_json, indent=4, sort_keys=True)
print workflow_metadata_json["calls"].keys()
#workflow_metadata_json = workflow_id_to_metadata_json[wf_id]
calls_lower_json = dict((k.split(".")[-1].lower(), v) for k, v in workflow_metadata_json["calls"].iteritems())
calls_lower_translated_json = {}
for calls_name, call_json in calls_lower_json.iteritems():
# from the Cromwell documentation call names can be translated into a different format
# under certain circumstances. We will try translating it here and see if we get a match.
# Rules:
# Any capital letters are lowercased.
# Any character which is not one of [a-z], [0-9] or - will be replaced with -.
# If the start character does not match [a-z] then prefix with x--
# If the final character does not match [a-z0-9] then suffix with --x
# If the string is too long, only take the first 30 and last 30 characters and add --- between them.
# TODO: this is not a complete implementation - however I requested that cromwell metadata includes label
# TODO: so that this translation is not necessary
cromwell_translated_callname = re.sub("[^a-z0-9\-]", "-", call_name.lower())
calls_lower_translated_json[cromwell_translated_callname] = call_json
if len(calls_lower_json) == 0:
print "\tNo Calls."
else:
calls_lower_translated_json = {}
for calls_name, call_json in calls_lower_json.iteritems():
# from the Cromwell documentation call names can be translated into a different format
# under certain circumstances. We will try translating it here and see if we get a match.
# Rules:
# Any capital letters are lowercased.
# Any character which is not one of [a-z], [0-9] or - will be replaced with -.
# If the start character does not match [a-z] then prefix with x--
# If the final character does not match [a-z0-9] then suffix with --x
# If the string is too long, only take the first 30 and last 30 characters and add --- between them.
# TODO: this is not a complete implementation - however I requested that cromwell metadata includes label
# TODO: so that this translation is not necessary
cromwell_translated_callname = re.sub("[^a-z0-9\-]", "-", call_name.lower())
print call_name, cromwell_translated_callname
calls_lower_translated_json[cromwell_translated_callname] = call_json
print "|\t.--- Workflow %d of %d: %s (%s)" % (wf_count, wf_total_count, wf_id, workflow_json["status"])
wf_count += 1
......@@ -214,26 +236,42 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
cpu_cost = 0.0
other_cost = 0.0
for call_name in call_name_to_cost:
print call_name, calls_lower_json.keys(), calls_lower_translated_json.keys()
calls = calls_lower_json[call_name] if call_name in calls_lower_json else calls_lower_translated_json[call_name]
call_pricing = call_name_to_cost[call_name]
resource_type_to_pricing = defaultdict(int)
for pricing in call_pricing:
resource_type_to_pricing[pricing.resource_type] += pricing.cost
if pricing.cost > 0:
resource_type_to_pricing[pricing.resource_type] += pricing.cost
if call_name in calls_lower_json:
num_calls = len(calls_lower_json[call_name])
num_calls = len(calls)
else:
if call_name in calls_lower_translated_json:
num_calls = len(calls_lower_translated_json[call_name])
else:
num_calls = 0
print "|\t|\t%-10s%s:" % ("(%dx)" % num_calls, call_name)
if show_all_calls:
for call in sorted(calls, key=lambda call: call["attempt"]):
start = dateutil.parser.parse(call["start"])
end = dateutil.parser.parse(call["end"])
preempted = "[preempted]" if call["preemptible"] == True and call["executionStatus"] == "RetryableFailure" else ""
print "|\t|\t * Attempt #%d: start: %s - end: %s (elapsed: %s) %s" % (call["attempt"], start, end, end-start, preempted)
sorted_costs = sorted(resource_type_to_pricing, key=lambda rt: resource_type_to_pricing[rt], reverse=True)
for resource_type in sorted_costs:
cost = resource_type_to_pricing[resource_type]
if cost > 0:
if len(sorted_costs) > 0:
for resource_type in sorted_costs:
cost = resource_type_to_pricing[resource_type]
if cost == 0:
continue
total += cost
if "pd" in resource_type.lower():
pd_cost += cost
......@@ -243,6 +281,10 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
other_cost += cost
print "|\t|\t\t\t%s%s" % (("$%f" % cost).ljust(15), resource_type)
else:
workflow_ids_with_no_cost.add(wf_id)
print "|\t|\t\t\t(missing cost information)"
print "|\t| %s"%("-"*100)
print "|\t'--> Workflow Cost: $%f (cpu: $%f | disk: $%f | other: $%f)\n|" % (total, cpu_cost, pd_cost, other_cost)
......@@ -256,7 +298,7 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
else:
print "| %s" % ("-" * 100)
missing_workflows = len(workflow_ids_with_no_cost) > 0
caveat_text = (" (** for %d out of %d workflows)")%(len(workflow_ids_with_no_cost), wf_count) if missing_workflows else ""
caveat_text = (" (** for %d out of %d workflows)")%(wf_count-len(workflow_ids_with_no_cost), wf_count) if missing_workflows else ""
print "'--> Submission Cost%s: $%f (cpu: $%f | disk: $%f | other: $%f)\n" % (caveat_text, submission_total, submission_cpu_cost, submission_pd_cost, submission_other_cost)
if missing_workflows:
print " ** %d workflows without cost information, e.g. %s" % (len(workflow_ids_with_no_cost), next(iter(workflow_ids_with_no_cost)))
......@@ -274,13 +316,16 @@ def main():
parser.add_argument('-s', '--subid', dest='submission_id', action='store', required=False, help='Optional Submission ID to limit the pricing breakdown to. If not provided pricing for all submissions in this workspace will be reported.')
parser.add_argument('-w', '--wfid', dest='workflow_id', default=[], action='append', required=False, help='Optional Workflow ID to limit the pricing breakdown to. Note that this requires a submission id to be passed as well."')
parser.add_argument('-c', '--calls', dest='show_all_calls', action='store_true', required=False, help='Expand information about each call.')
args = parser.parse_args()
if args.workflow_id and not args.submission_id:
fail("Submission ID must also be provided when querying for a Workflow ID")
print "Note that this script expects the billing export table to be named 'billing_export'. "
get_pricing(args.ws_namespace, args.ws_name, args.submission_id, args.workflow_id)
get_pricing(args.ws_namespace, args.ws_name, args.submission_id, args.workflow_id, args.show_all_calls)
if __name__ == "__main__":
......
......@@ -34,6 +34,7 @@ import yaml
import csv
import xlrd
from xlrd.sheet import ctype_text
import dateutil.parser
# firecloud python bindings
from firecloud import api as firecloud_api
......@@ -41,6 +42,34 @@ from firecloud import api as firecloud_api
processes = []
class ProgressBar:
def __init__(self, min, max, description="", num_tabs=0):
self.val = mp.Value('i', min)
self.lock = mp.Lock()
self.min = min
self.max = max
self.description = description
self.num_tabs = num_tabs
def print_bar(self):
percent = float(self.val.value - self.min) / (self.max - self.min)
width = 50
bar_width = int(math.ceil(width * percent))
print "\r%s[%s%s] %s/%s %s" % (
"\t" * self.num_tabs, "=" * bar_width, " " * (width - bar_width), self.val.value, self.max,
self.description),
if self.val.value >= max:
print "\n"
sys.stdout.flush()
def increment(self):
self.val.value += 1
def print_fields(obj):
    """Print every instance attribute of *obj* as a (name, value) tuple."""
    attributes = vars(obj)
    for name in attributes:
        print (name, attributes[name])
......@@ -87,7 +116,6 @@ def get_workflow_metadata(namespace, name, submission_id, workflow_id, *include_
uri = "{0}/workspaces/{1}/{2}/submissions/{3}/workflows/{4}?&{5}expandSubWorkflows=false".format(
firecloud_api.PROD_API_ROOT, namespace, name, submission_id, workflow_id, include_key_string)
#print requests.get(uri, headers=headers).text
return requests.get(uri, headers=headers).json()
......
## Write a TSV for a given entity in a workspace
This script writes a TSV file for a given entity type when the entity data is too large to download from the GUI.
Run this as follows (from the main directory):
```./run.sh entity_to_tsv_for_large_data/entity_to_tsv_for_large_data.py -p <workspace project> -n <workspace name> -e <entity type, e.g. sample> -o <output_tsv_file_name>```
\ No newline at end of file
##!/usr/bin/env python
from common import *
def main():
setup()
# The main argument parser
parser = DefaultArgsParser(description="Export a TSV file for a given entity when that entity is too large to download from FireCloud.")
# Core application arguments
parser.add_argument('-p', '--namespace', dest='ws_namespace', action='store', required=True, help='Workspace namespace')
parser.add_argument('-n', '--name', dest='ws_name', action='store', required=True, help='Workspace name')
parser.add_argument('-e', '--entity_type', dest='entity_type', action='store', required=True, help='Type of the entity to get TSV data for.')
parser.add_argument('-o', '--out_file', dest='output_file', action='store', required=True, help='Name of the tsv export file.')
args = parser.parse_args()
request = firecloud_api.list_entity_types(args.ws_namespace, args.ws_name)
if request.status_code != 200:
fail(request.text)
entity_types_json = request.json()
entity_count = entity_types_json[args.entity_type]["count"]
print "%d %s(s) to gather..." % (entity_count,args.entity_type)
attribute_names = entity_types_json[args.entity_type]["attributeNames"]
with open(args.output_file, "w") as tsvfile:
tsvfile.write("\t".join(attribute_names)+"\n")
entity_data = []
row_num = 0
page_size = 1000
num_pages = int(math.ceil(float(entity_count) / page_size))
pool = mp.Pool(processes=2)
entity_requests = []
for i in range(1, num_pages + 1):
entity_requests.append(pool.apply_async(get_entity_by_page,
args=(args.ws_namespace, args.ws_name, args.entity_type, i, page_size)))
pb = ProgressBar(0, entity_count, "Entities gathered")
for request in entity_requests:
for entity_json in request.get(timeout=100)["results"]:
attributes = entity_json["attributes"]
values = []
for attribute_name in attribute_names:
value = ""
if attribute_name in attributes:
value = attributes[attribute_name]
if attribute_name == "participant" or attribute_name == "sample":
value = value["entityName"]
values.append(value)
tsvfile.write("\t".join(values)+"\n")
row_num += 1
pb.increment()
pb.print_bar()
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -17,12 +17,45 @@ def main():
if args.ws_name:
print "%s: %s" % (args.ws_name, get_workspace_storage_estimate(args.ws_namespace, args.ws_name).json()["estimate"])
else:
for ws in firecloud_api.list_workspaces().json():
workspaces_json = firecloud_api.list_workspaces().json()
longest_name = len(max([ws["workspace"]["name"] for ws in workspaces_json], key=len))
requests = {}
pool = mp.Pool(processes=20)
for ws in workspaces_json:
if ws["workspace"]["namespace"] == args.ws_namespace:
workspace_name = ws["workspace"]["name"]
estimate = get_workspace_storage_estimate(args.ws_namespace, workspace_name)
if estimate.status_code == 200:
print "%s: %s" % (workspace_name, get_workspace_storage_estimate(args.ws_namespace, workspace_name).json()["estimate"])
workflow_id = ws["workspace"]["workspaceId"]
requests[workflow_id] = pool.apply_async(get_workspace_storage_estimate, (args.ws_namespace, workspace_name))
print "Getting cost information..."
pb = ProgressBar(0, len(requests), "Workspace bucket prices gathered")
workflow_id_to_cost = {}
for workflow_id, request in requests.iteritems():
result = request.get(timeout=100)
estimate_request = get_workspace_storage_estimate(args.ws_namespace, workspace_name)
if result.status_code == 200 and estimate_request.status_code == 200:
cost = estimate_request.json()["estimate"]
workflow_id_to_cost[workflow_id] = cost
pb.increment()
pb.print_bar()
pool.close()
cost_info = []
for ws in [ws_json for ws_json in workspaces_json if ws_json["workspace"]["workspaceId"] in workflow_id_to_cost]:
workflow_id = ws["workspace"]["workspaceId"]
workspace_name = ws["workspace"]["name"]
cost_info.append( (longest_name, workspace_name + ":", 15, workflow_id_to_cost[workflow_id], ws["workspace"]["bucketName"]) )
# sorted by cost
for cost in sorted(cost_info, key=lambda cost: cost[3], reverse=True):
print "%-*s%-*sgs://%s" % cost
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment