Skip to content
Snippets Groups Projects
Commit 0fa0daf5 authored by abaumann's avatar abaumann Committed by GitHub
Browse files

Merge pull request #5 from broadinstitute/ab_methods_repo_groups

Ab methods repo groups and tsv for large entities
parents aea7fd67 d4784b24
No related branches found
No related tags found
No related merge requests found
## Update the permissions on a given method or config from a file with list of emails
This script takes in a file of email addresses (one per line) and uses them to set permissions on a given method or config. All users in that file receive the same access level, so if different access levels are needed for different groups of users, run the script once per group with a separate file.
Run this as follows (from the main directory):
```./run.sh add_many_emails_to_method/add_many_emails_to_method.py -s <method/config namespace> -n <method/config name> -i <snapshot id> -t <type, either method or config> -r <access level, either READER or OWNER> -f <file with one email per line>```
\ No newline at end of file
##!/usr/bin/env python
from common import *
def main():
setup()
# The main argument parser
parser = DefaultArgsParser(description="Use a file with a list of emails to add permissions to a given method/config in the repo.")
# Core application arguments
parser.add_argument('-s', '--namespace', dest='namespace', action='store', required=True, help='Method/config namespace')
parser.add_argument('-n', '--name', dest='name', action='store', required=True, help='Method/config name')
parser.add_argument('-i', '--snapshot_id', dest='snapshot_id', action='store', required=True, help='Method/config snapshot id')
parser.add_argument('-t', '--type', dest='type', action='store', required=True, choices=['method', 'config'], help='Type of entity to modify: either method or config')
parser.add_argument('-r', '--access_level', dest='access_level', action='store', choices=['OWNER', 'READER'], required=True, help='Access level to give each user in the file.')
parser.add_argument('-f', '--input_file', dest='input_file', action='store', required=True, help='Input file containing one email address per line.')
args = parser.parse_args()
permissions = []
with open(args.input_file, "r") as emailFile:
for e in emailFile:
permissions.append({"user":"%s"%e.replace("\n", ""), "role":"%s"%args.access_level})
if args.type == 'method':
existing = firecloud_api.get_repository_method_acl(args.namespace, args.name, args.snapshot_id).json()
permissions.extend(existing)
response = firecloud_api.update_repository_method_acl(args.namespace, args.name, args.snapshot_id, permissions)
if response.status_code != 200:
fail("Unable to update permissions on %s/%s/%s" % (args.namespace, args.name, args.snapshot_id))
print "new permissions:", firecloud_api.get_repository_method_acl(args.namespace, args.name, args.snapshot_id).json()
elif args.type == 'config':
existing = firecloud_api.get_repository_config_acl(args.namespace, args.name, args.snapshot_id).json()
permissions.extend(existing)
response = firecloud_api.update_repository_config_acl(args.namespace, args.name, args.snapshot_id,
permissions)
if response.status_code != 200:
fail("Unable to update permissions on %s/%s/%s" % (args.namespace, args.name, args.snapshot_id))
print "new permissions:", firecloud_api.get_repository_config_acl(args.namespace, args.name,
args.snapshot_id).json()
else:
fail("Type was not config or method.")
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -2,7 +2,7 @@
from common import *
def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id = None):
def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id = None, show_all_calls=False):
print "Retrieving submissions in workspace..."
workspace_request = firecloud_api.get_workspace(ws_namespace, ws_name)
......@@ -12,6 +12,7 @@ def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id =
submissions_json = firecloud_api.list_submissions(ws_namespace, ws_name).json()
workflow_dict = {}
submission_dict = {}
for submission_json in submissions_json:
sub_id = submission_json["submissionId"]
......@@ -19,14 +20,16 @@ def get_pricing(ws_namespace, ws_name, query_sub_id = None, query_workflow_id =
continue;
sub_details_json = firecloud_api.get_submission(ws_namespace, ws_name, sub_id).json()
for wf in sub_details_json["workflows"]:
submission_dict[sub_id] = sub_details_json
for wf in (wf for wf in sub_details_json["workflows"] if "workflowId" in wf and "workflows" in sub_details_json):
wf_id = wf["workflowId"]
if query_workflow_id and wf_id not in query_workflow_id:
continue;
workflow_dict[wf_id] = {"submission_id":sub_id, "workflow":wf}
get_workflow_pricing(ws_namespace, ws_name, workflow_dict, query_workflow_id!=None and len(query_workflow_id) > 0)
get_workflow_pricing(ws_namespace, ws_name, workflow_dict, submission_dict, query_workflow_id!=None and len(query_workflow_id) > 0, show_all_calls)
class CostRow():
......@@ -75,7 +78,7 @@ def _fiss_access_headers_local(headers=None):
return fiss_headers
def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMode):
def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, submission_dict, singleWorkflowMode, show_all_calls):
firecloud_api._fiss_access_headers = _fiss_access_headers_local
if len(workflow_dict) == 0:
fail("No submissions or workflows matching the criteria were found.")
......@@ -168,7 +171,13 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
wf_count = 1
wf_total_count = len(workflow_dict)
for submission_id, workflows in submission_id_to_workflows.iteritems():
print ".--- Submission:", submission_id
if len(workflows) == 0:
print "No Workflows."
continue;
submitter = submission_dict[submission_id]["submitter"]
print ".--- Submission: %s (submitted by %s)" % (submission_id, submitter)
submission_total = 0.0
submission_pd_cost = 0.0
submission_cpu_cost = 0.0
......@@ -183,24 +192,37 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
workflow_ids_with_no_cost.add(wf_id)
continue
workflow_metadata_json = get_workflow_metadata(ws_namespace, ws_name, submission_id, wf_id)
if "calls" not in workflow_metadata_json:
#print ws_namespace, ws_name, submission_id, wf_id, workflow_metadata_json
continue
# print json.dumps(workflow_metadata_json, indent=4, sort_keys=True)
print workflow_metadata_json["calls"].keys()
#workflow_metadata_json = workflow_id_to_metadata_json[wf_id]
calls_lower_json = dict((k.split(".")[-1].lower(), v) for k, v in workflow_metadata_json["calls"].iteritems())
calls_lower_translated_json = {}
for calls_name, call_json in calls_lower_json.iteritems():
# from the Cromwell documentation call names can be translated into a different format
# under certain circumstances. We will try translating it here and see if we get a match.
# Rules:
# Any capital letters are lowercased.
# Any character which is not one of [a-z], [0-9] or - will be replaced with -.
# If the start character does not match [a-z] then prefix with x--
# If the final character does not match [a-z0-9] then suffix with --x
# If the string is too long, only take the first 30 and last 30 characters and add --- between them.
# TODO: this is not a complete implementation - however I requested that cromwell metadata includes label
# TODO: so that this translation is not necessary
cromwell_translated_callname = re.sub("[^a-z0-9\-]", "-", call_name.lower())
calls_lower_translated_json[cromwell_translated_callname] = call_json
if len(calls_lower_json) == 0:
print "\tNo Calls."
else:
calls_lower_translated_json = {}
for calls_name, call_json in calls_lower_json.iteritems():
# from the Cromwell documentation call names can be translated into a different format
# under certain circumstances. We will try translating it here and see if we get a match.
# Rules:
# Any capital letters are lowercased.
# Any character which is not one of [a-z], [0-9] or - will be replaced with -.
# If the start character does not match [a-z] then prefix with x--
# If the final character does not match [a-z0-9] then suffix with --x
# If the string is too long, only take the first 30 and last 30 characters and add --- between them.
# TODO: this is not a complete implementation - however I requested that cromwell metadata includes label
# TODO: so that this translation is not necessary
cromwell_translated_callname = re.sub("[^a-z0-9\-]", "-", call_name.lower())
print call_name, cromwell_translated_callname
calls_lower_translated_json[cromwell_translated_callname] = call_json
print "|\t.--- Workflow %d of %d: %s (%s)" % (wf_count, wf_total_count, wf_id, workflow_json["status"])
wf_count += 1
......@@ -214,26 +236,42 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
cpu_cost = 0.0
other_cost = 0.0
for call_name in call_name_to_cost:
print call_name, calls_lower_json.keys(), calls_lower_translated_json.keys()
calls = calls_lower_json[call_name] if call_name in calls_lower_json else calls_lower_translated_json[call_name]
call_pricing = call_name_to_cost[call_name]
resource_type_to_pricing = defaultdict(int)
for pricing in call_pricing:
resource_type_to_pricing[pricing.resource_type] += pricing.cost
if pricing.cost > 0:
resource_type_to_pricing[pricing.resource_type] += pricing.cost
if call_name in calls_lower_json:
num_calls = len(calls_lower_json[call_name])
num_calls = len(calls)
else:
if call_name in calls_lower_translated_json:
num_calls = len(calls_lower_translated_json[call_name])
else:
num_calls = 0
print "|\t|\t%-10s%s:" % ("(%dx)" % num_calls, call_name)
if show_all_calls:
for call in sorted(calls, key=lambda call: call["attempt"]):
start = dateutil.parser.parse(call["start"])
end = dateutil.parser.parse(call["end"])
preempted = "[preempted]" if call["preemptible"] == True and call["executionStatus"] == "RetryableFailure" else ""
print "|\t|\t * Attempt #%d: start: %s - end: %s (elapsed: %s) %s" % (call["attempt"], start, end, end-start, preempted)
sorted_costs = sorted(resource_type_to_pricing, key=lambda rt: resource_type_to_pricing[rt], reverse=True)
for resource_type in sorted_costs:
cost = resource_type_to_pricing[resource_type]
if cost > 0:
if len(sorted_costs) > 0:
for resource_type in sorted_costs:
cost = resource_type_to_pricing[resource_type]
if cost == 0:
continue
total += cost
if "pd" in resource_type.lower():
pd_cost += cost
......@@ -243,6 +281,10 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
other_cost += cost
print "|\t|\t\t\t%s%s" % (("$%f" % cost).ljust(15), resource_type)
else:
workflow_ids_with_no_cost.add(wf_id)
print "|\t|\t\t\t(missing cost information)"
print "|\t| %s"%("-"*100)
print "|\t'--> Workflow Cost: $%f (cpu: $%f | disk: $%f | other: $%f)\n|" % (total, cpu_cost, pd_cost, other_cost)
......@@ -256,7 +298,7 @@ def get_workflow_pricing(ws_namespace, ws_name, workflow_dict, singleWorkflowMod
else:
print "| %s" % ("-" * 100)
missing_workflows = len(workflow_ids_with_no_cost) > 0
caveat_text = (" (** for %d out of %d workflows)")%(len(workflow_ids_with_no_cost), wf_count) if missing_workflows else ""
caveat_text = (" (** for %d out of %d workflows)")%(wf_count-len(workflow_ids_with_no_cost), wf_count) if missing_workflows else ""
print "'--> Submission Cost%s: $%f (cpu: $%f | disk: $%f | other: $%f)\n" % (caveat_text, submission_total, submission_cpu_cost, submission_pd_cost, submission_other_cost)
if missing_workflows:
print " ** %d workflows without cost information, e.g. %s" % (len(workflow_ids_with_no_cost), next(iter(workflow_ids_with_no_cost)))
......@@ -274,13 +316,16 @@ def main():
parser.add_argument('-s', '--subid', dest='submission_id', action='store', required=False, help='Optional Submission ID to limit the pricing breakdown to. If not provided pricing for all submissions in this workspace will be reported.')
parser.add_argument('-w', '--wfid', dest='workflow_id', default=[], action='append', required=False, help='Optional Workflow ID to limit the pricing breakdown to. Note that this requires a submission id to be passed as well."')
parser.add_argument('-c', '--calls', dest='show_all_calls', action='store_true', required=False, help='Expand information about each call.')
args = parser.parse_args()
if args.workflow_id and not args.submission_id:
fail("Submission ID must also be provided when querying for a Workflow ID")
print "Note that this script expects the billing export table to be named 'billing_export'. "
get_pricing(args.ws_namespace, args.ws_name, args.submission_id, args.workflow_id)
get_pricing(args.ws_namespace, args.ws_name, args.submission_id, args.workflow_id, args.show_all_calls)
if __name__ == "__main__":
......
......@@ -34,6 +34,7 @@ import yaml
import csv
import xlrd
from xlrd.sheet import ctype_text
import dateutil.parser
# firecloud python bindings
from firecloud import api as firecloud_api
......@@ -41,6 +42,34 @@ from firecloud import api as firecloud_api
processes = []
class ProgressBar:
def __init__(self, min, max, description="", num_tabs=0):
self.val = mp.Value('i', min)
self.lock = mp.Lock()
self.min = min
self.max = max
self.description = description
self.num_tabs = num_tabs
def print_bar(self):
percent = float(self.val.value - self.min) / (self.max - self.min)
width = 50
bar_width = int(math.ceil(width * percent))
print "\r%s[%s%s] %s/%s %s" % (
"\t" * self.num_tabs, "=" * bar_width, " " * (width - bar_width), self.val.value, self.max,
self.description),
if self.val.value >= max:
print "\n"
sys.stdout.flush()
def increment(self):
self.val.value += 1
def print_fields(obj):
    """Print every instance attribute of *obj* as a (name, value) tuple."""
    attributes = vars(obj)
    for name in attributes:
        print (name, attributes[name])
......@@ -87,7 +116,6 @@ def get_workflow_metadata(namespace, name, submission_id, workflow_id, *include_
uri = "{0}/workspaces/{1}/{2}/submissions/{3}/workflows/{4}?&{5}expandSubWorkflows=false".format(
firecloud_api.PROD_API_ROOT, namespace, name, submission_id, workflow_id, include_key_string)
#print requests.get(uri, headers=headers).text
return requests.get(uri, headers=headers).json()
......
## Write a TSV for a given entity in a workspace
This script writes a TSV file for a given entity type when the entity data is too large to download from the GUI.
Run this as follows (from the main directory):
```./run.sh entity_to_tsv_for_large_data/entity_to_tsv_for_large_data.py -p <workspace project> -n <workspace name> -e <entity type, e.g. sample> -o <output_tsv_file_name>```
\ No newline at end of file
##!/usr/bin/env python
from common import *
def main():
setup()
# The main argument parser
parser = DefaultArgsParser(description="Export a TSV file for a given entity when that entity is too large to download from FireCloud.")
# Core application arguments
parser.add_argument('-p', '--namespace', dest='ws_namespace', action='store', required=True, help='Workspace namespace')
parser.add_argument('-n', '--name', dest='ws_name', action='store', required=True, help='Workspace name')
parser.add_argument('-e', '--entity_type', dest='entity_type', action='store', required=True, help='Type of the entity to get TSV data for.')
parser.add_argument('-o', '--out_file', dest='output_file', action='store', required=True, help='Name of the tsv export file.')
args = parser.parse_args()
request = firecloud_api.list_entity_types(args.ws_namespace, args.ws_name)
if request.status_code != 200:
fail(request.text)
entity_types_json = request.json()
entity_count = entity_types_json[args.entity_type]["count"]
print "%d %s(s) to gather..." % (entity_count,args.entity_type)
attribute_names = entity_types_json[args.entity_type]["attributeNames"]
with open(args.output_file, "w") as tsvfile:
tsvfile.write("\t".join(attribute_names)+"\n")
entity_data = []
row_num = 0
page_size = 1000
num_pages = int(math.ceil(float(entity_count) / page_size))
pool = mp.Pool(processes=2)
entity_requests = []
for i in range(1, num_pages + 1):
entity_requests.append(pool.apply_async(get_entity_by_page,
args=(args.ws_namespace, args.ws_name, args.entity_type, i, page_size)))
pb = ProgressBar(0, entity_count, "Entities gathered")
for request in entity_requests:
for entity_json in request.get(timeout=100)["results"]:
attributes = entity_json["attributes"]
values = []
for attribute_name in attribute_names:
value = ""
if attribute_name in attributes:
value = attributes[attribute_name]
if attribute_name == "participant" or attribute_name == "sample":
value = value["entityName"]
values.append(value)
tsvfile.write("\t".join(values)+"\n")
row_num += 1
pb.increment()
pb.print_bar()
if __name__ == "__main__":
main()
\ No newline at end of file
......@@ -17,12 +17,45 @@ def main():
if args.ws_name:
print "%s: %s" % (args.ws_name, get_workspace_storage_estimate(args.ws_namespace, args.ws_name).json()["estimate"])
else:
for ws in firecloud_api.list_workspaces().json():
workspaces_json = firecloud_api.list_workspaces().json()
longest_name = len(max([ws["workspace"]["name"] for ws in workspaces_json], key=len))
requests = {}
pool = mp.Pool(processes=20)
for ws in workspaces_json:
if ws["workspace"]["namespace"] == args.ws_namespace:
workspace_name = ws["workspace"]["name"]
estimate = get_workspace_storage_estimate(args.ws_namespace, workspace_name)
if estimate.status_code == 200:
print "%s: %s" % (workspace_name, get_workspace_storage_estimate(args.ws_namespace, workspace_name).json()["estimate"])
workflow_id = ws["workspace"]["workspaceId"]
requests[workflow_id] = pool.apply_async(get_workspace_storage_estimate, (args.ws_namespace, workspace_name))
print "Getting cost information..."
pb = ProgressBar(0, len(requests), "Workspace bucket prices gathered")
workflow_id_to_cost = {}
for workflow_id, request in requests.iteritems():
result = request.get(timeout=100)
estimate_request = get_workspace_storage_estimate(args.ws_namespace, workspace_name)
if result.status_code == 200 and estimate_request.status_code == 200:
cost = estimate_request.json()["estimate"]
workflow_id_to_cost[workflow_id] = cost
pb.increment()
pb.print_bar()
pool.close()
cost_info = []
for ws in [ws_json for ws_json in workspaces_json if ws_json["workspace"]["workspaceId"] in workflow_id_to_cost]:
workflow_id = ws["workspace"]["workspaceId"]
workspace_name = ws["workspace"]["name"]
cost_info.append( (longest_name, workspace_name + ":", 15, workflow_id_to_cost[workflow_id], ws["workspace"]["bucketName"]) )
# sorted by cost
for cost in sorted(cost_info, key=lambda cost: cost[3], reverse=True):
print "%-*s%-*sgs://%s" % cost
if __name__ == "__main__":
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment