|
| 1 | +fromaugur.ioimportread_metadata |
| 2 | +fromaugur.utilsimportwrite_json |
| 3 | +importrequests |
| 4 | +importjson |
| 5 | +importpandasaspd |
| 6 | +importargparse |
| 7 | +importmath |
| 8 | + |
# Command-line interface: the metadata table to annotate, the columns that
# identify strains and lineages in it, the URL of the MLR results, and the
# path of the node data JSON to write.
parser = argparse.ArgumentParser(
    description="Process metadata and growth advantage data.",
)
parser.add_argument(
    "--metadata",
    required=True,
    help="Path to the metadata file (TSV or compressed .tsv.xz format).",
)
parser.add_argument(
    "--metadata-id-columns",
    nargs="+",
    default=["strain", "name", "Virus name"],
    help="List of columns to use as identifiers in the metadata file.",
)
parser.add_argument(
    "--metadata-clade-attribute",
    default="Nextclade_pango",
    help="Matched attribute to MLR variants.",
)
parser.add_argument(
    "--mlr-url",
    default="https://data.nextstrain.org/files/workflows/forecasts-ncov/gisaid/pango_lineages/global/mlr/latest_results.json",
    help="URL to fetch the forecasts JSON data.",
)
parser.add_argument(
    "--output-node-data",
    required=True,
    help="Path to save the output JSON node data.",
)

# Parsed at module level; the rest of the script reads its options from `args`.
args = parser.parse_args()
| 18 | + |
def fetch_growth_advantages(mlr_url, timeout=60):
    """Fetch median growth advantages per variant from an MLR results JSON.

    The JSON document at ``mlr_url`` must have a top-level ``"data"`` list of
    records. Only records that carry all of the keys ``location``, ``site``,
    ``variant``, ``value`` and ``ps`` and that have ``location ==
    "hierarchical"``, ``site == "ga"`` and ``ps == "median"`` are used.

    Args:
        mlr_url: URL of the MLR results JSON.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the script indefinitely.

    Returns:
        A dict mapping each matching record's ``variant`` to its ``value``,
        or ``None`` if the download, the JSON parsing, or the record
        extraction fails for any reason (the error is printed).
    """
    required_keys = ("location", "site", "variant", "value", "ps")
    try:
        response = requests.get(mlr_url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors
        records = response.json()["data"]

        return {
            record["variant"]: record["value"]
            for record in records
            if all(key in record for key in required_keys)
            and record["location"] == "hierarchical"
            and record["site"] == "ga"
            and record["ps"] == "median"
        }
    except Exception as e:
        # Deliberately best-effort: the caller treats None as "no growth
        # advantages available" and continues, so no error may escape here.
        print(f"Error fetching the JSON file: {e}")
        return None
| 35 | + |
# Main script body: annotate every metadata record with the MLR growth
# advantage of its lineage and write the result as node data JSON.
try:
    # Fetch the growth advantages (None when the fetch failed).
    growth_advantages = fetch_growth_advantages(args.mlr_url)

    # Load the local metadata file
    metadata = read_metadata(
        args.metadata,
        id_columns=args.metadata_id_columns,
    )

    # Replace each lineage name in the clade column with its growth
    # advantage; lineages without a match become NaN. With no growth
    # advantages at all, the whole column is set to NaN.
    clade_column = args.metadata_clade_attribute
    if growth_advantages:
        metadata[clade_column] = metadata[clade_column].map(growth_advantages)
    else:
        metadata[clade_column] = math.nan

    # Output rows with matched data
    print(metadata.head())  # Display the first few rows as an example

    # Create a node data object with growth advantages, keyed by the metadata
    # index. Iterating the single column avoids building a Series per row as
    # DataFrame.iterrows() does.
    node_data = {
        strain: {"mlr_lineage_fitness": fitness}
        for strain, fitness in metadata[clade_column].items()
    }

    # Save node data
    write_json({"nodes": node_data}, args.output_node_data)

except FileNotFoundError as e:
    print(f"Error reading metadata file: {e}")
    # Exit non-zero so callers can tell that no output was produced.
    raise SystemExit(1) from e
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    # Exit non-zero so callers can tell that no output was produced.
    raise SystemExit(1) from e
0 commit comments