Download a gene data package

Download an NCBI Datasets Gene Data Package, including FASTA sequences and metadata

Download a gene data package

Download an NCBI Datasets Gene Data Package, including FASTA sequences and metadata

Gene metadata and FASTA sequences are available as an NCBI Datasets Gene Data Package.

Using Gene IDs

Run the following command to download a zip archive containing gene FASTA sequence given an NCBI Gene ID:
datasets download gene gene-id 1 2 3

To get started with the Python library, see the Datasets Python API reference documentation.

First download the data package for the chosen Gene IDs using the download_gene_package method. Next, open the zip file and extract some data from the protein fasta and data report files using the GeneDataset class in ncbi.datasets.package.dataset.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi

from ncbi.datasets.package import dataset


# NCBI Gene IDs whose data package will be requested
gene_ids: List[int] = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17]

# local path that receives the downloaded zip archive
zipfile_name = "gene_ds.zip"

# request the data package through the API client and save the raw bytes to disk
with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    try:
        # _preload_content=False hands back the raw response instead of a decoded object
        download_response = gene_api.download_gene_package(
            gene_ids,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        with open(zipfile_name, "wb") as zip_out:
            zip_out.write(download_response.data)
    except DatasetsApiException as api_error:
        sys.exit(f"Exception when calling GeneApi: {api_error}\n")

# wrap the saved zip archive so individual files can be pulled out of it
package = dataset.GeneDataset(zipfile_name)
# show the names and types of every file in the archive
print(package.get_catalog())

# files can be selected by the type listed in the catalog, e.g. protein fasta;
# only the first 100 characters of each file are printed as a preview
for fasta_text, fasta_name in package.get_files_by_type("PROTEIN_FASTA"):
    preview = fasta_text[:100]
    print(fasta_name, preview)

# walk the data reports and print the id and symbol of each downloaded gene
for gene_report in package.get_data_reports():
    print(f"{gene_report.gene_id}\t{gene_report.symbol}")

Using Gene symbols

Run the following command to download a zip archive containing gene FASTA sequence given a gene symbol and organism name:
datasets download gene symbol --taxon human ACRV1 A2M

To get started with the Python library, see the Datasets Python API reference documentation.

First transform gene symbols into NCBI Gene IDs using the gene_metadata_by_tax_and_symbol method from ncbi-datasets-pylib. Next, download the data package for those Gene IDs using the download_gene_package method. Lastly, open the zip file and extract some metadata from the data report file using the GeneDataset class in ncbi.datasets.package.dataset.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi

from ncbi.datasets.package import dataset

# the list of gene symbols and their associated taxon to download
symbols: List[str] = ["ACRV1", "A2M"]
taxon = "human"

# local path that receives the downloaded zip archive
zipfile_name = "gene_ds.zip"

with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)

    # download only takes NCBI gene-ids so call metadata function first to convert symbols to ids
    gene_ids_for_symbols: List[int] = []
    try:
        gene_reply = gene_api.gene_metadata_by_tax_and_symbol(symbols=symbols, taxon=taxon, returned_content="IDS_ONLY")
        # `or []` guards against a reply that carries no `genes` list
        # (e.g. when none of the symbols matched) instead of failing on iteration
        gene_ids_for_symbols = [int(gene_rec.gene.gene_id) for gene_rec in (gene_reply.genes or [])]
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

    # stop early with a clear message rather than requesting a package for zero genes
    if not gene_ids_for_symbols:
        sys.exit(f"No NCBI Gene IDs found for symbols {symbols} and taxon {taxon}\n")

    # download the data package through the api-client.
    try:
        # _preload_content=False hands back the raw response instead of a decoded object
        gene_ds_download = gene_api.download_gene_package(
            gene_ids_for_symbols,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        # save the file
        with open(zipfile_name, "wb") as f:
            f.write(gene_ds_download.data)
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

# open the package zip archive so we can retrieve files from it
package = dataset.GeneDataset(zipfile_name)
# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# get the data report and print the id and symbol for each downloaded gene
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")

Using Transcript or Protein accessions

Run the following command to download a zip archive containing gene FASTA sequence given a RefSeq nucleotide or protein accession:
datasets download gene accession NM_020107.5 NP_001334352.2

To get started with the Python library, see the Datasets Python API reference documentation.

First transform the RefSeq accessions into NCBI Gene IDs using the gene_metadata_by_accession() method from ncbi-datasets-pylib. Next, download the data package for those Gene IDs using the download_gene_package method. Lastly, open the zip file and extract some data from the genomic fasta and data report files using the GeneDataset class in ncbi.datasets.package.dataset.

import sys
from typing import List

from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi

from ncbi.datasets.package import dataset

# Provide accessions as a list of strings
accessions: List[str] = ["NM_020107.5", "NP_001334352.2"]

# local path that receives the downloaded zip archive
zipfile_name = "gene_ds.zip"

with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)

    # download only takes NCBI gene-ids so call gene_metadata_by_accession() to find ids for the selected accessions
    gene_ids_for_accessions: List[int] = []
    try:
        gene_reply = gene_api.gene_metadata_by_accession(accessions=accessions, returned_content="IDS_ONLY")
        # `or []` guards against a reply that carries no `genes` list
        # (e.g. when none of the accessions matched) instead of failing on iteration
        gene_ids_for_accessions = [int(gene_rec.gene.gene_id) for gene_rec in (gene_reply.genes or [])]
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

    # stop early with a clear message rather than requesting a package for zero genes
    if not gene_ids_for_accessions:
        sys.exit(f"No NCBI Gene IDs found for accessions {accessions}\n")

    # download the data package through the api-client.
    try:
        # _preload_content=False hands back the raw response instead of a decoded object
        gene_ds_download = gene_api.download_gene_package(
            gene_ids_for_accessions,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        # save the file
        with open(zipfile_name, "wb") as f:
            f.write(gene_ds_download.data)
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")


# open the package zip archive so we can retrieve files from it
package = dataset.GeneDataset(zipfile_name)
# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# Use file types or names from the catalog to retrieve contents from specific files, e.g. nucleotide fasta
# (only the first 100 characters of each file are printed as a preview)
for nucleotide_fasta, file_name in package.get_files_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(file_name + ":\n", nucleotide_fasta[:100])

# get the data report and print the id and symbol for each downloaded gene
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")
Generated November 4, 2024