Download a gene data package
Download an NCBI Datasets Gene Data Package, including FASTA sequences and metadata
Download a gene data package
Gene metadata and FASTA sequences are available as an NCBI Datasets Gene Data Package.
Using Gene IDs
datasets download gene gene-id 1 2 3 9 10 11 12 13 14 15 16 17
To get started with the Python library, see the Datasets Python API reference documentation.
First download the data package for the chosen Gene IDs using the
download_gene_package method. Next, open the zip file and extract some data from the protein fasta and data report files using the GeneDataset
class in ncbi.datasets.package.dataset.
import sys
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.package import dataset
# NCBI Gene IDs whose data package will be requested
gene_ids: List[int] = [1, 2, 3, 9, 10, 11, 12, 13, 14, 15, 16, 17]
# local path the downloaded zip archive is written to
zipfile_name = "gene_ds.zip"

# Request the gene data package through the Datasets API client and save it.
with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)
    annotation_types = ["FASTA_GENE", "FASTA_PROTEIN"]
    try:
        # NOTE(review): _preload_content=False presumably hands back the raw
        # HTTP response so its payload can be written verbatim -- confirm
        # against the client library documentation.
        response = gene_api.download_gene_package(
            gene_ids,
            include_annotation_type=annotation_types,
            _preload_content=False,
        )
        with open(zipfile_name, "wb") as zip_out:
            zip_out.write(response.data)
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

# Load the saved archive so individual files can be read from it.
package = dataset.GeneDataset(zipfile_name)

# Show the catalog of the archive (names and types of the contained files).
print(package.get_catalog())

# Fetch files by catalog type; here, print the beginning of each protein FASTA.
for fasta_content, fasta_name in package.get_files_by_type("PROTEIN_FASTA"):
    print(fasta_name, fasta_content[:100])

# Walk the data reports and print one "id<TAB>symbol" line per gene.
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")
Using Gene symbols
datasets download gene symbol --taxon human ACRV1 A2M
To get started with the Python library, see the Datasets Python API reference documentation.
First transform gene symbols into NCBI Gene IDs using the gene_metadata_by_tax_and_symbol method from ncbi-datasets-pylib.
Next, download the data package for those Gene IDs using the download_gene_package method. Lastly, open the zip file and extract some metadata from the data report file using
the GeneDataset
class in ncbi.datasets.package.dataset.
import sys
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.package import dataset
# the list of gene symbols and their associated taxon to download
symbols: List[str] = ["ACRV1", "A2M"]
taxon = "human"
zipfile_name = "gene_ds.zip"

with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)

    # download only takes NCBI gene-ids so call metadata function first to convert symbols to ids
    gene_ids_for_symbols: List[int] = []
    try:
        gene_reply = gene_api.gene_metadata_by_tax_and_symbol(
            symbols=symbols, taxon=taxon, returned_content="IDS_ONLY"
        )
        # Treat a reply whose gene list is missing or empty as "nothing matched"
        # instead of failing while iterating over it.
        # NOTE(review): how the client represents "no match" is not visible
        # here -- confirm against the library documentation.
        matched_genes = getattr(gene_reply, "genes", None) or []
        gene_ids_for_symbols = [int(gene_rec.gene.gene_id) for gene_rec in matched_genes]
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

    # Stop with a clear message rather than requesting a package for zero genes.
    if not gene_ids_for_symbols:
        sys.exit(f"No NCBI Gene IDs found for symbols {symbols} (taxon: {taxon})\n")

    # download the data package through the api-client.
    try:
        gene_ds_download = gene_api.download_gene_package(
            gene_ids_for_symbols,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        # save the file
        with open(zipfile_name, "wb") as f:
            f.write(gene_ds_download.data)
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

# open the package zip archive so we can retrieve files from it
package = dataset.GeneDataset(zipfile_name)

# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# get the data report and print the id and symbol for each downloaded gene
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")
Using Transcript or Protein accessions
datasets download gene accession NM_020107.5 NP_001334352.2
To get started with the Python library, see the Datasets Python API reference documentation.
First transform the transcript or protein accessions into NCBI Gene IDs using the gene_metadata_by_accession()
method from ncbi-datasets-pylib.
Next, download the data package for those Gene IDs using the download_gene_package method. Lastly, open the zip file and extract some data from the genomic fasta and data report files using the
GeneDataset
class in ncbi.datasets.package.dataset.
import sys
from typing import List
from ncbi.datasets.openapi import ApiClient as DatasetsApiClient
from ncbi.datasets.openapi import ApiException as DatasetsApiException
from ncbi.datasets import GeneApi as DatasetsGeneApi
from ncbi.datasets.package import dataset
# Provide accessions as a list of strings
accessions: List[str] = ["NM_020107.5", "NP_001334352.2"]
zipfile_name = "gene_ds.zip"

with DatasetsApiClient() as api_client:
    gene_api = DatasetsGeneApi(api_client)

    # download only takes NCBI gene-ids so call gene_metadata_by_accession() to find ids for the selected accessions
    gene_ids_for_accessions: List[int] = []
    try:
        gene_reply = gene_api.gene_metadata_by_accession(
            accessions=accessions, returned_content="IDS_ONLY"
        )
        # Treat a reply whose gene list is missing or empty as "nothing matched"
        # instead of failing while iterating over it.
        # NOTE(review): how the client represents "no match" is not visible
        # here -- confirm against the library documentation.
        matched_genes = getattr(gene_reply, "genes", None) or []
        gene_ids_for_accessions = [int(gene_rec.gene.gene_id) for gene_rec in matched_genes]
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

    # Stop with a clear message rather than requesting a package for zero genes.
    if not gene_ids_for_accessions:
        sys.exit(f"No NCBI Gene IDs found for accessions {accessions}\n")

    # download the data package through the api-client.
    try:
        gene_ds_download = gene_api.download_gene_package(
            gene_ids_for_accessions,
            include_annotation_type=["FASTA_GENE", "FASTA_PROTEIN"],
            _preload_content=False,
        )

        # save the file
        with open(zipfile_name, "wb") as f:
            f.write(gene_ds_download.data)
    except DatasetsApiException as e:
        sys.exit(f"Exception when calling GeneApi: {e}\n")

# open the package zip archive so we can retrieve files from it
package = dataset.GeneDataset(zipfile_name)

# print the names and types of all files in the downloaded zip file
print(package.get_catalog())

# Use file types or names from the catalog to retrieve contents from specific files, e.g. nucleotide fasta
for nucleotide_fasta, file_name in package.get_files_by_type("GENOMIC_NUCLEOTIDE_FASTA"):
    print(file_name + ":\n", nucleotide_fasta[:100])

# get the data report and print the id and symbol for each downloaded gene
for report in package.get_data_reports():
    print(f"{report.gene_id}\t{report.symbol}")