Source code for aoptk.literature.databases.pmc

import json
import os
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any
from urllib.error import HTTPError
from urllib.parse import urlparse
import boto3
import pandas as pd
from Bio import Entrez
from botocore import UNSIGNED
from botocore.client import Config
from requests.adapters import MaxRetryError
from aoptk.literature.abstract import Abstract
from aoptk.literature.databases.ncbi import NCBI
from aoptk.literature.get_abstract import GetAbstract
from aoptk.literature.get_id import GetID
from aoptk.literature.get_metadata import GetMetadata
from aoptk.literature.get_pdf import GetPDF
from aoptk.literature.get_publication import GetPublication
from aoptk.literature.id import DOI
from aoptk.literature.id import ID
from aoptk.literature.id import PMCID
from aoptk.literature.id import PMID
from aoptk.literature.metadata import Metadata
from aoptk.literature.pdf import PDF
from aoptk.literature.publication import Publication
from aoptk.literature.query import Query
from aoptk.literature.utils import convert_image_format
from aoptk.literature.utils import remove_pmc_prefix

# Module-level side effect: configure Biopython's Entrez with the API key taken
# from the NCBI_API_KEY environment variable. os.environ.get yields None when
# the variable is unset, hence the type-ignore on the assignment.
Entrez.api_key = os.environ.get("NCBI_API_KEY")  # type: ignore[assignment]



[docs]
class PMC(GetPublication, GetPDF, GetID, GetAbstract, GetMetadata):
    """Class to get data from PMC based on a query.

    IDs, abstracts and metadata are obtained through an ``NCBI`` helper bound
    to the ``pmc`` database; full-text, JSON and PDF files are listed in and
    downloaded from the S3 bucket named by ``bucket``.
    """


[docs]
    # AWS region name handed to the S3 client constructor.
    aws_region = "us-east-1"


[docs]
    # S3 client built once, when the class body is executed, and shared by all
    # instances. signature_version=UNSIGNED makes botocore send requests
    # without signing them (anonymous access).
    s3 = boto3.client(
        "s3",
        config=Config(signature_version=UNSIGNED),
        region_name=aws_region,
    )


[docs]
    # Name of the S3 bucket that objects are listed in and downloaded from.
    bucket = "pmc-oa-opendata"


[docs]
    # Paginator for the list_objects_v2 operation.
    # NOTE(review): not referenced by any method in the visible source — confirm it is still needed.
    paginator = s3.get_paginator("list_objects_v2")



[docs]
    # Lower-case file suffixes treated as figure images; compared against the
    # lower-cased object key with str.endswith.
    image_extensions = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".tiff", ".tif")


[docs]
    # Target format passed to convert_image_format for the downloaded figures.
    unified_image_format = "png"


    def __init__(
        self,
        storage: Path,
        figure_storage: Path,
        query: Query | None = None,
    ) -> None:
        """Set up the search term, the NCBI helper and the output directories.

        Args:
            storage (Path): Directory for downloaded files and extracted text.
                Created, including parents, if it does not exist.
            figure_storage (Path): Directory for downloaded figures.
                Created, including parents, if it does not exist.
            query (Query | None): Search parameters. When omitted (or falsy),
                a placeholder query with the search term "queryblank" is used.
        """
        if not query:
            query = Query(search_term="queryblank")

        self.search_term = self.build_search_term(query)
        self._ncbi = NCBI(database="pmc")

        self.storage = storage
        Path(self.storage).mkdir(parents=True, exist_ok=True)

        self.figure_storage = figure_storage
        Path(self.figure_storage).mkdir(parents=True, exist_ok=True)


[docs]
    def build_search_term(self, query: Query) -> str:
        """Convert Query to PMC search syntax."""
        search_term = query.search_term
        if query.full_text_subset:
            search_term += " open access[filter]"
        if query.only_preprint:
            search_term += " ahead of print[filter]"
        if query.exclude_preprint:
            search_term += " NOT ahead of print[filter]"
        if query.date:
            search_term += f" {query.date[0]}/{query.date[1]}/{query.date[2]}[dp]"
        if query.licensing:
            search_term += self._get_license_filter(query.licensing)
        return search_term



[docs]
    def _get_license_filter(self, licensing: str) -> str:
        """Get the license filter string for a given licensing type.

        Args:
            licensing (str): The licensing type.

        Returns:
            str: The license filter string for PMC search.
        """
        license_map = {
            "open-access": ' "open access"[filter]',
            "CC0": ' "cc0 license"[filter]',
            "CC-BY": ' "cc by license"[filter]',
            "CC-BY-SA": ' "cc by-nc-sa license"[filter]',
            "CC-BY-ND": ' "cc by-nd license"[filter]',
            "CC-BY-NC": ' "cc by-nc license"[filter]',
            "CC-BY-NC-ND": ' "cc by-nc-nd license"[filter]',
            "CC-BY-NC-SA": ' "cc by-nc-sa license"[filter]',
        }
        return license_map.get(licensing, "")



[docs]
    def get_pdfs(self, ids: list[ID]) -> list[PDF]:
        """Retrieve PDFs.

        Returns:
            list[PDF]: A list of PDF objects.
        """
        pdfs = []
        for publication_id in ids:
            try:
                if pdf := self._get_pdf(publication_id):
                    pdfs.append(pdf)
            except (HTTPError, MaxRetryError):
                continue
        return pdfs



[docs]
    def get_publications(self, ids: list[ID], download_figures_enabled: bool = True) -> list[Publication]:
        """Get a list of publications.

        Args:
            ids (list[ID]): A list of publication IDs to retrieve.
            download_figures_enabled (bool): Whether to download figures
            and include their paths in the Publication objects.

        Returns:
            list[Publication]: A list of Publication objects.
        """
        publications = []
        for publication_id in ids:
            try:
                if publication := self._get_publication(publication_id, download_figures_enabled):
                    publications.append(publication)
                    with (Path(self.storage) / f"{publication.id}.txt").open("w", encoding="utf-8") as f:
                        f.write(publication.full_text)
            except (HTTPError, MaxRetryError):
                continue
        return publications



[docs]
    def get_ids(self) -> list[ID]:
        """Retrieve a list of publication IDs based on the search term."""
        ids = self._ncbi.get_ids(self.search_term)
        return [ID(f"PMC{pmcid}") for pmcid in ids]



[docs]
    def get_abstracts(self, ids: list[ID]) -> list[Abstract]:
        """Retrieve Abstracts based on the list of IDs."""
        abstracts = []
        try:
            records = self._ncbi.get_abstract_records(ids)
            abstracts = self._parse_pmc_abstract_records(records)
            for abstract in abstracts:
                with (Path(self.storage) / f"{abstract.id}.txt").open("w", encoding="utf-8") as f:
                    f.write(abstract.text)
        except (HTTPError, MaxRetryError):
            pass
        return abstracts



[docs]
    def _parse_pmc_abstract_records(self, records: list[Any]) -> list[Abstract]:
        """Parse PMC abstract handles and return a list of Abstract objects.

        Args:
            records (list[Any]): A list of PMC Entrez fetch handles.
        """
        abstracts: list[Abstract] = []
        for record in records:
            root = ET.fromstring(record)
            for article in root.findall(".//article"):
                pmc_id = article.findtext(".//article-id")
                if not pmc_id or (abstract_node := article.find(".//abstract")) is None:
                    continue
                abstract_text = " ".join(" ".join(abstract_node.itertext()).split())
                abstracts.append(Abstract(text=abstract_text, id=ID(pmc_id)))
        return abstracts



[docs]
    def get_publications_metadata(self, ids: list[ID]) -> list[Metadata]:
        """Fetch and parse metadata for the given publication IDs.

        The IDs are passed through ``remove_pmc_prefix`` before the lookup.

        Args:
            ids (list[ID]): A list of publication IDs for which to retrieve metadata.

        Returns:
            list[Metadata]: The parsed metadata, or an empty list when the
            lookup raises HTTPError or MaxRetryError.
        """
        try:
            unprefixed_ids = remove_pmc_prefix(ids)
            records = self._ncbi.get_publications_metadata_records(unprefixed_ids)
            return self._parse_pmc_metadata_records(records)
        except (HTTPError, MaxRetryError):
            return []



[docs]
    def _parse_pmc_metadata_records(self, records: list[str]) -> list[Metadata]:
        """Parse PMC metadata records and return a list of PublicationMetadata objects.

        Args:
            records (list): A list of PMC XML summary payloads.
        """
        publications_metadata: list[Metadata] = []

        for record in records:
            root = ET.fromstring(record)
            for article in root.findall(".//DocSum"):
                if not (pmcid := article.findtext("./Item[@Name='ArticleIds']/Item[@Name='pmcid']")):
                    continue
                pmid = article.findtext("./Item[@Name='ArticleIds']/Item[@Name='pmid']")
                doi = article.findtext("./Item[@Name='ArticleIds']/Item[@Name='doi']")
                year = int(pub_date.split()[0]) if (pub_date := article.findtext("./Item[@Name='PubDate']")) else None
                title = article.findtext("./Item[@Name='Title']")
                authors = [
                    author.text
                    for author in article.findall("./Item[@Name='AuthorList']/Item[@Name='Author']")
                    if author.text
                ]
                publications_metadata.append(
                    Metadata(
                        id=ID(pmcid),
                        pmcid=PMCID(pmcid),
                        pmid=PMID(pmid) if pmid else None,
                        doi=DOI(doi) if doi else None,
                        year=year,
                        title=title,
                        authors=authors,
                    ),
                )
        return publications_metadata



[docs]
    def _get_publication(self, publication_id: ID, download_figures_enabled: bool = True) -> Publication | None:
        """Assemble a Publication from the downloaded full text and figures.

        The abstract is an empty placeholder carrying only the ID; figure
        descriptions and tables are always empty lists.

        Args:
            publication_id (ID): The publication ID to retrieve.
            download_figures_enabled (bool): Whether to download figures
            and include their paths in the Publication object.

        Returns:
            Publication | None: The assembled publication, or None when no full
            text is available for the ID.
        """
        placeholder_abstract = Abstract(id=publication_id, text="")

        full_text = self._get_full_text(publication_id)
        if full_text is None:
            return None

        if download_figures_enabled:
            figures = self._get_figures(publication_id)
        else:
            figures = []

        no_descriptions: list[str] = []
        no_tables: list[pd.DataFrame] = []
        return Publication(
            id=publication_id,
            abstract=placeholder_abstract,
            full_text=full_text,
            figures=figures,
            figure_descriptions=no_descriptions,
            tables=no_tables,
        )



[docs]
    def _get_full_text(self, publication_id: ID) -> str | None:
        """Retrieve the full text for a given publication ID.

        Args:
            publication_id (str): The publication ID to retrieve the full text for.
        """
        if txt_path := self._get_file(publication_id, "txt"):
            with Path.open(txt_path, encoding="utf-8") as f:
                txt = f.read()
            Path.unlink(txt_path)
            return txt
        return None



[docs]
    def _get_file(self, publication_id: ID, file_format: str) -> Path | None:
        """Retrieve the file for a given publication ID and format.

        Args:
            publication_id (str): The publication ID to retrieve the file for.
            file_format (str): The format of the file to retrieve (pdf, xml, json, or txt).
            Formats txt, xml, pdf contain full-text, while json contains metadata.
        """
        prefix = f"{publication_id}.1/{publication_id}.1.{file_format}"
        response = self.s3.list_objects_v2(Bucket=self.bucket, Prefix=prefix, MaxKeys=1)
        if contents := response.get("Contents", []):
            if key := contents[0]["Key"]:
                filepath = Path(self.storage) / f"{publication_id}.{file_format}"
                self.s3.download_file(self.bucket, key, str(filepath))
                return filepath
            return None
        return None



[docs]
    def _get_figures(self, publication_id: ID) -> list[Path]:
        """Retrieve the figure files for a given publication ID.

        Args:
            publication_id (ID): The publication ID to retrieve the figure files for.
        """
        if metadata := self._get_json(publication_id):
            supplementary_files = metadata.get("media_urls", [])
            return self._extract_figures_from_supplements(publication_id, supplementary_files)

        return []



[docs]
    def _extract_figures_from_supplements(self, publication_id: ID, supplementary_files: list[str]) -> list[Path]:
        """Download the image files among the supplementary files.

        A URL counts as an image when its path, lower-cased, ends with one of
        ``image_extensions``. The URL path without leading slashes is used as
        the object key in the bucket. Images are saved under
        ``<figure_storage>/<publication_id>/`` using the file name from the URL,
        then handed to ``convert_image_format`` with ``unified_image_format``.

        Args:
            publication_id (ID): The publication ID to retrieve the figure files for.
            supplementary_files (list[str]): A list of supplementary file URLs to extract figures from.

        Returns:
            list[Path]: Whatever ``convert_image_format`` returns for the
            downloaded image paths.
        """
        figure_dir = Path(self.figure_storage) / f"{publication_id}"
        figure_dir.mkdir(parents=True, exist_ok=True)

        downloaded: list[Path] = []
        for url in supplementary_files:
            url_path = urlparse(url).path
            object_key = url_path.lstrip("/")
            if not object_key.lower().endswith(self.image_extensions):
                continue
            destination = figure_dir / Path(url_path).name
            destination.parent.mkdir(parents=True, exist_ok=True)
            self.s3.download_file(self.bucket, object_key, str(destination))
            downloaded.append(destination)
        return convert_image_format(downloaded, self.unified_image_format)



[docs]
    def _get_json(self, publication_id: ID) -> dict[str, Any] | None:
        """Retrieve the json for a given publication ID.

        Args:
            publication_id (str): The publication ID to retrieve the json for.
        """
        if json_path := self._get_file(publication_id, "json"):
            metadata = json.load(json_path.open())
            Path.unlink(json_path)
            return metadata
        return None



[docs]
    def _get_pdf(self, publication_id: ID) -> PDF | None:
        """Retrieve the PDF for a given publication ID.

        Args:
            publication_id (str): The publication ID to retrieve the PDF for.
        """
        if pdf_path := self._get_file(publication_id, "pdf"):
            return PDF(pdf_path)
        return None