Source code for aoptk.literature.databases.europepmc

from __future__ import annotations
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import ClassVar
from urllib.error import HTTPError
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from requests.adapters import MaxRetryError
from urllib3.util.retry import Retry
from aoptk.literature.abstract import Abstract
from aoptk.literature.get_abstract import GetAbstract
from aoptk.literature.get_id import GetID
from aoptk.literature.get_metadata import GetMetadata
from aoptk.literature.get_pdf import GetPDF
from aoptk.literature.get_publication import GetPublication
from aoptk.literature.id import DOI
from aoptk.literature.id import ID
from aoptk.literature.id import PMCID
from aoptk.literature.id import PMID
from aoptk.literature.metadata import Metadata
from aoptk.literature.pdf import PDF
from aoptk.literature.publication import Publication
from aoptk.literature.query import Query
from aoptk.literature.utils import convert_image_format
from aoptk.literature.utils import is_europepmc_id



[docs]
class EuropePMC(GetAbstract, GetPDF, GetID, GetPublication, GetMetadata):
    """Class to get data from Europe PMC based on a query.

    The class talks to the Europe PMC REST search endpoint to resolve IDs,
    abstracts and metadata, and downloads PDFs, full-text XML and
    supplementary archives for individual publications. Downloaded text and
    PDFs are written below ``storage``; extracted figures are written below
    ``figure_storage``. All HTTP traffic goes through one shared
    ``requests.Session`` configured with a retry strategy.
    """


[docs]
    # Number of results requested per search call (sent as ``pageSize``).
    page_size = 1000


[docs]
    # Timeout passed to every request issued through the session.
    timeout = 30


[docs]
    # Default headers installed on the shared session in ``__init__``.
    # NOTE(review): "br" is advertised in Accept-Encoding; presumably this
    # relies on a brotli decoder being installed alongside requests - confirm.
    headers: ClassVar = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Cache-Control": "max-age=0",
    }


[docs]
    # File suffixes (matched against the lower-cased archive member name)
    # that ``_get_figures`` treats as figure images.
    image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".tif")


[docs]
    # Target format handed to ``convert_image_format`` for extracted figures.
    unified_image_format = "png"


    def __init__(
        self,
        storage: Path,
        figure_storage: Path,
        query: Query | None = None,
    ):
        """Set up storage directories, the search term and the HTTP session.

        Args:
            storage (Path): Directory for downloaded text, XML, PDF and ZIP files.
            figure_storage (Path): Directory below which figures are extracted.
            query (Query | None): Search definition; a falsy value is replaced
                by a query with an empty search term.
        """
        effective_query = query if query else Query(search_term="")
        self.search_term = self.build_search_term(effective_query)

        self.storage = storage
        self.figure_storage = figure_storage
        for directory in (self.storage, self.figure_storage):
            Path(directory).mkdir(parents=True, exist_ok=True)

        # One session for every request so headers and retries apply uniformly.
        self._session = requests.Session()
        self._session.headers.update(self.headers)

        self.retry_strategy = Retry(
            total=10,
            backoff_factor=3,
            status_forcelist=[429, 408, 500, 502, 503, 504],
            allowed_methods=["GET", "POST"],
        )
        self.adapter = HTTPAdapter(max_retries=self.retry_strategy)
        self._session.mount("https://", self.adapter)


[docs]
    def build_search_term(self, query: Query) -> str:
        """Translate a ``Query`` into a Europe PMC search string.

        The base search term is followed by one clause per enabled option,
        in this order: full-text subset, preprint-only, preprint exclusion,
        publication date and license filter.

        Args:
            query (Query): The query whose fields are converted.

        Returns:
            str: The assembled search string.
        """
        pieces = [query.search_term]
        pieces.extend(
            clause
            for enabled, clause in (
                (query.full_text_subset, " HAS_FT:Y"),
                (query.only_preprint, " SRC:PPR"),
                (query.exclude_preprint, " NOT SRC:PPR"),
            )
            if enabled
        )
        if query.date:
            pieces.append(f" E_PDATE:{query.date[0]}-{query.date[1]}-{query.date[2]}")
        if query.licensing:
            pieces.append(self._get_license_filter(query.licensing))
        return "".join(pieces)



[docs]
    def update_retry_strategy(self, strategy: Retry) -> None:
        """Replace the retry strategy used for subsequent HTTPS requests.

        A new adapter is built around the strategy, stored on the instance
        and mounted on the session for the ``https://`` prefix.

        Args:
            strategy (Retry): Strategy to use.
        """
        replacement_adapter = HTTPAdapter(max_retries=strategy)
        self.retry_strategy = strategy
        self.adapter = replacement_adapter
        self._session.mount("https://", replacement_adapter)



[docs]
    def _get_license_filter(self, licensing: str) -> str:
        """Get the license filter string for a given licensing type.

        Args:
            licensing (str): The licensing type.

        Returns:
            str: The license filter string for Europe PMC search.
        """
        license_map = {
            "open-access": " LICENSE:CC",
            "CC0": " LICENSE:CC0",
            "CC-BY": " LICENSE:CC-BY",
            "CC-BY-SA": " LICENSE:CC-BY-SA",
            "CC-BY-ND": " LICENSE:CC-BY-ND",
            "CC-BY-NC": " LICENSE:CC-BY-NC",
            "CC-BY-NC-ND": " LICENSE:CC-BY-NC-ND",
            "CC-BY-NC-SA": " LICENSE:CC-BY-NC-SA",
        }
        return license_map.get(licensing, "")



[docs]
    def get_pdfs(self, ids: list[ID]) -> list[PDF]:
        """Retrieve PDFs for every ID accepted by ``is_europepmc_id``.

        Downloads are best-effort: an ID whose download fails with an HTTP
        or connection error is skipped and the remaining IDs are processed.

        Args:
            ids (list[ID]): Publication IDs to download.

        Returns:
            list[PDF]: One entry per successfully downloaded PDF.
        """
        pdfs = []
        pmc_ids = filter(is_europepmc_id, ids)
        for publication_id in pmc_ids:
            try:
                pdf = self._get_pdf(publication_id)
                pdfs.append(pdf)
            except (
                HTTPError,
                MaxRetryError,
                # These are what the requests session actually raises:
                # raise_for_status() -> requests HTTPError; exhausted retries
                # -> RetryError / ConnectionError (urllib3 errors are wrapped).
                requests.exceptions.HTTPError,
                requests.exceptions.RetryError,
                requests.exceptions.ConnectionError,
            ):
                continue
        return pdfs



[docs]
    def get_abstracts(self, ids: list[ID]) -> list[Abstract]:
        """Retrieve abstracts and store each one as ``<id>.txt`` in ``storage``.

        Retrieval is best-effort: an ID whose lookup fails with an HTTP or
        connection error, or that has no abstract, is skipped.

        Args:
            ids (list[ID]): Publication IDs to look up.

        Returns:
            list[Abstract]: The abstracts that could be retrieved.
        """
        abstracts = []
        for publication_id in ids:
            try:
                if abstract := self._get_abstract(publication_id):
                    abstracts.append(abstract)
                    with (Path(self.storage) / f"{abstract.id}.txt").open("w", encoding="utf-8") as f:
                        f.write(abstract.text)
            except (
                HTTPError,
                MaxRetryError,
                # These are what the requests session actually raises:
                # raise_for_status() -> requests HTTPError; exhausted retries
                # -> RetryError / ConnectionError (urllib3 errors are wrapped).
                requests.exceptions.HTTPError,
                requests.exceptions.RetryError,
                requests.exceptions.ConnectionError,
            ):
                continue
        return abstracts



[docs]
    def get_publications(self, ids: list[ID], download_figures_enabled: bool = True) -> list[Publication]:
        """Retrieve publications and store each full text as ``<id>.txt``.

        Retrieval is best-effort: an ID whose download fails with an HTTP or
        connection error, or for which no publication is available, is skipped.

        Args:
            ids (list[ID]): A list of publication IDs to retrieve.
            download_figures_enabled (bool): Whether to download figures and
                include their paths in the Publication objects.

        Returns:
            list[Publication]: The publications that could be retrieved.
        """
        publications = []
        for publication_id in ids:
            try:
                if publication := self._get_publication(publication_id, download_figures_enabled):
                    publications.append(publication)
                    with (Path(self.storage) / f"{publication.id}.txt").open("w", encoding="utf-8") as f:
                        f.write(publication.full_text)
            except (
                HTTPError,
                MaxRetryError,
                # These are what the requests session actually raises:
                # raise_for_status() -> requests HTTPError; exhausted retries
                # -> RetryError / ConnectionError (urllib3 errors are wrapped).
                requests.exceptions.HTTPError,
                requests.exceptions.RetryError,
                requests.exceptions.ConnectionError,
            ):
                continue
        return publications



[docs]
    def get_publications_metadata(self, ids: list[ID]) -> list[Metadata]:
        """Retrieve publication metadata.

        Retrieval is best-effort: an ID whose lookup fails with an HTTP or
        connection error, or that yields no metadata, is skipped.

        Args:
            ids (list[ID]): Publication IDs to look up.

        Returns:
            list[Metadata]: Metadata for the IDs that could be resolved.
        """
        metadata = []
        for publication_id in ids:
            try:
                if publication := self._get_publication_metadata(publication_id):
                    metadata.append(publication)
            except (
                HTTPError,
                MaxRetryError,
                # These are what the requests session actually raises:
                # raise_for_status() -> requests HTTPError; exhausted retries
                # -> RetryError / ConnectionError (urllib3 errors are wrapped).
                requests.exceptions.HTTPError,
                requests.exceptions.RetryError,
                requests.exceptions.ConnectionError,
            ):
                continue
        return metadata



[docs]
    def get_ids(self) -> list[ID]:
        """Collect the IDs of all publications matching ``self.search_term``.

        Result pages are requested one after another, following the
        ``nextCursorMark`` returned by the API until it is absent or no
        longer changes.

        Returns:
            list[ID]: IDs of every result, in the order the API returned them.
        """
        collected = []
        cursor = "*"

        while True:
            page = self._call_api(cursor, "idlist", self.search_term)
            hits = page.get("resultList", {}).get("result", [])
            collected.extend([_get_publication_id(hit) for hit in hits])

            upcoming = page.get("nextCursorMark")
            if upcoming and upcoming != cursor:
                cursor = upcoming
                continue
            return collected



[docs]
    def _get_pdf(self, publication_id: ID) -> PDF:
        """Retrieve the PDF for a given publication ID.

        Args:
            publication_id (ID): The ID of the publication for which to retrieve the PDF.

        Returns:
            PDF | None: The PDF object if successful, None otherwise.
        """
        response = self._session.get(
            f"https://europepmc.org/api/getPdf?pmcid={publication_id}",
            stream=True,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return self._write_pdf(publication_id, response)



[docs]
    def _write_pdf(self, publication_id: ID, response: requests.Response) -> PDF:
        """Stream the response body into ``<storage>/<publication_id>.pdf``.

        Args:
            publication_id (ID): The ID of the publication for which the PDF is being written.
            response (requests.Response): The HTTP response containing the PDF content.

        Returns:
            PDF: A PDF object created from the written file path.
        """
        target = Path(self.storage) / f"{publication_id}.pdf"
        with target.open("wb") as sink:
            # Written in 8 KiB chunks so the file is never held in memory whole.
            for chunk in response.iter_content(chunk_size=8192):
                sink.write(chunk)
        return PDF(target)



[docs]
    def _get_abstract(self, publication_id: ID) -> Abstract | None:
        """Return abstract from Europe PMC for a given publication ID.

        Args:
            publication_id (ID): The ID of the publication for which to retrieve the abstract.

        Returns:
            Abstract: The abstract object if successful, None otherwise.
        """
        cursor_mark = "*"

        json_data = self._call_api(cursor_mark, "core", publication_id)
        results = json_data.get("resultList", {}).get("result", [])

        if text := results[0].get("abstractText", None):
            return Abstract(text=text, id=publication_id)
        return None



[docs]
    def _call_api(self, cursor_mark: str, result_type: str, query: str | ID) -> dict:
        """Call the EuropePMC web api to query the search.

        Args:
            cursor_mark (str): Parameter for pagination.
            result_type (str): Whether to search for idlists or core.
            query (str | ID): main query to carry out - default self._query

        Returns:
            dict: JSON response
        """
        url = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
        params = {
            "query": str(query),
            "format": "json",
            "pageSize": self.page_size,
            "cursorMark": cursor_mark,
            "resultType": result_type,
        }
        response = self._session.get(url, params=params, timeout=self.timeout)
        response.raise_for_status()
        return response.json()



[docs]
    def _get_publication_metadata(self, publication_id: ID) -> Metadata | None:
        """Return abstract from Europe PMC for a given publication ID.

        Args:
            publication_id (ID): The ID of the publication to retrieve metadata for.
        """
        cursor_mark = "*"

        json_data = self._call_api(cursor_mark, "core", publication_id)
        results = json_data.get("resultList", {}).get("result", [])

        if results:
            pmcid = results[0].get("pmcid", None)
            pmid = results[0].get("pmid", None)
            doi = results[0].get("doi", None)
            year = int(year) if (year := results[0].get("pubYear", None)) else None
            title = results[0].get("title", None)
            if authors := results[0].get("authorString", None):
                authors = [author.strip().rstrip(".") for author in authors.split(",") if author.strip()]
            else:
                authors = []
            return Metadata(
                id=publication_id,
                pmcid=PMCID(pmcid) if pmcid else None,
                pmid=PMID(pmid) if pmid else None,
                doi=DOI(doi) if doi else None,
                year=year,
                title=title,
                authors=authors,
            )
        return None



[docs]
    def _get_publication(self, publication_id: ID, download_figures_enabled: bool = True) -> Publication | None:
        """Return a Publication object for a given publication ID.

        Args:
            publication_id (ID): The ID of the publication to retrieve.
            download_figures_enabled (bool): Whether to download figures
            and include their paths in the Publication object.
        """
        if root := self._get_xml(publication_id):
            return Publication(
                id=publication_id,
                abstract=Abstract(text=self._parse_xml_abstract(root), id=publication_id),
                full_text=self._parse_xml_full_text(root),
                figures=self._get_figures(publication_id) if download_figures_enabled else [],
                figure_descriptions=self._parse_xml_figure_descriptions(root) if download_figures_enabled else [],
                tables=self._parse_xml_tables(root),
            )
        return None



[docs]
    def _parse_xml_abstract(self, root: ET.Element) -> str:
        """Return the full text content of the first <abstract> element as a single string.

        Args:
            root (ET.Element): The root element of the XML tree.
        """
        if (abstract_elem := root.find(".//abstract")) is not None:
            return " ".join(abstract_elem.itertext()).strip()
        return ""



[docs]
    def _parse_xml_full_text(self, root: ET.Element) -> str:
        """Parse the XML content to extract the full text.

        Args:
            root (ET.Element): The root element of the XML tree.
        """
        lines = []

        for element in root.iter():
            if element.tag in {"title", "p"}:
                text = "".join(element.itertext()).strip()
                if text:
                    lines.append(text)

        return "\n\n".join(lines)



[docs]
    def _parse_xml_figure_descriptions(self, root: ET.Element) -> str:
        """Parse the XML content to extract the figure descriptions.

        Args:
            root (ET.Element): The root element of the XML tree.
        """
        lines = []

        for element in root.iter():
            if element.tag == "fig":
                text = "".join(element.itertext()).strip()
                if text:
                    lines.append(text)

        return "\n\n".join(lines)



[docs]
    def _parse_xml_tables(self, root: ET.Element) -> list[pd.DataFrame]:
        """Parse the XML content to extract tables as a list of DataFrames, preserving order.

        Args:
            root (ET.Element): The root element of the XML tree.
        """
        tables = []
        for element in root.iter():
            if element.tag == "table-wrap" and (table_elem := element.find(".//table")) is not None:
                rows = self._extract_rows(table_elem)
                df = pd.DataFrame(rows)
                tables.append(df)
        return tables



[docs]
    def _extract_rows(self, table_elem: ET.Element) -> list[list[str]]:
        """Extract rows from a table element, preserving order.

        Args:
            table_elem (ET.Element): The XML element representing the table.
        """
        rows = []
        for row in table_elem.findall(".//tr"):
            cells = ["".join(cell.itertext()).strip() for cell in row.findall(".//td")]
            if not cells:
                cells = ["".join(cell.itertext()).strip() for cell in row.findall(".//th")]
            rows.append(cells)
        return rows



[docs]
    def _get_xml(self, publication_id: ID) -> ET.Element | None:
        """Retrieve the full-text XML root element for a given publication ID.

        Args:
            publication_id (ID): The ID of the publication to retrieve XML for.

        Returns:
            ET.Element | None: The parsed root element, or None when the ID
            is rejected by ``is_europepmc_id``.

        Raises:
            requests.exceptions.HTTPError: If the server answers with an
                error status.
            xml.etree.ElementTree.ParseError: If the response is not
                well-formed XML.
        """
        if not is_europepmc_id(publication_id):
            return None
        response = self._session.get(
            f"https://www.ebi.ac.uk/europepmc/webservices/rest/{publication_id}/fullTextXML",
            timeout=self.timeout,
        )
        response.raise_for_status()
        # Parse the raw bytes directly so the parser honours the document's
        # own encoding declaration; this also avoids a temporary file that
        # would be left behind when parsing fails.
        return ET.fromstring(response.content)



[docs]
    def _get_figures(self, publication_id: ID) -> list[Path]:
        """Retrieve the figure file paths for a given publication ID.

        The supplementary ZIP is downloaded, every member whose lower-cased
        name ends with one of ``image_extensions`` is extracted to
        ``<figure_storage>/<publication_id>``, the ZIP is deleted, and the
        extracted images are passed to ``convert_image_format``.

        Args:
            publication_id (ID): The ID of the publication to retrieve figures for.

        Returns:
            list[Path]: The result of ``convert_image_format`` for the
            extracted images, or an empty list when no ZIP is available.
        """
        zip_path = self._get_supplementary_zip_path(publication_id)
        if zip_path is None:
            return []

        base_dir = Path(self.figure_storage) / f"{publication_id}"
        base_dir.mkdir(parents=True, exist_ok=True)
        image_paths = []
        try:
            with zipfile.ZipFile(zip_path, "r") as zip_ref:
                for file_info in zip_ref.infolist():
                    if file_info.filename.lower().endswith(self.image_extensions):
                        # extract() returns the path it actually wrote, which
                        # can differ from the raw member name after sanitising.
                        image_paths.append(Path(zip_ref.extract(file_info, base_dir)))
        finally:
            # Remove the downloaded archive even when it is corrupt.
            zip_path.unlink()
        return convert_image_format(image_paths, self.unified_image_format)



[docs]
    def _get_supplementary_zip_path(self, publication_id: ID) -> Path | None:
        """Download the supplementary files ZIP and return the path to it.

        The archive is stored as ``<storage>/<publication_id>_supplementary.zip``.

        Args:
            publication_id (ID): The ID of the publication to retrieve supplementary files for.

        Returns:
            Path | None: Path of the written ZIP file, or None when the ID is
            rejected by ``is_europepmc_id``.
        """
        if not is_europepmc_id(publication_id):
            return None

        destination = Path(self.storage) / f"{publication_id}_supplementary.zip"
        archive_url = f"https://www.ebi.ac.uk/europepmc/webservices/rest/{publication_id}/supplementaryFiles"
        response = self._session.get(archive_url, stream=True, timeout=self.timeout)
        response.raise_for_status()
        with destination.open("wb") as sink:
            sink.write(response.content)
        return destination





[docs]
def _get_publication_id(result: dict) -> ID:
    """Extract the publication ID from the API result, checking for 'pmcid', 'pmid', and 'id' in order.

    Args:
    result (dict): The API result containing publication information.
    """
    if publication_id := result.get("pmcid") or result.get("pmid") or result.get("id"):
        return ID(publication_id)
    msg = "Europe PMC result is missing a publication id"
    raise ValueError(msg)