from __future__ import annotations
import xml.etree.ElementTree as ET
import zipfile
from datetime import UTC
from datetime import datetime
from pathlib import Path
from typing import ClassVar
import pandas as pd
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from aoptk.literature.abstract import Abstract
from aoptk.literature.get_abstract import GetAbstract
from aoptk.literature.get_id import GetID
from aoptk.literature.get_pdf import GetPDF
from aoptk.literature.get_publication import GetPublication
from aoptk.literature.get_publication_metadata import GetPublicationMetadata
from aoptk.literature.id import ID
from aoptk.literature.pdf import PDF
from aoptk.literature.publication import Publication
from aoptk.literature.publication_metadata import PublicationMetadata
from aoptk.literature.utils import is_europepmc_id
[docs]
class EuropePMC(GetAbstract, GetPDF, GetID, GetPublication, GetPublicationMetadata):
"""Class to get publication IDs, abstracts, PDFs and full-text publications from EuropePMC based on a query."""
[docs]
# File name suffixes treated as images. NOTE(review): the code that reads this
# tuple is not visible in this section — presumably figure extraction; confirm.
image_extensions = (".jpg", ".jpeg", ".png", ".bmp", ".tiff")
def __init__(
    self,
    query: str,
    storage: str,
    figure_storage: str,
):
    """Store the search settings, prepare the HTTP session and resolve the query.

    Args:
        query (str): Search query sent to the Europe PMC search endpoint.
        storage (str): Directory for downloaded files; created if missing.
        figure_storage (str): Directory reserved for figures; created if missing.
    """
    # Keep the constructor arguments on the instance: the directory setup
    # below and the retrieval methods read them through ``self``.
    self._query = query
    self.storage = storage
    self.figure_storage = figure_storage
    Path(self.storage).mkdir(parents=True, exist_ok=True)
    Path(self.figure_storage).mkdir(parents=True, exist_ok=True)

    # One shared session so that headers and the retry policy apply to
    # every request issued by this instance.
    self._session = requests.Session()
    self._session.headers.update(self.headers)
    # Retry rate-limit and server-error responses with a growing delay
    # instead of failing on the first transient error.
    retry_strategy = Retry(
        total=10,
        backoff_factor=3,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET", "POST"],
    )
    adapter = HTTPAdapter(max_retries=retry_strategy)
    self._session.mount("https://", adapter)

    # Resolve the query once; the get_* methods iterate over this list.
    self.id_list = self.get_ids()
[docs]
def get_pdfs(self) -> list[PDF]:
    """Download the PDF of every publication in ``id_list``.

    Returns:
        list[PDF]: The PDFs that could be retrieved, in ``id_list`` order;
        publications without a PDF are left out.
    """
    pdfs = []
    for publication_id in self.id_list:
        pdf = self._get_pdf(publication_id)
        if pdf is not None:
            pdfs.append(pdf)
    return pdfs
[docs]
def get_abstracts(self) -> list[Abstract]:
    """Fetch the abstract of every publication in ``id_list``.

    Returns:
        list[Abstract]: The abstracts in ``id_list`` order, skipping any
        lookup that yields None.
    """
    fetched = map(self._get_abstract, self.id_list)
    return [abstract for abstract in fetched if abstract is not None]
[docs]
def get_publications(self) -> list[Publication]:
    """Build a Publication for every entry of ``id_list``.

    Returns:
        list[Publication]: The publications in ``id_list`` order; entries
        for which no publication could be built are left out.
    """
    publications = []
    for publication_id in self.id_list:
        publication = self._get_publication(publication_id)
        if publication is None:
            continue
        publications.append(publication)
    return publications
[docs]
def get_ids(self) -> list[ID]:
    """Collect the IDs of all publications matching the current query.

    The search is paged with a cursor: each response names the cursor of
    the following page, and paging stops when no next cursor is supplied
    or when it equals the cursor that was just used.

    Returns:
        list[ID]: One entry per search hit, in the order returned by the API.
    """
    collected = []
    cursor = "*"
    finished = False
    while not finished:
        page = self._call_api(cursor, "idlist", self._query)
        for hit in page.get("resultList", {}).get("result", []):
            collected.append(_get_publication_id(hit))
        following = page.get("nextCursorMark")
        # An absent or unchanged cursor means there is no further page.
        finished = not following or following == cursor
        cursor = following
    return collected
[docs]
def remove_reviews(self) -> EuropePMC:
    """Append a clause to the query that excludes review articles.

    Only the stored query is changed; ``id_list`` is not recomputed here.

    Returns:
        EuropePMC: This instance, so calls can be chained.
    """
    exclusion = ' NOT PUB_TYPE:"Review"'
    self._query = self._query + exclusion
    return self
[docs]
def abstracts_only(self) -> EuropePMC:
    """Wrap the query so that it is matched against abstract text only.

    Only the stored query is changed; ``id_list`` is not recomputed here.

    Returns:
        EuropePMC: This instance, so calls can be chained.
    """
    current = self._query
    self._query = "ABSTRACT:(" + current + ")"
    return self
[docs]
def _get_pdf(self, publication_id: str) -> PDF | None:
    """Retrieve the PDF for a given publication ID.

    Args:
        publication_id (str): The ID of the publication for which to retrieve the PDF.

    Returns:
        PDF | None: The stored PDF, or None when the ID is not a Europe PMC
        ID or the server does not answer with a success status.
    """
    if not is_europepmc_id(publication_id):
        return None
    # The body is streamed, so the connection only goes back to the pool once
    # the body is consumed or the response is closed. The context manager
    # closes it on every path, including the non-success one where the body
    # is never read.
    with self._session.get(
        f"https://europepmc.org/backend/ptpmcrender.fcgi?accid={publication_id}&blobtype=pdf",
        stream=True,
        timeout=self.timeout,
    ) as response:
        if not response.ok:
            return None
        return self._write_pdf(publication_id, response)
[docs]
def _write_pdf(self, publication_id: str, response: requests.Response) -> PDF:
    """Save the body of a PDF response under the storage directory.

    The file is named ``<publication_id>.pdf`` and is written chunk by chunk.

    Args:
        publication_id (str): The ID of the publication for which the PDF is being written.
        response (requests.Response): The HTTP response containing the PDF content.

    Returns:
        PDF: A PDF object pointing at the written file.
    """
    target = Path(self.storage) / f"{publication_id}.pdf"
    with target.open("wb") as handle:
        for chunk in response.iter_content(chunk_size=8192):
            handle.write(chunk)
    return PDF(target)
[docs]
def _get_abstract(self, publication_id: str) -> Abstract:
    """Look up the abstract of one publication through the search API.

    The publication ID itself is used as the search query and the first
    hit is taken.

    Args:
        publication_id (str): The ID of the publication for which to retrieve the abstract.

    Returns:
        Abstract: The abstract of the first hit; its text is empty when the
        search has no hits or the hit carries no ``abstractText``.
    """
    payload = self._call_api("*", "core", publication_id)
    hits = payload.get("resultList", {}).get("result", [])
    text = hits[0].get("abstractText", "") if hits else ""
    return Abstract(text=text, id=ID(publication_id))
[docs]
def _call_api(self, cursor_mark: str, result_type: str, query: str) -> dict:
    """Run one page of a search against the EuropePMC REST search endpoint.

    Args:
        cursor_mark (str): Cursor of the page to fetch.
        result_type (str): Value sent as ``resultType`` (callers here use "idlist" and "core").
        query (str): The search query to send.

    Returns:
        dict: The decoded JSON response.

    Raises:
        requests.HTTPError: If the response carries an error status.
    """
    search_params = {
        "query": query,
        "format": "json",
        "pageSize": self.page_size,
        "cursorMark": cursor_mark,
        "resultType": result_type,
    }
    endpoint = "https://www.ebi.ac.uk/europepmc/webservices/rest/search"
    reply = self._session.get(endpoint, params=search_params, timeout=self.timeout)
    reply.raise_for_status()
    return reply.json()
[docs]
def _get_publication(self, publication_id: str) -> Publication | None:
    """Assemble a Publication from the full-text XML of one publication.

    Args:
        publication_id (str): The ID of the publication to retrieve.

    Returns:
        Publication | None: The assembled publication, or None when no
        full-text XML could be obtained.
    """
    xml_tree = self._get_xml(publication_id)
    if not xml_tree:
        return None
    root = xml_tree.getroot()
    return Publication(
        id=publication_id,
        abstract=self._parse_xml_abstract(root),
        full_text=self._parse_xml_full_text(root),
        figures=self._get_figures(publication_id),
        figure_descriptions=self._parse_xml_figure_descriptions(root),
        tables=self._parse_xml_tables(root),
    )
[docs]
def _parse_xml_abstract(self, root: ET.Element) -> str:
"""Return the full text content of the first <abstract> element as a single string.
Args:
root (ET.Element): The root element of the XML tree.
"""
if (abstract_elem := root.find(".//abstract")) is not None:
return " ".join(abstract_elem.itertext()).strip()
return ""
[docs]
def _parse_xml_full_text(self, root: ET.Element) -> str:
"""Parse the XML content to extract the full text.
Args:
root (ET.Element): The root element of the XML tree.
"""
lines = []
for element in root.iter():
if element.tag in {"title", "p"}:
text = "".join(element.itertext()).strip()
if text:
lines.append(text)
return "\n\n".join(lines)
[docs]
def _parse_xml_tables(self, root: ET.Element) -> list[pd.DataFrame]:
"""Parse the XML content to extract tables as a list of DataFrames, preserving order.
Args:
root (ET.Element): The root element of the XML tree.
"""
tables = []
for element in root.iter():
if element.tag == "table-wrap" and (table_elem := element.find(".//table")) is not None:
rows = self._extract_rows(table_elem)
df = pd.DataFrame(rows)
tables.append(df)
return tables
[docs]
def _get_xml(self, publication_id: str) -> ET.ElementTree | None:
    """Retrieve and parse the full-text XML for a given publication ID.

    Args:
        publication_id (str): The ID of the publication to retrieve XML for.

    Returns:
        ET.ElementTree | None: The parsed document, or None when the ID is
        not a Europe PMC ID or the server does not answer with a success
        status.

    Raises:
        xml.etree.ElementTree.ParseError: If the response body is not
            well-formed XML.
    """
    if not is_europepmc_id(publication_id):
        return None
    with self._session.get(
        f"https://www.ebi.ac.uk/europepmc/webservices/rest/{publication_id}/fullTextXML",
        timeout=self.timeout,
    ) as response:
        if not response.ok:
            return None
        # Parse the raw bytes so the XML parser applies the encoding declared
        # by the document itself, rather than re-encoding text decoded with
        # the charset guessed from the HTTP headers. Parsing in memory also
        # avoids a temporary file that would be left behind when parsing fails.
        return ET.ElementTree(ET.fromstring(response.content))
[docs]
def _get_supplementary_zip_path(self, publication_id: str) -> Path | None:
    """Download the supplementary files ZIP for a given publication ID.

    The archive is stored as ``<publication_id>_supplementary.zip`` in the
    storage directory.

    Args:
        publication_id (str): The ID of the publication to retrieve supplementary files for.

    Returns:
        Path | None: The path of the downloaded ZIP, or None when the ID is
        not a Europe PMC ID or the server does not answer with a success
        status.
    """
    if not is_europepmc_id(publication_id):
        return None
    zip_path = Path(self.storage) / f"{publication_id}_supplementary.zip"
    # The context manager closes the streamed response on every path, so the
    # connection is released even when the body is never read.
    with self._session.get(
        f"https://www.ebi.ac.uk/europepmc/webservices/rest/{publication_id}/supplementaryFiles",
        stream=True,
        timeout=self.timeout,
    ) as response:
        if not response.ok:
            return None
        # Write chunk by chunk so the whole archive is never held in memory.
        with zip_path.open("wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return zip_path
[docs]
def _get_publication_id(result: dict) -> str | None:
"""Extract the publication ID from the API result, checking for 'pmcid', 'pmid', and 'id' in order.
Args:
result (dict): The API result containing publication information.
"""
return result.get("pmcid") or result.get("pmid") or result.get("id")