Using as a Library

Beyond the command-line interface, you can import and use the core components of europmc-dev-tool directly in your Python scripts. This allows for greater flexibility and integration into your own custom workflows.

Key Components

  • API Clients: Located in europmc_dev_tool.api, these classes (ArticlesClient, AnnotationsClient, etc.) provide direct access to the Europe PMC APIs.

  • JATS Processor: The XMLProcessor class in europmc_dev_tool.jats_processor handles the conversion of JATS XML to structured JSON.

  • Accession Number Extractor: The extract_with_spacy function in europmc_dev_tool.spacy_extractor finds accession numbers in text.

Example Script

The following script, script_usage_example.py, demonstrates how to use these components to fetch an article from Europe PMC, process its XML, and extract accession numbers.

script_usage_example.py
"""
Example script demonstrating how to use the europmc-dev-tool package
as a library in your own Python scripts.
"""

import json
import os
import requests
import spacy
from europmc_dev_tool.api.articles import ArticlesClient
from europmc_dev_tool.jats_processor import XMLProcessor
from europmc_dev_tool.section_maps import ordered_labels
from europmc_dev_tool.spacy_extractor import extract_with_spacy


def get_xml_content(identifier, timeout=30):
    """
    Fetch JATS XML content from a URL, a local file path, or a PMCID.

    The identifier is classified in this order: an ``http://`` / ``https://``
    URL, then an existing local path, then a string starting with "PMC"
    (case-insensitive). Any failure is printed and reported as ``None``
    instead of being raised.

    :param identifier: A PMCID (e.g., "PMC12345"), a URL, or a local file path.
    :type identifier: str
    :param timeout: Seconds to wait for the server when the identifier is a URL.
    :type timeout: float
    :return: The XML content as a string, or None if fetching fails.
    :rtype: str or None
    """
    try:
        if identifier.startswith(('http://', 'https://')):
            print(f"Identifier recognized as URL: {identifier}")
            # Without a timeout a stalled server would block the script forever.
            response = requests.get(identifier, timeout=timeout)
            response.raise_for_status()  # Raises HTTPError for bad responses
            xml_content = response.text
        elif os.path.exists(identifier):
            print(f"Identifier recognized as local file: {identifier}")
            # XML defaults to UTF-8; do not depend on the platform's locale encoding.
            with open(identifier, 'r', encoding='utf-8') as f:
                xml_content = f.read()
        elif identifier.upper().startswith('PMC'):
            print(f"Identifier recognized as PMCID: {identifier}")
            articles_client = ArticlesClient()
            xml_content = articles_client.get_fulltext_xml(identifier)
            if not xml_content:
                raise ValueError(f"Could not retrieve XML for PMCID {identifier}.")
        else:
            raise FileNotFoundError(f"Identifier '{identifier}' is not a valid file path, URL, or PMCID.")
    except (OSError, ValueError) as e:
        # OSError covers FileNotFoundError plus directories and unreadable
        # paths that pass os.path.exists(); requests' RequestException derives
        # from IOError (an alias of OSError), so network errors land here too.
        # UnicodeDecodeError is a ValueError.
        print(f"Error: {e}")
        return None

    print("Successfully fetched XML content.")
    return xml_content


def run_processing_pipeline(xml_content, offline=False):
    """
    Convert JATS XML to JSON, then run accession extraction over each sentence.

    Writes the processed JSON to 'example_output.json' and the collected
    extractions to 'example_accessions.json' in the current working directory.
    Returns early (after printing a message) if processing yields no JSON or
    if the 'en_core_sci_sm' spaCy model cannot be loaded.

    :param xml_content: JATS XML document as a string.
    :param offline: Forwarded unchanged to ``extract_with_spacy``.
    :return: None
    """
    print("\n--- Running Local Processing Examples ---")

    # Step 1: JATS XML -> structured JSON.
    print("\n1. Processing JATS XML to JSON...")
    jats = XMLProcessor()
    final_json = jats.process_json(jats.process_full_text(xml_content), ordered_labels)

    if not final_json:
        print("Processing failed to produce JSON. Aborting.")
        return

    # Keep a copy of the intermediate JSON on disk (optional).
    with open("example_output.json", "w") as json_out:
        json.dump(final_json, json_out, indent=2)
    print("Processed JSON has been saved to 'example_output.json'")

    # Step 2: run the extractor over every sentence of every section.
    print("\n2. Extracting accession numbers and resources...")

    # The extractor needs a loaded spaCy pipeline.
    try:
        nlp = spacy.load("en_core_sci_sm")
    except OSError:
        print("Error: 'en_core_sci_sm' model not found.")
        print("Please run: python -m spacy download en_core_sci_sm")
        return

    all_extractions = []
    sections = final_json.get('sections', {})
    for section_name, section_sentences in sections.items():
        for entry in section_sentences:
            hits = extract_with_spacy(
                nlp,
                entry.get('text', ''),
                section_name,
                entry.get('sentence_id'),
                offline=offline,
            )
            if not hits:
                continue
            all_extractions.extend(hits)

    print(f"Found {len(all_extractions)} accession numbers and resources.")
    if all_extractions:
        first_hit = all_extractions[0]
        print("Example of extracted data:")
        print(json.dumps(first_hit, indent=2))

    # Persist every extraction (optional).
    with open("example_accessions.json", "w") as accessions_out:
        json.dump(all_extractions, accessions_out, indent=2)
    print("Extracted accession numbers and resources saved to 'example_accessions.json'")
    print("-" * 35)


if __name__ == "__main__":
    # Each entry pairs the banner to print with the identifier to fetch.
    examples = [
        # --- Example with a PMCID ---
        ("--- Running example with PMCID: PMC11704132 ---", "PMC11704132"),
        # --- Example with a local file ---
        # Note: Assumes 'test_data/PXD053361.xml' exists.
        (
            "\n--- Running example with local file: test_data/PXD053361.xml ---",
            "test_data/PXD053361.xml",
        ),
        # --- Example with a URL (disabled; uncomment to run) ---
        # (
        #     "\n--- Running example with URL ---",
        #     "https://www.ebi.ac.uk/europepmc/webservices/rest/PMC11704132/fullTextXML",
        # ),
    ]

    for banner, source in examples:
        print(banner)
        fetched = get_xml_content(source)
        # Only run the pipeline when fetching produced non-empty content.
        if fetched:
            run_processing_pipeline(fetched)

How to Run the Example

  1. Ensure you have installed the package and its dependencies (see Installation), including the en_core_sci_sm spaCy model that the script loads for the extraction step.

  2. Save the code above as script_usage_example.py in the root of the project directory.

  3. Run the script from your terminal:

    python script_usage_example.py
    

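The script will create two files, example_output.json and example_accessions.json, in the directory you run it from. Each example in the script writes to the same two file names, so later runs overwrite the files written by earlier ones.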