# Source code for scitex_scholar.core.Papers

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Timestamp: "2025-09-30 22:24:29 (ywatanabe)"
# File: /home/ywatanabe/proj/SciTeX-Code/src/scitex/scholar/core/Papers.py
# ----------------------------------------
from __future__ import annotations

import os

# Path of this module file, as provided by the interpreter.
__FILE__ = __file__
# Directory containing this module file.
__DIR__ = os.path.dirname(__FILE__)
# ----------------------------------------

"""Papers class for SciTeX Scholar module.

Papers is a simple collection of Paper objects.
All business logic is handled by Scholar or utility functions.

This is a simplified version - reduced from 39 methods to ~15 methods.
Business logic has been moved to Scholar and utility functions.
"""

from pathlib import Path
from typing import Any, Callable, Dict, Iterator, List, Optional, Union

import scitex_logging as logging

from scitex_scholar.config import ScholarConfig
from scitex_scholar.core.Paper import Paper

logger = logging.getLogger(__name__)



[docs]
class Papers:
    """A simple collection of Paper objects.

    This is a minimal collection class. Most business logic
    (loading, saving, enrichment, etc.) is handled by Scholar.

    Methods have been reduced from 39 to ~15 for simplicity.
    Complex operations should use Scholar or utility functions.
    """


[docs]
    def __init__(
        self,
        papers: Optional[Union[List[Paper], List[Dict]]] = None,
        project: Optional[str] = None,
        config: Optional[ScholarConfig] = None,
    ):
        """Initialize Papers collection.

        Args:
            papers: List of Paper objects or dicts to convert to Papers
            project: Project name for organizing papers
            config: Scholar configuration
        """
        self.project = project or "default"
        self.config = config or ScholarConfig()

        # Initialize papers list
        self._papers: List[Paper] = []

        if papers:
            for item in papers:
                if isinstance(item, Paper):
                    self._papers.append(item)
                elif isinstance(item, dict):
                    # Handle dict input - Pydantic handles validation
                    paper = Paper.from_dict(item)
                    self._papers.append(paper)
                else:
                    logger.warning(f"Skipping invalid item type: {type(item)}")


    # =========================================================================
    # BASIC COLLECTION METHODS
    # =========================================================================


[docs]
    def __len__(self) -> int:
        """Number of papers in collection."""
        return len(self._papers)



[docs]
    def __iter__(self) -> Iterator[Paper]:
        """Iterate over papers."""
        return iter(self._papers)



[docs]
    def __getitem__(self, index: Union[int, slice]) -> Union[Paper, "Papers"]:
        """Get paper(s) by index or slice.

        Args:
            index: Integer index or slice

        Returns:
            Single Paper if integer index, Papers collection if slice
        """
        if isinstance(index, slice):
            return Papers(self._papers[index], project=self.project, config=self.config)
        return self._papers[index]



[docs]
    def __repr__(self) -> str:
        """String representation."""
        return f"Papers(count={len(self)}, project={self.project})"



[docs]
    def __str__(self) -> str:
        """Human-readable string."""
        if len(self) == 0:
            return "Empty Papers collection"
        elif len(self) == 1:
            return "Papers collection with 1 paper"
        else:
            return f"Papers collection with {len(self)} papers"



[docs]
    def __dir__(self) -> List[str]:
        """Custom dir for better discoverability."""
        base_attrs = list(object.__dir__(self))
        custom_attrs = [
            "papers",
            "filter",
            "sort_by",
            "append",
            "extend",
            "to_list",
            "summary",
            "to_dict",
            "to_dataframe",
            "from_bibtex",
            "save",
        ]
        return sorted(set(base_attrs + custom_attrs))


    # =========================================================================
    # SIMPLE COLLECTION OPERATIONS
    # =========================================================================

    @property
    def papers(self) -> List[Paper]:
        """Get the underlying papers list."""
        return self._papers


[docs]
    def append(self, paper: Paper) -> None:
        """Append a single Paper to the end of the collection.

        Anything that is not a Paper instance is rejected with a warning
        and the collection is left unchanged.

        Args:
            paper: Paper to add
        """
        if not isinstance(paper, Paper):
            logger.warning(f"Cannot append non-Paper object: {type(paper)}")
            return
        self._papers.append(paper)



[docs]
    def extend(self, papers: Union[List[Paper], "Papers"]) -> None:
        """Add multiple papers to the collection.

        Args:
            papers: List of papers or another Papers collection
        """
        if isinstance(papers, Papers):
            self._papers.extend(papers._papers)
        elif isinstance(papers, list):
            for paper in papers:
                if isinstance(paper, Paper):
                    self._papers.append(paper)
        else:
            logger.warning(f"Cannot extend with type: {type(papers)}")



[docs]
    def to_list(self) -> List[Paper]:
        """Get papers as a list.

        Returns:
            List of Paper objects
        """
        return list(self._papers)



[docs]
    def filter(
        self,
        condition: Optional[Callable[[Paper], bool]] = None,
        year_min: Optional[int] = None,
        year_max: Optional[int] = None,
        has_doi: Optional[bool] = None,
        has_abstract: Optional[bool] = None,
        has_pdf: Optional[bool] = None,
        min_citations: Optional[int] = None,
        max_citations: Optional[int] = None,
        min_impact_factor: Optional[float] = None,
        max_impact_factor: Optional[float] = None,
        journal: Optional[str] = None,
        author: Optional[str] = None,
        keyword: Optional[str] = None,
        publisher: Optional[str] = None,
        **kwargs,
    ) -> "Papers":
        """Filter papers by condition or criteria.

        When ``condition`` is a callable it is the only filter applied and
        the keyword criteria are ignored. Otherwise the keyword criteria are
        forwarded to ``papers_utils.filter_papers_advanced``.

        Parameters
        ----------
        condition
            Function that takes a Paper and returns bool.
        year_min
            Minimum year.
        year_max
            Maximum year.
        has_doi
            Filter papers with/without DOI.
        has_abstract
            Filter papers with/without abstract.
        has_pdf
            Filter papers with/without PDF URL.
        min_citations
            Minimum citation count.
        max_citations
            Maximum citation count. ``0`` is forwarded as a real bound.
        min_impact_factor
            Minimum journal impact factor.
        max_impact_factor
            Maximum journal impact factor. ``0.0`` is forwarded as a real
            bound.
        journal
            Journal name (partial match).
        author
            Author name (partial match).
        keyword
            Keyword (searches in keywords, title, abstract).
        publisher
            Publisher name (partial match).
        **kwargs
            Accepted for backward compatibility and otherwise ignored.

        Returns
        -------
        Papers
            New Papers collection with filtered papers, carrying this
            collection's project and config.

        Examples
        --------
        Filter using a lambda condition::

            high_impact = papers.filter(lambda p: p.journal_impact_factor and p.journal_impact_factor > 10)
            highly_cited = papers.filter(lambda p: p.citation_count and p.citation_count > 500)
            recent = papers.filter(lambda p: p.year and p.year >= 2020)

        Filter using built-in parameters::

            high_impact_v2 = papers.filter(min_impact_factor=10.0)
            highly_cited_v2 = papers.filter(min_citations=500)
            recent_v2 = papers.filter(year_min=2020)

        Combine multiple parameters::

            filtered = papers.filter(
                min_impact_factor=5.0,
                min_citations=100,
                year_min=2015,
                year_max=2023,
                journal="Nature",
                has_doi=True,
            )

        Chain filters for AND logic::

            elite_recent = papers.filter(min_impact_factor=10).filter(year_min=2020)
        """
        # If a lambda/function condition is provided, use it
        if condition is not None and callable(condition):
            filtered = [p for p in self._papers if condition(p)]
            logger.info(f"Lambda filter: {len(self._papers)} -> {len(filtered)} papers")
            return Papers(filtered, project=self.project, config=self.config)

        # Otherwise use criteria-based filtering
        from scitex_scholar._utils.papers_utils import filter_papers_advanced

        # Bounds are forwarded unchanged. The previous
        # ``value or kwargs.get(name)`` form converted an explicit 0 / 0.0
        # into None (dropping e.g. ``max_citations=0``), and its kwargs
        # fallback could never fire because named parameters are never
        # collected into **kwargs.
        result = filter_papers_advanced(
            self,
            year_min=year_min,
            year_max=year_max,
            has_doi=has_doi,
            has_abstract=has_abstract,
            has_pdf=has_pdf,
            min_citations=min_citations,
            max_citations=max_citations,
            min_impact_factor=min_impact_factor,
            max_impact_factor=max_impact_factor,
            journal=journal,
            author=author,
            keyword=keyword,
            publisher=publisher,
        )

        # Preserve project and config
        result.project = self.project
        result.config = self.config

        logger.info(f"Filtered: {len(self._papers)} -> {len(result)} papers")
        return result



[docs]
    def sort_by(self, *criteria, reverse: bool = False, **kwargs) -> "Papers":
        """Sort papers by criteria.

        Parameters
        ----------
        *criteria
            Field names (as strings) or lambda functions to sort by.
        reverse
            Sort in descending order (default: False).
        **kwargs
            Additional options.

        Returns
        -------
        Papers
            New sorted Papers collection.

        Notes
        -----
        Available Paper fields for sorting:

        - ``title`` -- Paper title
        - ``year`` -- Publication year
        - ``citation_count`` -- Number of citations
        - ``journal_impact_factor`` -- Journal impact factor
        - ``journal`` -- Journal name
        - ``publisher`` -- Publisher name
        - ``doi`` -- Digital Object Identifier
        - ``created_at`` -- When record was created
        - ``updated_at`` -- When record was last updated

        Examples
        --------
        Sort by a single field::

            by_year = papers.sort_by('year')
            by_citations_desc = papers.sort_by('citation_count', reverse=True)

        Sort by multiple fields (primary, secondary, etc.)::

            by_year_then_citations = papers.sort_by('year', 'citation_count')

        Sort using a lambda function::

            by_citations = papers.sort_by(lambda p: p.citation_count or 0, reverse=True)
            by_year_safe = papers.sort_by(lambda p: p.year if p.year else 9999)

        Sort by a computed value::

            by_citation_per_year = papers.sort_by(
                lambda p: (p.citation_count or 0) / (2024 - p.year) if p.year else 0,
                reverse=True,
            )
        """
        if not criteria:
            return Papers(self._papers, project=self.project, config=self.config)

        # Handle single lambda
        if len(criteria) == 1 and callable(criteria[0]):
            sorted_papers = sorted(self._papers, key=criteria[0], reverse=reverse)
            return Papers(sorted_papers, project=self.project, config=self.config)

        # Handle field names
        from scitex_scholar._utils.papers_utils import sort_papers_multi

        return sort_papers_multi(self, list(criteria), reverse=reverse)


    # =========================================================================
    # BACKWARD COMPATIBILITY METHODS
    # These delegate to utilities or Scholar for the actual implementation
    # =========================================================================


[docs]
    @classmethod
    def from_bibtex(cls, bibtex_input: Union[str, Path]) -> "Papers":
        """Load papers from BibTeX.

        DEPRECATED: Use Scholar.from_bibtex() instead.
        This method is kept for backward compatibility.

        The input is treated as a file path when it names an existing
        filesystem entry; otherwise it is parsed as BibTeX text.

        Args:
            bibtex_input: Path to BibTeX file or BibTeX string

        Returns:
            Papers collection
        """
        logger.warning(
            "Papers.from_bibtex() is deprecated. Use Scholar.from_bibtex() instead."
        )

        # Check if it's a file path
        if isinstance(bibtex_input, (str, Path)):
            path = Path(bibtex_input)
            try:
                path_exists = path.exists()
            except (OSError, ValueError):
                # Raw BibTeX text is usually not a valid path: probing it
                # can fail (e.g. "File name too long") instead of returning
                # False. Such input is BibTeX content, not a file.
                path_exists = False
            if path_exists:
                return cls._from_bibtex_file(path)

        # Otherwise treat as BibTeX text
        return cls._from_bibtex_text(str(bibtex_input))


    @classmethod
    def _from_bibtex_file(cls, file_path: Union[str, Path]) -> "Papers":
        """Parse a BibTeX file and wrap the entries in a Papers collection.

        Args:
            file_path: Path to BibTeX file

        Returns:
            Papers collection

        Raises:
            FileNotFoundError: If file_path does not exist
        """
        import bibtexparser

        bib_path = Path(file_path)
        if not bib_path.exists():
            raise FileNotFoundError(f"BibTeX file not found: {bib_path}")

        logger.info(f"Loading BibTeX from {bib_path}")

        with bib_path.open("r", encoding="utf-8") as handle:
            database = bibtexparser.load(handle)

        logger.info(f"Loaded {len(database.entries)} BibTeX entries from {bib_path}")

        # Convert every entry, keeping only truthy conversion results
        converted = (cls._bibtex_entry_to_paper(raw) for raw in database.entries)
        collected = [candidate for candidate in converted if candidate]

        logger.success(f"Created {len(collected)} papers from BibTeX file")
        return cls(collected)

    @classmethod
    def _from_bibtex_text(cls, bibtex_content: str) -> "Papers":
        """Parse BibTeX source text and wrap the entries in a Papers collection.

        Args:
            bibtex_content: BibTeX content as string

        Returns:
            Papers collection
        """
        import bibtexparser

        database = bibtexparser.loads(bibtex_content)
        logger.info(f"Parsed {len(database.entries)} BibTeX entries from text")

        # Convert every entry, keeping only truthy conversion results
        converted = (cls._bibtex_entry_to_paper(raw) for raw in database.entries)
        collected = [candidate for candidate in converted if candidate]

        logger.success(f"Created {len(collected)} papers from BibTeX text")
        return cls(collected)

    @staticmethod
    def _bibtex_entry_to_paper(entry: Dict[str, Any]) -> Paper:
        """Convert BibTeX entry to Paper object.

        Field names are matched case-insensitively. The lower-cased fields,
        the entry type and the citation key are stored on the returned
        paper as ``_original_bibtex_fields``, ``_bibtex_entry_type`` and
        ``_bibtex_key``.

        Args:
            entry: BibTeX entry dictionary

        Returns:
            Paper object
        """
        import re

        # Get fields from BibTeX entry
        fields = {k.lower(): v for k, v in entry.items()}

        # Parse authors: names are separated by the word "and", which may be
        # surrounded by any whitespace (author lists are often line-wrapped).
        author_str = fields.get("author") or ""
        authors = [
            name.strip()
            for name in re.split(r"\s+and\s+", author_str)
            if name.strip()
        ]

        # Parse year: tolerate surrounding whitespace/braces. isdecimal()
        # only accepts characters that int() can actually convert.
        year = None
        year_str = str(fields.get("year", "")).strip().strip("{}").strip()
        if year_str.isdecimal():
            year = int(year_str)

        # Parse keywords, dropping empty items (e.g. from a trailing comma)
        keywords_str = fields.get("keywords") or ""
        keywords = [kw.strip() for kw in keywords_str.split(",") if kw.strip()]

        # Create Paper with Pydantic structure
        paper = Paper()

        # Set basic metadata
        paper.metadata.basic.title = (fields.get("title") or "").strip("{}")
        paper.metadata.basic.authors = authors
        paper.metadata.basic.abstract = fields.get("abstract", "")
        paper.metadata.basic.year = year
        paper.metadata.basic.keywords = keywords

        # Set ID metadata
        doi = fields.get("doi")
        if doi:
            paper.metadata.set_doi(doi)
        paper.metadata.id.pmid = fields.get("pmid")
        paper.metadata.id.arxiv_id = fields.get("arxiv")

        # Set publication metadata
        paper.metadata.publication.journal = fields.get("journal")

        # Set URL metadata
        pdf_url = fields.get("url")
        if pdf_url:
            paper.metadata.url.pdfs.append({"url": pdf_url, "source": "bibtex"})

        # Store original BibTeX fields for later reconstruction.
        # bibtexparser v1 entries carry the type under "ENTRYTYPE" and the
        # citation key under "ID"; the legacy "entry_type"/"key" names are
        # still honoured first for callers that pass hand-built dicts.
        paper._original_bibtex_fields = fields.copy()
        paper._bibtex_entry_type = (
            entry.get("entry_type") or entry.get("ENTRYTYPE") or "misc"
        )
        paper._bibtex_key = entry.get("key") or entry.get("ID") or ""

        return paper


[docs]
    def save(
        self,
        output_path: Union[str, Path],
        format: Optional[str] = "auto",
        **kwargs,
    ) -> None:
        """Save papers to file.

        DEPRECATED: Use Scholar.save_papers() or Scholar.export_bibtex() instead.
        This method is kept for backward compatibility.

        Args:
            output_path: Path to save file. Missing parent directories are
                created.
            format: Output format (auto, bibtex, json, csv), matched
                case-insensitively. "auto" or None picks the format from
                the file extension and falls back to bibtex for unknown
                extensions.
            **kwargs: Additional options (currently unused)

        Raises:
            ValueError: If format is not one of the supported values
        """
        logger.warning(
            "Papers.save() is deprecated. Use Scholar.export_bibtex() instead."
        )

        output_path = Path(output_path)

        # Normalise once so None and upper-case spellings behave like the
        # documented lower-case values.
        fmt = (format or "auto").lower()

        # Auto-detect format from extension
        if fmt == "auto":
            by_extension = {
                ".bib": "bibtex",
                ".bibtex": "bibtex",
                ".json": "json",
                ".csv": "csv",
            }
            fmt = by_extension.get(output_path.suffix.lower(), "bibtex")

        # Validate before touching the filesystem so a bad format does not
        # leave freshly created directories behind.
        if fmt not in ("bibtex", "json", "csv"):
            raise ValueError(f"Unsupported format: {format}")

        output_path.parent.mkdir(parents=True, exist_ok=True)

        if fmt == "bibtex":
            from scitex_scholar._utils.papers_utils import papers_to_bibtex

            bibtex_content = papers_to_bibtex(self, output_path=None)
            # Explicit UTF-8, like the JSON branch, so non-ASCII metadata
            # does not depend on the platform's locale encoding.
            output_path.write_text(bibtex_content, encoding="utf-8")

        elif fmt == "json":
            import json

            from scitex_scholar._utils.papers_utils import papers_to_dict

            with open(output_path, "w", encoding="utf-8") as f:
                json.dump(papers_to_dict(self), f, indent=2, ensure_ascii=False)

        else:  # csv
            from scitex_scholar._utils.papers_utils import papers_to_dataframe

            df = papers_to_dataframe(self)
            df.to_csv(output_path, index=False)

        logger.success(f"Saved {len(self)} papers to {output_path}")



[docs]
    def to_dict(self) -> List[Dict[str, Any]]:
        """Convert the collection via papers_utils.papers_to_dict().

        DEPRECATED: Use papers_utils.papers_to_dict() for new code.

        Returns:
            The result of papers_to_dict(self), annotated as a list of
            dictionaries
        """
        from scitex_scholar._utils.papers_utils import papers_to_dict as _convert

        records = _convert(self)
        return records



[docs]
    def to_dataframe(self) -> Any:
        """Convert the collection via papers_utils.papers_to_dataframe().

        DEPRECATED: Use papers_utils.papers_to_dataframe() for new code.

        Returns:
            The result of papers_to_dataframe(self), or None when an
            ImportError is raised while importing or running the converter
        """
        try:
            from scitex_scholar._utils.papers_utils import (
                papers_to_dataframe as _build_frame,
            )

            frame = _build_frame(self)
        except ImportError:
            logger.error("pandas is required for to_dataframe()")
            frame = None
        return frame



[docs]
    def summary(self) -> Dict[str, Any]:
        """Compute statistics via papers_utils.papers_statistics().

        DEPRECATED: Use papers_utils.papers_statistics() for new code.

        Returns:
            The result of papers_statistics(self), annotated as a dictionary
        """
        from scitex_scholar._utils.papers_utils import papers_statistics as _stats

        statistics = _stats(self)
        return statistics



    # =========================================================================
    # METHODS REMOVED (use Scholar or utilities instead):
    # =========================================================================
    # The following methods have been removed to simplify the class:
    # - sync_with_library() -> Use Scholar internally
    # - create_project_symlinks() -> Use Scholar internally
    # - get_project_statistics() -> Use Scholar.get_library_statistics()
    # - download_pdfs() -> Use Scholar.download_pdfs()
    # - enrich() -> Use Scholar.enrich()
    # - merge_papers() -> Use papers_utils.merge_papers()
    # - deduplicate() -> Use papers_utils.deduplicate_papers()
    #
    # This reduces complexity from 39 methods to ~15 methods.
    # All business logic is now in Scholar or utility functions.


# For backward compatibility
__all__ = ["Papers"]


if __name__ == "__main__":

    def main():
        """Run a small console demo of the Papers collection."""
        banner = "=" * 60
        print(banner)
        print("Papers Class - Simplified Collection")
        print(banner)

        # Build three sample papers from (title, year, journal) rows
        sample_rows = (
            ("Paper 1", 2023, "Nature"),
            ("Paper 2", 2024, "Science"),
            ("Paper 3", 2022, "Cell"),
        )
        samples = []
        for title, year, journal in sample_rows:
            sample = Paper()
            sample.metadata.basic.title = title
            sample.metadata.basic.year = year
            sample.metadata.publication.journal = journal
            samples.append(sample)

        papers = Papers(samples)

        print(f"\n1. Collection: {papers}")
        print(f"   Count: {len(papers)}")
        print(f"   First: {papers[0].metadata.basic.title}")

        # Filtering with a callable condition
        def published_since_2023(p):
            return p.metadata.basic.year and p.metadata.basic.year >= 2023

        recent = papers.filter(published_since_2023)
        print(f"\n2. Filtered (year >= 2023): {len(recent)} papers")

        # Sorting with a callable key
        def year_key(p):
            return p.metadata.basic.year or 0

        sorted_papers = papers.sort_by(year_key)
        print("\n3. Sorted by year:")
        for p in sorted_papers:
            print(f"   {p.metadata.basic.year}: {p.metadata.basic.title}")

        closing_lines = (
            "\n✅ Papers class simplified!",
            "   - Reduced from 39 to ~15 methods",
            "   - Business logic moved to Scholar",
            "   - Clean collection interface",
        )
        for line in closing_lines:
            print(line)

    main()

# EOF