Untitled
unknown
plain_text
7 months ago
2.1 kB
9
Indexable
from atlassian import Confluence
from bs4 import BeautifulSoup
def fetch_all_descendants_and_links(confluence, page_id, current_path=None, fetched_ids=None, all_chunks=None):
    """
    Recursively fetch content from a Confluence page and all its descendants, no depth limit.

    Organizes content with metadata (the page's hierarchical path) for easier
    downstream processing. Errors on individual pages are logged and skipped so
    one bad page does not abort the whole crawl.

    Args:
        confluence: Confluence API client instance
        page_id: ID of the current page
        current_path: List of page titles from root to current page
            (defaults to a fresh empty list)
        fetched_ids: Set of already fetched page IDs to avoid loops
            (defaults to a fresh empty set)
        all_chunks: List to accumulate content chunks with metadata
            (defaults to a fresh empty list)

    Returns:
        List of dictionaries with 'text' and 'metadata' keys; 'metadata'
        holds the ' > '-joined page path from the root of the crawl.
    """
    # Fix: the original used mutable default arguments ([], set(), []),
    # which are shared across top-level calls — a second crawl would skip
    # every page seen by the first and append into the first crawl's list.
    if current_path is None:
        current_path = []
    if fetched_ids is None:
        fetched_ids = set()
    if all_chunks is None:
        all_chunks = []
    # Skip if page has already been fetched (prevents infinite loops)
    if page_id in fetched_ids:
        return all_chunks
    fetched_ids.add(page_id)
    try:
        # Fetch the page content and title
        page = confluence.get_page_by_id(page_id, expand='body.storage')
        title = page['title']
        new_path = current_path + [title]
        # Extract content and convert HTML to plain text.
        # Fix: use '\n' as the separator so block-level elements land on
        # separate lines. With ' ' (the original), the text contained no
        # newlines and the split('\n') below always produced one giant
        # chunk per page instead of per-paragraph chunks.
        content = page['body']['storage']['value']
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)
        # Split text into manageable chunks (one per block-level element)
        chunks = [chunk.strip() for chunk in text.split('\n') if chunk.strip()]
        # Store each chunk with its hierarchical path as metadata
        for chunk in chunks:
            all_chunks.append({
                'text': chunk,
                'metadata': {'path': ' > '.join(new_path)}
            })
        # Fetch child pages and recurse, sharing the dedup set and output list
        children = confluence.get_page_child_by_type(page_id, type='page')
        for child in children:
            fetch_all_descendants_and_links(confluence, child['id'], new_path, fetched_ids, all_chunks)
    except Exception as e:
        # Best-effort crawl: report the failing page and keep going
        print(f"Error fetching page {page_id}: {e}")
    return all_chunks
Leave a Comment