Untitled
unknown
plain_text
a month ago
2.1 kB
6
Indexable
from atlassian import Confluence
from bs4 import BeautifulSoup


def fetch_all_descendants_and_links(confluence, page_id, current_path=None,
                                    fetched_ids=None, all_chunks=None):
    """Recursively fetch a Confluence page and all its descendants (no depth limit).

    Each page's body is converted from storage-format HTML to plain text,
    split into line-level chunks, and stored with the page's hierarchical
    path ("Root > Child > Grandchild") as metadata.

    Args:
        confluence: Confluence API client instance.
        page_id: ID of the page to start from.
        current_path: List of page titles from the root to the parent of
            this page. Defaults to an empty path.
        fetched_ids: Set of already-fetched page IDs, used to break cycles.
            A fresh set is created per top-level call.
        all_chunks: Accumulator list for chunk dictionaries. A fresh list
            is created per top-level call.

    Returns:
        List of dicts: {'text': chunk, 'metadata': {'path': '...'}}.
    """
    # Create fresh containers per top-level call. Mutable default arguments
    # would be shared across calls, causing a second crawl to skip every
    # page the first crawl visited and to append into the same list.
    if current_path is None:
        current_path = []
    if fetched_ids is None:
        fetched_ids = set()
    if all_chunks is None:
        all_chunks = []

    # Skip if page has already been fetched (prevents infinite loops when
    # the page tree contains cycles or shared descendants).
    if page_id in fetched_ids:
        return all_chunks
    fetched_ids.add(page_id)

    try:
        # Fetch the page content (storage format) and title.
        page = confluence.get_page_by_id(page_id, expand='body.storage')
        title = page['title']
        new_path = current_path + [title]

        # Convert HTML to plain text. Use '\n' as the separator so that
        # block elements become separate lines — the chunking below splits
        # on newlines, which would never fire with a space separator.
        content = page['body']['storage']['value']
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)

        # Split text into manageable chunks (one per non-empty line).
        chunks = [chunk.strip() for chunk in text.split('\n') if chunk.strip()]

        # Store each chunk with its hierarchical path as metadata.
        for chunk in chunks:
            all_chunks.append({
                'text': chunk,
                'metadata': {'path': ' > '.join(new_path)}
            })

        # Recurse into child pages.
        children = confluence.get_page_child_by_type(page_id, type='page')
        for child in children:
            fetch_all_descendants_and_links(confluence, child['id'],
                                            new_path, fetched_ids, all_chunks)
    except Exception as e:
        # Best-effort crawl: log the failure and continue with what we have
        # rather than aborting the whole traversal on one broken page.
        print(f"Error fetching page {page_id}: {e}")

    return all_chunks
Editor is loading...
Leave a Comment