Untitled
unknown
plain_text
7 months ago
2.1 kB
9
Indexable
from atlassian import Confluence
from bs4 import BeautifulSoup
def fetch_all_descendants_and_links(confluence, page_id, current_path=None, fetched_ids=None, all_chunks=None):
    """
    Recursively fetch content from a Confluence page and all its descendants, no depth limit.

    Organizes content with metadata (the page's hierarchical path) for easier
    downstream processing. Errors on individual pages are logged and skipped so
    one bad page does not abort the whole crawl.

    Args:
        confluence: Confluence API client instance
        page_id: ID of the current page
        current_path: List of page titles from root to current page
            (defaults to a fresh empty list)
        fetched_ids: Set of already fetched page IDs to avoid loops
            (defaults to a fresh empty set)
        all_chunks: List to accumulate content chunks with metadata
            (defaults to a fresh empty list)

    Returns:
        List of dictionaries with 'text' and 'metadata' keys; 'metadata'
        holds the ' > '-joined page path from the root of the crawl.
    """
    # Fix: the original used mutable default arguments ([], set(), []),
    # which are shared across top-level calls — a second crawl would skip
    # every page seen by the first and append into the first crawl's list.
    if current_path is None:
        current_path = []
    if fetched_ids is None:
        fetched_ids = set()
    if all_chunks is None:
        all_chunks = []
    # Skip if page has already been fetched (prevents infinite loops)
    if page_id in fetched_ids:
        return all_chunks
    fetched_ids.add(page_id)
    try:
        # Fetch the page content and title
        page = confluence.get_page_by_id(page_id, expand='body.storage')
        title = page['title']
        new_path = current_path + [title]
        # Extract content and convert HTML to plain text.
        # Fix: use '\n' as the separator so block-level elements land on
        # separate lines. With ' ' (the original), the text contained no
        # newlines and the split('\n') below always produced one giant
        # chunk per page instead of per-paragraph chunks.
        content = page['body']['storage']['value']
        soup = BeautifulSoup(content, 'html.parser')
        text = soup.get_text(separator='\n', strip=True)
        # Split text into manageable chunks (one per block-level element)
        chunks = [chunk.strip() for chunk in text.split('\n') if chunk.strip()]
        # Store each chunk with its hierarchical path as metadata
        for chunk in chunks:
            all_chunks.append({
                'text': chunk,
                'metadata': {'path': ' > '.join(new_path)}
            })
        # Fetch child pages and recurse, sharing the dedup set and output list
        children = confluence.get_page_child_by_type(page_id, type='page')
        for child in children:
            fetch_all_descendants_and_links(confluence, child['id'], new_path, fetched_ids, all_chunks)
    except Exception as e:
        # Best-effort crawl: report the failing page and keep going
        print(f"Error fetching page {page_id}: {e}")
    return all_chunks
Leave a Comment