Untitled
unknown
python
10 months ago
4.8 kB
9
Indexable
def extract_performance_data_2024(self, table):
    """Extract headers and data rows from a 2024-format performance table.

    The first ``<tr>`` is skipped (it is treated as a title row), the
    ``<th>`` cells of the second ``<tr>`` supply the column names, and
    every later ``<tr>`` is read as a data row from its ``<td>`` cells.

    Args:
        table: Parsed table element exposing ``find_all`` (presumably a
            BeautifulSoup ``Tag`` -- confirm against the caller), or a
            falsy value when no table was found.

    Returns:
        A ``(headers, data_list)`` pair. ``headers`` starts with
        ``"Portfolio Manager"`` followed by the stripped header texts.
        ``data_list`` holds one list of stripped cell texts per data row
        that has at least one non-blank cell. The rows do not contain the
        portfolio-manager value itself; ``scrape_single_pms`` prepends it.
        ``([], [])`` is returned when ``table`` is falsy or has fewer
        than three rows.
    """
    if not table:
        return [], []

    rows = table.find_all("tr")
    # A usable table needs a title row, a header row and one data row.
    if len(rows) < 3:
        return [], []

    # "Portfolio Manager" is not a column of the page's table; it is
    # reserved here so the headers line up with the rows once the caller
    # prepends the manager value.
    headers = ["Portfolio Manager"]
    headers.extend(th.text.strip() for th in rows[1].find_all("th"))

    data_list = []
    for row in rows[2:]:
        row_data = [cell.text.strip() for cell in row.find_all("td")]
        # Rows without <td> cells give an empty list, and rows whose
        # cells are all blank give only empty strings; any() rejects both.
        if any(row_data):
            data_list.append(row_data)
    return headers, data_list
def extract_performance_data_2023(self, table):
    """Extract headers and data rows from a 2023-or-earlier performance table.

    The column names are fixed rather than read from the page. The first
    ``<tr>`` is skipped as the header row and every later ``<tr>`` is
    read as a data row from its ``<td>`` cells.

    Args:
        table: Parsed table element exposing ``find_all`` (presumably a
            BeautifulSoup ``Tag`` -- confirm against the caller), or a
            falsy value when no table was found.

    Returns:
        A ``(headers, data_list)`` pair. ``headers`` is the fixed list of
        seven column names, beginning with ``"Portfolio Manager"``.
        ``data_list`` holds one list of stripped cell texts per kept row,
        padded on the right with ``""`` up to six entries (the
        portfolio-manager value is prepended by ``scrape_single_pms``).
        Rows whose first cell contains ``"Benchmark:"`` and rows with no
        non-blank cell are dropped. Rows longer than six cells are kept
        as they are, not truncated. ``([], [])`` is returned when
        ``table`` is falsy or has fewer than two rows.
    """
    if not table:
        return [], []

    rows = table.find_all("tr")
    # A usable table needs a header row and at least one data row.
    if len(rows) < 2:
        return [], []

    # Standard columns for the 2023 format.
    headers = [
        "Portfolio Manager",
        "Investment Approach",
        "AUM (INR Cr.)",
        "1 Month Returns",
        "1 Year Returns",
        "1 Month Portfolio Turnover",
        "1 Year Portfolio Turnover",
    ]
    # One less than the header count: "Portfolio Manager" is filled in
    # later by the caller, not taken from the table.
    expected_cells = len(headers) - 1

    data_list = []
    for row in rows[1:]:
        cells = row.find_all("td")
        if not cells:
            continue
        row_data = [cell.text.strip() for cell in cells]
        if "Benchmark:" in row_data[0]:
            continue  # benchmark rows are not portfolio data
        # Pad short rows; a non-positive count extends by nothing.
        row_data.extend([""] * (expected_cells - len(row_data)))
        if any(row_data):
            data_list.append(row_data)
    return headers, data_list
def find_performance_table(self, tables, year_value):
    """Return the first table whose headers identify it as performance data.

    Header texts are taken from every ``<th>`` in a table, stripped and
    lowercased, then matched according to the reporting year:

    * 2024 and later: some single header contains ``"twrr returns"``.
    * 2023 and earlier: the headers joined with single spaces contain
      ``"returns(%)"`` or ``"investment approach"``.

    Args:
        tables: Iterable of parsed table elements exposing ``find_all``.
        year_value: Reporting year; anything ``int()`` accepts.

    Returns:
        The first matching table, or ``None`` when no table matches.

    Raises:
        ValueError: If ``year_value`` cannot be converted by ``int()``.
    """
    is_2024_format = int(year_value) >= 2024
    legacy_keywords = ("returns(%)", "investment approach")

    for table in tables:
        header_texts = [th.text.strip().lower() for th in table.find_all("th")]
        if is_2024_format:
            # Matched per header, so the phrase must sit inside one cell.
            matched = any("twrr returns" in text for text in header_texts)
        else:
            # Matched against the joined text, so a keyword may span two
            # adjacent header cells.
            joined = " ".join(header_texts)
            matched = any(keyword in joined for keyword in legacy_keywords)
        if matched:
            return table
    return None
def scrape_single_pms(self, pmr_value, month_value, year_value):
    """Scrape and save performance data for a single portfolio manager.

    Nothing is scraped when ``self.is_data_exists(pmr_value)`` is true.
    Otherwise the current page source of ``self.driver`` is parsed, the
    performance table for ``year_value`` is located and extracted with
    the year-specific extractor, ``pmr_value`` is prepended to every row,
    and the resulting DataFrame is passed to ``self.append_to_excel``
    under the sheet name ``'Performance Data'``.

    Args:
        pmr_value: Portfolio manager identifier; used as the first column
            of every saved row and in log messages.
        month_value: Reporting month. Not used by the code shown here;
            presumably consumed by the elided navigation step -- confirm.
        year_value: Reporting year; selects the 2024 or the 2023 table
            format.

    Returns:
        ``True`` when the manager was skipped or processed without an
        exception (even if no table or no rows were found), ``False``
        when any exception was raised while processing.
    """
    if self.is_data_exists(pmr_value):
        print(f"Data already exists for {pmr_value}, skipping...")
        return True

    # Any failure for one manager is reported and turned into a False
    # result instead of propagating to the caller.
    try:
        # [Previous navigation code remains the same until after the soup parsing]

        # Parse page content
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        tables = soup.find_all("table")

        # Extract performance data
        performance_table = self.find_performance_table(tables, year_value)
        if performance_table:
            if int(year_value) >= 2024:
                headers, data = self.extract_performance_data_2024(performance_table)
            else:
                headers, data = self.extract_performance_data_2023(performance_table)

            # The extractors reserve the first header for the manager but
            # leave it out of the rows, so it is prepended here.
            performance_rows = [[pmr_value] + row for row in data]

            if performance_rows:
                performance_df = pd.DataFrame(performance_rows, columns=headers)
                print(f"Saving performance data for {pmr_value}:")
                print(performance_df)  # Debug print
                self.append_to_excel(performance_df, 'Performance Data')
            else:
                print(f"No performance rows generated for {pmr_value}")
        else:
            print(f"No performance table found for {pmr_value}")

        # [Rest of the code for other data extraction remains the same]
        return True
    except Exception as e:
        print(f"Error processing {pmr_value}: {str(e)}")
        return False
Leave a Comment