Untitled
unknown
python
2 months ago
4.8 kB
2
Indexable
def extract_performance_data_2024(self, table):
    """Extract headers and data rows from a 2024-format performance table.

    The first ``<tr>`` is treated as a title row and the second as the
    header row; every later row containing ``<td>`` cells is a data row.

    Args:
        table: Parsed table element exposing ``find_all`` (for example a
            BeautifulSoup tag), or a falsy value when no table was found.

    Returns:
        A ``(headers, data_list)`` tuple. ``headers`` starts with
        "Portfolio Manager" followed by the stripped ``<th>`` texts of the
        header row. ``data_list`` holds one list of stripped cell texts per
        non-empty data row, without the portfolio-manager value (the caller
        prepends it). Both are empty lists when the table is missing or has
        fewer than three rows.
    """
    if not table:
        return [], []

    rows = table.find_all("tr")
    if len(rows) < 3:  # Need title row, header row and at least one data row
        return [], []

    # Headers come from the second row (the first row might be a title).
    headers = ["Portfolio Manager"]
    headers.extend(th.text.strip() for th in rows[1].find_all("th"))
    # Data rows do not carry the "Portfolio Manager" column yet.
    expected_cells = len(headers) - 1

    data_list = []
    for row in rows[2:]:  # Skip title and header rows
        row_data = [cell.text.strip() for cell in row.find_all("td")]
        if not any(row_data):  # No cells at all, or only blank cells
            continue
        # Pad short rows (same as the 2023 extractor) so that building a
        # DataFrame with ``columns=headers`` does not fail on a column-count
        # mismatch. A negative difference yields no padding, so rows wider
        # than the header are left untouched.
        row_data.extend([""] * (expected_cells - len(row_data)))
        data_list.append(row_data)

    return headers, data_list


def extract_performance_data_2023(self, table):
    """Extract headers and data rows from a 2023-or-earlier performance table.

    The first ``<tr>`` is treated as the header row and skipped; the column
    names are a fixed list rather than being read from the page.

    Args:
        table: Parsed table element exposing ``find_all`` (for example a
            BeautifulSoup tag), or a falsy value when no table was found.

    Returns:
        A ``(headers, data_list)`` tuple. ``headers`` is the fixed column
        list starting with "Portfolio Manager". ``data_list`` holds one list
        of stripped cell texts per non-empty data row, padded with empty
        strings up to ``len(headers) - 1`` cells, without the
        portfolio-manager value (the caller prepends it). Rows whose first
        cell contains "Benchmark:" are skipped. Both are empty lists when the
        table is missing or has fewer than two rows.
    """
    if not table:
        return [], []

    rows = table.find_all("tr")
    if len(rows) < 2:  # Need at least header and one data row
        return [], []

    # Standard columns for the 2023 format.
    headers = [
        "Portfolio Manager",
        "Investment Approach",
        "AUM (INR Cr.)",
        "1 Month Returns",
        "1 Year Returns",
        "1 Month Portfolio Turnover",
        "1 Year Portfolio Turnover",
    ]
    # -1 because the Portfolio Manager value is added later by the caller.
    expected_cells = len(headers) - 1

    data_list = []
    for row in rows[1:]:  # Skip header row
        row_data = [cell.text.strip() for cell in row.find_all("td")]
        if not row_data or "Benchmark:" in row_data[0]:  # Skip benchmark rows
            continue
        if not any(row_data):  # Only include non-empty rows
            continue
        # Ensure the row has at least the expected number of columns.
        row_data.extend([""] * (expected_cells - len(row_data)))
        data_list.append(row_data)

    return headers, data_list


def find_performance_table(self, tables, year_value):
    """Find the performance data table for the given year's page format.

    Args:
        tables: Iterable of parsed table elements exposing ``find_all``.
        year_value: Year as an ``int`` or a string accepted by ``int()``.

    Returns:
        The first matching table, or ``None`` when no table matches.
        For 2024 and later a table matches when any single ``<th>`` text
        contains "twrr returns". For earlier years a table matches when the
        space-joined ``<th>`` texts contain "returns(%)" or
        "investment approach". All comparisons are case-insensitive.

    Raises:
        ValueError: If ``year_value`` cannot be converted to an integer.
    """
    year = int(year_value)

    for table in tables:
        headers = [th.text.strip().lower() for th in table.find_all("th")]
        if year >= 2024:
            # 2024 format: the phrase must appear inside one header cell.
            matched = any("twrr returns" in header for header in headers)
        else:
            # 2023 and earlier: match against all header cells joined, so a
            # phrase may span adjacent cells.
            joined = " ".join(headers)
            matched = any(
                keyword in joined
                for keyword in ("returns(%)", "investment approach")
            )
        if matched:
            return table

    return None


def scrape_single_pms(self, pmr_value, month_value, year_value):
    """Scrape and save performance data for a single portfolio manager.

    Args:
        pmr_value: Portfolio manager identifier; it is passed to
            ``self.is_data_exists`` and written as the first column of
            every saved row.
        month_value: Not used by the code visible here.
            NOTE(review): presumably consumed by the elided navigation
            code -- confirm.
        year_value: Year as an ``int`` or a string accepted by ``int()``;
            selects the table format (2024+ versus 2023 and earlier).

    Returns:
        ``True`` when data already exists or processing finished without an
        exception (including the cases where no table or no rows were
        found); ``False`` when any exception was raised while processing.
    """
    if self.is_data_exists(pmr_value):
        print(f"Data already exists for {pmr_value}, skipping...")
        return True

    try:
        # [Previous navigation code remains the same until after the soup parsing]

        # Parse page content
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        tables = soup.find_all("table")

        # Extract performance data
        performance_table = self.find_performance_table(tables, year_value)
        if performance_table:
            if int(year_value) >= 2024:
                extract = self.extract_performance_data_2024
            else:
                extract = self.extract_performance_data_2023
            headers, data = extract(performance_table)

            if data:
                # Add portfolio manager as first column of each row.
                performance_rows = [[pmr_value] + row for row in data]

                # Create DataFrame and save. A row wider than ``headers``
                # makes pandas raise here; that is reported by the handler
                # below.
                performance_df = pd.DataFrame(performance_rows, columns=headers)
                print(f"Saving performance data for {pmr_value}:")
                print(performance_df)  # Debug print
                self.append_to_excel(performance_df, 'Performance Data')
            else:
                # Reported when the table was found but yielded no rows.
                print(f"No performance rows generated for {pmr_value}")
        else:
            print(f"No performance table found for {pmr_value}")

        # [Rest of the code for other data extraction remains the same]

        return True

    except Exception as e:
        # Per-manager boundary: report the failure and signal it to the
        # caller through the return value.
        print(f"Error processing {pmr_value}: {e}")
        return False
Editor is loading...
Leave a Comment