Untitled

 avatar
unknown
python
2 months ago
4.8 kB
2
Indexable
def extract_performance_data_2024(self, table):
    """Extract performance data from a table in the 2024 format.

    The first row is treated as a title row, the second row as the header
    row (its ``th`` cells supply the column names) and every following row
    as a data row (its ``td`` cells supply the values).

    Args:
        table: Parsed table element exposing ``find_all`` (for example a
            BeautifulSoup tag), or a falsy value when no table was found.

    Returns:
        A ``(headers, data_list)`` tuple. ``headers`` starts with
        "Portfolio Manager" followed by the stripped header-cell texts.
        ``data_list`` holds one list of stripped cell texts per non-empty
        data row; the portfolio manager value itself is NOT included, the
        caller is expected to prepend it. Rows with fewer cells than there
        are header cells are padded with empty strings. Both lists are
        empty when ``table`` is falsy or has fewer than three rows.
    """
    if not table:
        return [], []

    rows = table.find_all("tr")
    if len(rows) < 3:  # Need the title row, the header row and one data row
        return [], []

    # Headers come from the second row (the first row might be a title).
    headers = ["Portfolio Manager"]  # Filled in by the caller for each row
    headers.extend(header.text.strip() for header in rows[1].find_all("th"))
    # Number of values a data row needs, excluding the Portfolio Manager column.
    expected_width = len(headers) - 1

    data_list = []
    # Start from the third row (skip the title and header rows).
    for row in rows[2:]:
        row_data = [cell.text.strip() for cell in row.find_all("td")]
        if not any(row_data):  # No cells at all, or only blank cells
            continue
        # Pad short rows so they line up with the headers, matching what the
        # 2023 extractor does; a non-positive shortfall adds nothing.
        row_data.extend([""] * (expected_width - len(row_data)))
        data_list.append(row_data)

    return headers, data_list

def extract_performance_data_2023(self, table):
    """Extract performance data from a table in the 2023-and-earlier format.

    Column names are fixed for this format rather than read from the table.
    The first row is skipped as the header; each later row contributes the
    stripped text of its ``td`` cells unless its first cell contains
    "Benchmark:". Short rows are padded with empty strings up to the number
    of value columns (all headers except "Portfolio Manager", which is not
    part of the returned rows).

    Returns:
        A ``(headers, data_list)`` tuple; both are empty lists when
        ``table`` is falsy or has fewer than two rows.
    """
    if not table:
        return [], []

    all_rows = table.find_all("tr")
    if len(all_rows) < 2:  # A header row plus at least one data row
        return [], []

    # Standard column layout of the 2023 format
    headers = [
        "Portfolio Manager",
        "Investment Approach",
        "AUM (INR Cr.)",
        "1 Month Returns",
        "1 Year Returns",
        "1 Month Portfolio Turnover",
        "1 Year Portfolio Turnover"
    ]
    # Portfolio Manager is not part of the row values returned here
    value_columns = len(headers) - 1

    data_list = []
    for tr in all_rows[1:]:  # Everything after the header row
        tds = tr.find_all("td")
        if not tds:
            continue

        values = [td.text.strip() for td in tds]
        if "Benchmark:" in values[0]:  # Benchmark rows are not data
            continue

        # Fill missing trailing columns with blanks
        shortfall = value_columns - len(values)
        if shortfall > 0:
            values.extend([""] * shortfall)

        if any(values):  # Drop rows whose cells are all blank
            data_list.append(values)

    return headers, data_list

def find_performance_table(self, tables, year_value):
    """Return the first table that looks like the performance table.

    For years 2024 and later a table qualifies when any single ``th`` cell
    contains "twrr returns" (case-insensitive). For earlier years a table
    qualifies when the space-joined, lower-cased text of all its ``th``
    cells contains "returns(%)" or "investment approach".

    Args:
        tables: Iterable of parsed table elements exposing ``find_all``.
        year_value: Report year; anything accepted by ``int()``.

    Returns:
        The first matching table, or ``None`` when no table matches.
    """
    def header_texts(candidate):
        # Stripped, lower-cased text of every header cell in the table
        return [th.text.strip().lower() for th in candidate.find_all("th")]

    if int(year_value) >= 2024:
        # 2024 format - identified by a TWRR Returns header cell
        def matches(candidate):
            return any("twrr returns" in text for text in header_texts(candidate))
    else:
        # 2023 and earlier format - keywords searched in the joined header text
        def matches(candidate):
            combined = ' '.join(header_texts(candidate))
            return "returns(%)" in combined or "investment approach" in combined

    return next((candidate for candidate in tables if matches(candidate)), None)

def scrape_single_pms(self, pmr_value, month_value, year_value):
    """Scrape and save data for a single portfolio manager.

    Skips the manager when ``self.is_data_exists`` reports that data is
    already stored. Otherwise parses the current page source of
    ``self.driver``, locates the performance table for the given year,
    extracts its rows with the year-appropriate extractor, prepends the
    portfolio manager to every row and appends the resulting DataFrame to
    the 'Performance Data' sheet via ``self.append_to_excel``.

    Args:
        pmr_value: Portfolio manager identifier; used for the existence
            check, in log messages and as the first column of every row.
        month_value: Not used in the visible part of this method
            (presumably consumed by the elided navigation code - confirm).
        year_value: Report year; selects the 2024 or the 2023-and-earlier
            table format.

    Returns:
        True when the data already existed or processing finished without
        raising (even if no table or rows were found); False when any
        exception was raised, after printing the error.
    """
    if self.is_data_exists(pmr_value):
        print(f"Data already exists for {pmr_value}, skipping...")
        return True

    try:
        # [Previous navigation code remains the same until after the soup parsing]

        # Parse page content
        soup = BeautifulSoup(self.driver.page_source, "html.parser")
        tables = soup.find_all("table")

        # Extract performance data
        performance_table = self.find_performance_table(tables, year_value)
        if performance_table:
            # Pick the extractor that matches the report format of the year
            if int(year_value) >= 2024:
                extract = self.extract_performance_data_2024
            else:
                extract = self.extract_performance_data_2023
            headers, data = extract(performance_table)

            if data:
                # Add portfolio manager as first column of each row
                performance_rows = [[pmr_value] + row for row in data]

                # Create DataFrame and save
                performance_df = pd.DataFrame(performance_rows, columns=headers)
                print(f"Saving performance data for {pmr_value}:")
                print(performance_df)  # Debug print
                self.append_to_excel(performance_df, 'Performance Data')
            else:
                # Table was found but produced no usable rows
                print(f"No performance rows generated for {pmr_value}")
        else:
            print(f"No performance table found for {pmr_value}")

        # [Rest of the code for other data extraction remains the same]

        return True

    except Exception as e:
        # Per-manager boundary: report the failure and let the caller move on
        print(f"Error processing {pmr_value}: {str(e)}")
        return False
Editor is loading...
Leave a Comment