Source code for src.kg.knowledge_graph

"""Knowledge graph module for this project.

.. autosummary::
    :nosignatures:

    KnowledgeBaseError
    KnowledgeGraphError
    KnowledgeBase
    KnowledgeGraph
    scrape_sbu_solar
    parse_requirements
    parse_prerequisites
    clean_course_title
    remove_non_numeric
    get_course_components
    get_sbu_cse_undergrad_course_offered_info
    get_sbu_cse_grad_course_offered_info
    get_sbu_cse_course_offered_info
"""

import os
import re
import time
import pandas as pd
import requests

from bs4 import BeautifulSoup
from dataclasses import dataclass
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

from typing import Any, Optional, List, Tuple, Union
from warnings import warn

from src.utils import util
from src.utils.util import timeit


[docs] class KnowledgeBaseError(Exception): """Exception intended for knowledge base errors.""" pass
[docs] class KnowledgeGraphError(Exception): """Exception intended for knowledge graph errors.""" pass
[docs] @dataclass class KnowledgeGraph: """Dataclass intended to encapsulate knowledge graphs. NOTE: - Only one knowledge graph file needs to be specified. Example usage: >>> kg = KnowledgeGraph(json="path/to/file.json") >>> kg.json 'path/to/file.json' Raises: KnowledgeGraphError: Arises if the knowledge graph file is not specified. Valid knowledge graph files include JSON, ERGO, RDF, OWL, CSV files. Attributes: json: JSON knowledge graph file. ergo: ERGO knowledge graph file. rdf: RDF knowledge graph file. owl: OWL knowledge graph file. csv: CSV knowledge graph file. lp: Logic programming (``Clingo``) knowledge graph file. """ json: str = "" ergo: str = "" rdf: str = "" owl: str = "" csv: str = "" df: pd.DataFrame = pd.DataFrame() lp: str = "" def __post_init__(self): """Post-initialization function to verify knowledge graph file or representation. Raises: ValueError: Arises if the knowledge graph file or representation is not specified. Valid knowledge graph files include JSON, ERGO, RDF, OWL, CSV files, and a valid representation includes a ``pandas data frame``. FileNotFoundError: Arises if the file does not exist. """ self.json: str = ( os.path.abspath(self.json) if (self.json and self.json.lower().endswith(".json")) else None ) self.ergo: str = ( os.path.abspath(self.ergo) if (self.ergo and self.ergo.lower().endswith(".ergo")) else None ) self.rdf: str = ( os.path.abspath(self.rdf) if (self.rdf and self.rdf.lower().endswith(".rdf")) else None ) self.owl: str = ( os.path.abspath(self.owl) if (self.owl and self.owl.lower().endswith(".owl")) else None ) self.csv: str = ( os.path.abspath(self.csv) if (self.csv and self.csv.lower().endswith(".csv")) else None ) self.df: pd.DataFrame = self.df if (len(self.df) != 0) else None self.lp: str = ( os.path.abspath(self.lp) if (self.lp and self.lp.lower().endswith(".lp")) else None ) # Check if knowledge graph file or representation is specified if ( (not self.json) and (not self.ergo) and (not self.rdf) and (not self.owl) and (not self.csv) and (not self.lp) and (len(self.df) == 0) ): raise KnowledgeGraphError( "Knowledge graph file or representation must be specified." ) # Check if file exists for file in [self.json, self.ergo, self.rdf, self.owl, self.csv, self.lp]: if file and (not os.path.exists(file)): raise FileNotFoundError(f"File not found: {file}")
[docs] @dataclass class KnowledgeBase: """Dataclass intended to encapsulate knowledge bases. NOTE: - Only one knowledge base file needs to be specified. Example usage: >>> kb = KnowledgeBase(url="https://www.stonybrook.edu") >>> kb.url 'https://www.stonybrook.edu' Raises: KnowledgeBaseError: Arises if the knowledge base file or representation is not specified. Valid knowledge base files include PDF, TXT or ERGO files, and valid representations include a URL. Attributes: url: URL knowledge base website link. pdf: PDF knowledge base file. txt: TXT knowledge base file. ergo: ERGO knowledge base file. lp: Logic programming (``Clingo``) knowledge base file. """ url: str = "" pdf: str = "" txt: str = "" ergo: str = "" lp: str = "" def __post_init__(self): """Post-initialization function to verify knowledge base file or representation. Raises: KnowledgeBaseError: Arises if the knowledge base file or representation is not specified. Valid knowledge base files include PDF, TXT files. FileNotFoundError: Arises if the file does not exist. """ self.pdf: str = ( os.path.abspath(self.pdf) if (self.pdf and self.pdf.lower().endswith(".pdf")) else None ) self.txt: str = ( os.path.abspath(self.txt) if (self.txt and self.txt.lower().endswith(".txt")) else None ) self.ergo: str = ( (os.path.abspath(self.ergo)) if (self.ergo and self.ergo.lower().endswith(".ergo")) else None ) self.lp: str = ( (os.path.abspath(self.lp)) if (self.lp and self.lp.lower().endswith(".lp")) else None ) # Check if knowledge base file or representation is specified if ( (not self.url) and (not self.pdf) and (not self.txt) and (not self.ergo) and (not self.lp) ): raise KnowledgeBaseError( "Knowledge base file or representation must be specified." ) # Check if file exists for file in [self.pdf, self.txt, self.ergo]: if file and (not os.path.exists(file)): raise FileNotFoundError(f"File not found: {file}")
[docs] @timeit def scrape_sbu_solar( url: Union[KnowledgeBase, str], major_three_letter_code: str, wait_time: int = 10, headless: bool = True, verbose: bool = False, output_filename: Optional[str] = None, ) -> KnowledgeGraph: """Scrape Stony Brook University's course catalog for a specific major's course information. This function scrapes Stony Brook University's course catalog and stores the information in a :class:`KnowledgeGraph` object. The course information includes course number, title, career, units, grading basis, enrollment requirements, anti-requisites, corequisites, course components, academic group, academic organization, and course description. Additionally, information to when courses are offered over a 4 semester span (specific only to CSE courses) is also included. This information is scraped from the CSE department's website, and is hardcoded for CSE courses only see (:func:`get_sbu_cse_course_offered_info`). WARNING: - This function uses a ``Selenium WebDriver`` and specific ``div`` IDs to scrape the course catalog. Usage example: >>> url = "https://prod.ps.stonybrook.edu/psc/csprodg/EMPLOYEE/CAMP/c/COMMUNITY_ACCESS.SSS_BROWSE_CATLG.GBL?" >>> kg = scrape_sbu_solar( ... url=url, ... major_three_letter_code="cse", ... wait_time=10, ... headless=True, ... verbose=True,) Args: url: Input Stony Brook URL (or :class:`KnowledgeBase` object) to scrape. major_three_letter_code: Three letter code for the major (e.g. CSE for computer science). wait_time: Maximum wait time (in seconds) for each click operation. Defaults to 10. headless: Do not open brower. Defaults to True. verbose: Print output to screen. Defaults to False. output_filename: Output filename for the JSON file. Defaults to None. Raises: ValueError: Arises if the course table is not displayed, is empty, or if the wait time is less than 0 seconds. Returns: :class:`KnowledgeGraph` object containing course information that corresponds to an output JSON file. """ # Verify output_filename if output_filename is not None: _path, _filename, _ = util.file_parts(output_filename) output_filename: str = os.path.join( _path, f"{_filename}.json" ) # Ensure JSON extension _return_json: bool = True else: _return_json: bool = False # Verify URL if isinstance(url, KnowledgeBase): url: str = url.url # Verify wait time is integer and greater than 0 wait_time: int = int(wait_time) # Headless option if headless: options = webdriver.ChromeOptions() options.add_argument("headless") else: options = None if wait_time < 0: raise ValueError("Wait time must be greater than 0 seconds.") # Setup Selenium WebDriver driver = webdriver.Chrome(options=options) driver.get(url) # Get page navigation letter major_three_letter_code: str = ( major_three_letter_code.upper() ) # Ensure major code is uppercase nav_letter: str = major_three_letter_code[ 0 ].upper() # Get first letter of major code # NOTE: If Major ID starts with 'A', skip this step. # # Click on the letter to navigate to the major if nav_letter == "A": pass else: WebDriverWait(driver, wait_time).until( EC.element_to_be_clickable((By.LINK_TEXT, nav_letter)) ).click() # NOTE: If major code is 'MAT', then partial # link text is 'MAT - M' must be used to navigate # # Navigate to major if major_three_letter_code == "MAT": WebDriverWait(driver, wait_time).until( EC.element_to_be_clickable( (By.PARTIAL_LINK_TEXT, f"{major_three_letter_code} - M") ) ).click() else: WebDriverWait(driver, wait_time).until( EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, major_three_letter_code)) ).click() # Get table data time.sleep(wait_time // 2) # Time to wait for javascript to load the table. table: List[webdriver.remote.webelement.WebElement] = driver.find_elements( By.TAG_NAME, "tbody" ) # This will be a list of tables # Remove tables without course number # Perform this in reverse order to avoid index errors # if table is removed from the list for tab in reversed(table): try: tab.find_element(By.PARTIAL_LINK_TEXT, "Course Nbr") except (AttributeError, NoSuchElementException): table.remove(tab) try: table = table[-1] # Get the last table except IndexError: # return None raise ValueError("Table is not displayed. Check the URL and major code.") # Verify table if not table.is_displayed(): raise ValueError("Table is not displayed. Check the URL and major code.") if not table.text: raise ValueError("Table is empty. Check the URL and major code.") # Extract headers headers = [header.text for header in table.find_elements(By.TAG_NAME, "th")] # Extract rows rows = [] for row in table.find_elements(By.TAG_NAME, "tr"): cells = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")] if cells: # This check is to skip rows without table data cells rows.append(cells) # Create DataFrame df = pd.DataFrame(rows, columns=headers) # Create additional columns for course information _course_numbers: List[str] = df["Course Nbr"].tolist() # Get course numbers # Clean course titles _course_titles: List[str] = df["Course Title"].tolist() df["Course Title"] = [clean_course_title(title) for title in _course_titles] career_list: List[str] = [] units_list: List[str] = [] grading_basis_list: List[str] = [] enrollment_requirement_list: List[str] = [] enrollment_anti_requisite_list: List[str] = [] enrollment_corequisite_list: List[str] = [] course_components_list: List[Tuple[str, ...]] = [] academic_group_list: List[str] = [] academic_organization_list: List[str] = [] description_list: List[str] = [] course_numbers_with_three_letter_code: List[str] = [] if verbose: print(f"\nScraping course information for {major_three_letter_code}...\n") # Get information for each course for course in _course_numbers: if verbose: print(f"Processing course: {course}...") # Wait for the page to load and click on course number WebDriverWait(driver, wait_time).until( EC.element_to_be_clickable((By.LINK_TEXT, f"{course}")) ).click() # Use ID to find element -- it is unique. WebDriverWait(driver, wait_time).until( EC.presence_of_element_located( (By.ID, "win0divSSR_CRSE_OFF_VW_ACAD_CAREER$0") ) ).click() try: career: str = driver.find_element( By.ID, "win0divSSR_CRSE_OFF_VW_ACAD_CAREER$0" ).text except NoSuchElementException: career: str = "" # Course units try: units: str = float( driver.find_element(By.ID, "DERIVED_CRSECAT_UNITS_RANGE$0").text ) except (ValueError, NoSuchElementException): try: units: str = driver.find_element( By.ID, "DERIVED_CRSECAT_UNITS_RANGE$0" ).text except NoSuchElementException: units: str = "" # Grading basis try: grading_basis: str = driver.find_element( By.ID, "win0divSSR_CRSE_OFF_VW_GRADING_BASIS$0" ).text except NoSuchElementException: grading_basis: str = "" # Enrollment requirements (prerequisites, anti-requisites, and corequisites) try: _enrollment_requirement: str = driver.find_element( By.ID, "DERIVED_CRSECAT_DESCR254A$0" ).text except NoSuchElementException: _enrollment_requirement: str = "" enrollment_requirement: Union[str, List[List[str]]] enrollment_anti_requisite: Union[str, List[List[str]]] enrollment_corequisite: Union[str, List[List[str]]] enrollment_requirement, enrollment_anti_requisite, enrollment_corequisite = ( parse_requirements(input_string=_enrollment_requirement) ) # Course components course_components: Tuple[str, ...] = get_course_components(driver) # Academic group try: academic_group: str = driver.find_element( By.ID, "ACAD_GROUP_TBL_DESCR$0" ).text except NoSuchElementException: academic_group: str = "" # Academic organization try: academic_organization: str = driver.find_element( By.ID, "win0divACAD_ORG_TBL_DESCR$0" ).text except NoSuchElementException: academic_organization: str = "" # Course description try: description: str = driver.find_element( By.ID, "SSR_CRSE_OFF_VW_DESCRLONG$0" ).text except NoSuchElementException: description: str = "" # Append course number with three letter code course = remove_non_numeric(course) course_numbers_with_three_letter_code.append( f"{major_three_letter_code}{course}" # Remove space character, output should be: "CSE101" ) # Update lists career_list.append(career) units_list.append(units) grading_basis_list.append(grading_basis) enrollment_requirement_list.append(enrollment_requirement) enrollment_anti_requisite_list.append(enrollment_anti_requisite) enrollment_corequisite_list.append(enrollment_corequisite) course_components_list.append(course_components) academic_group_list.append(academic_group) academic_organization_list.append(academic_organization) description_list.append(description) # Wait then click to go back to the course list WebDriverWait(driver, wait_time // 2).until( EC.element_to_be_clickable( (By.LINK_TEXT, "Return to Browse Course Catalog") ) ).click() # Update DataFrame df["Course Nbr"] = course_numbers_with_three_letter_code df.insert(df.columns.__len__(), "Career", career_list) df.insert(df.columns.__len__(), "Units", units_list) df.insert(df.columns.__len__(), "Grading Basis", grading_basis_list) df.insert( df.columns.__len__(), "Enrollment Requirement", enrollment_requirement_list ) df.insert(df.columns.__len__(), "Antirequisites", enrollment_anti_requisite_list) df.insert(df.columns.__len__(), "Corequisites", enrollment_corequisite_list) df.insert(df.columns.__len__(), "Course Components", course_components_list) df.insert(df.columns.__len__(), "Academic Group", academic_group_list) df.insert(df.columns.__len__(), "Academic Organization", academic_organization_list) df.insert(df.columns.__len__(), "Description", description_list) # Rename columns df.rename( columns={ "Enrollment Requirement": "Prerequisites", "Units": "Credits", "Course Title": "CourseTitle", "Course Nbr": "CourseNumber", }, inplace=True, ) # Add semester offering information here if major_three_letter_code.upper() == "CSE": # Get course offering information for CSE _df = get_sbu_cse_course_offered_info( undergrad_url="https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses", # Hard code as this is for CSE only grad_url="https://www.cs.stonybrook.edu/students/Graduate-Studies/courses", # Hard code as this is for CSE only ) # Get dataframe headers current_columns: List[str] = df.columns.tolist() _df_columns: List[str] = _df.columns.tolist() # [ # -4: # ] # Only need the last 4 columns, as that has the course semester offering information new_columns: List[str] = current_columns + _df_columns # Concatenate along CourseNumber (columns) df2 = pd.concat([df, _df], axis=1, ignore_index=True) df = df2.fillna(0) # Rename columns in the new DataFrame df.columns = new_columns # Remove duplicate columns df = df.loc[:, ~df.columns.duplicated()] # Update course offering information for certain CSE courses # # TODO: Keep adding courses that are regularly offered but # not scheduled here. for rows in df.itertuples(): if ( ("593" in rows[1].lower()) or ("600" in rows[1].lower()) or ("698" in rows[1].lower()) or ("487" in rows[1].lower()) ): df.iloc[rows[0], -4:] = [1, 1, 1, 1] else: # Non-CSE cases -- just assume that courses are offered all year round new_columns: List[str] = ["spring1", "fall1", "spring2", "fall2"] for column in new_columns: df[column] = 1 # Condense semester offering information to just one fall and spring column each df["spring"] = df[["spring1", "spring2"]].max(axis=1) df["fall"] = df[["fall1", "fall2"]].max(axis=1) # Replace index with Course Nbr df.set_index( # "Course Nbr", "CourseNumber", inplace=True, drop=True, # Drop the Index column ) # Remove duplicate rows df = df[~df.index.duplicated(keep="first")] # Drop columns not needed by ErgoAI df.drop( [ # "Description", "Academic Group", "Academic Organization", "Course Components", "Grading Basis", "spring1", "fall1", "spring2", "fall2", ], axis=1, inplace=True, ) # Quit the driver, close the browser driver.quit() # Write to JSON file if requested if _return_json: df.to_json(output_filename, orient="index", indent=4) # Create KnowledgeGraph object kg = KnowledgeGraph(df=df, json=output_filename) return kg
[docs] def parse_prerequisites(input_string: str) -> Union[str, List[List[str]]]: """Parse major requirements from a string into a list of lists of course codes. This function is mainly used to separate disjunctions and conjunctions course prerequisites. Disjunctions are grouped together in the same sub-list, while conjunctions are separated into different sub-lists. For example, ``"Prerequisite: CSE 216 or CSE 260; AMS 310; CSE major"`` would be parsed as: ``[["CSE 216", "CSE 260"], ["AMS 310"], ["CSE major"]]``. WARNING: - This function is deprecated. Use :py:func:`~src.kg.knowledge_graph.parse_requirements` instead. Usage example: >>> input_string = "Prerequisite: CSE 216 or CSE 260; AMS 310; CSE major" >>> parse_prerequisites(input_string) [['CSE 216', 'CSE 260'], ['AMS 310'], ['CSE major']] Args: input_string: Input string containing major course requirements. Returns: List of lists of containing strings that corresponds to course prequisites. """ warn( "``parse_prerequisites()`` is deprecated. Please use ``parse_requirements()`` instead.", DeprecationWarning, stacklevel=2, ) # NOTE: Disjunction statements in the same sub-list, # conjunctions in separate lists # NOTE: This pattern assumes that course codes are always in # the format "AAA 123" # Define a regular expression pattern to capture course codes course_pattern = r"\b([A-Z]{3} \d{3})\b" # Split the input string into major requirements using the semi-colon conjunctive_parts = input_string.split(";") # Result list to hold parsed requirements result = [] for part in conjunctive_parts: # Find courses in each part part_courses = re.findall(course_pattern, part) # Add non-empty lists to the result if part_courses: result.append(part_courses) if not result: return "NONE" return result
[docs] def parse_requirements( input_string: str, ) -> Tuple[List[List[str]], List[List[str]], List[List[str]]]: """Parse major requirements from a string into a list of lists of course codes. This function is mainly used to separate disjunctions and conjunctions of course: prerequisites, anti-requisites and corequisites. Disjunctions are grouped together in the same sub-list, while conjunctions are separated into different sub-lists. Returns lists for prerequisites, anti-requisites, and corequisites. NOTE: - Disjunctive statements will appear in the same sub-list, while conjunctive statements will appear in a separate sub-list. - Use this function in place of :py:func:`~src.kg.knowledge_graph.parse_prerequisites`. Usage example: >>> input_string = "Prerequisite: CSE 216 or CSE 260; AMS 310; Anti-requisite: CSE 260. Corequisite: CSE 161." >>> parse_requirements(input_string) ([['CSE216', 'CSE260'], ['AMS310']], [['CSE260']], [['CSE161']]) Args: input_string: Input string containing major course requirements. Returns: Tuple of lists containing strings that corresponds to course prerequisites, anti-requisites, and corequisites. """ # Normalize spaces and split the string into main sections input_string = re.sub(r"\s+", " ", input_string.strip()) if "co-requisite" in input_string.lower(): input_string = input_string.replace("co-requisite", "corequisite:") # Find each type of requirement by searching for specific keywords prereq_match = re.search( r"Prerequisites?: (.*?)(?=(Anti-requisite|Corequisite|$))", input_string, re.IGNORECASE, ) antireq_match = re.search( r"Anti-requisite: (.*?)(?=(Prerequisite|Corequisite|$))", input_string, re.IGNORECASE, ) coreq_match = re.search( r"Corequisite: (.*?)(?=(Prerequisite|Anti-requisite|$))", input_string, re.IGNORECASE, ) def extract_courses(section): if not section: return [] # Split sections into parts based on semicolon or "and" as conjunctions parts = re.split(r";| and ", section) # For each part, find disjunctions or standalone courses and remove spaces courses = [ re.findall(r"[A-Z]{2,4}\d{3,4}", part.replace(" ", "")) for part in parts ] # Filter out empty lists return [course for course in courses if course] # TODO: Remove prerequisite if it contains 'MAP' prerequisites = extract_courses(prereq_match.group(1) if prereq_match else "") anti_requisites = extract_courses(antireq_match.group(1) if antireq_match else "") corequisites = extract_courses(coreq_match.group(1) if coreq_match else "") if not prerequisites: prerequisites = "NONE" if not anti_requisites: anti_requisites = "NONE" if not corequisites: corequisites = "NONE" return (prerequisites, anti_requisites, corequisites)
[docs] def get_course_components(driver: "webdriver") -> Tuple[str, ...]: """Helper function to get course components. Course components may include more than one word. Args: driver: (``Selenium WebDriver``) Input webdriver object. Returns: Tuple that consists of course components. """ course_components_list: List = [] for num in range(0, 10): try: course_components_list.append( driver.find_element(By.ID, f"DERIVED_CRSECAT_DESCR${num}").text ) except NoSuchElementException: return tuple(course_components_list)
[docs] def clean_course_title(course_title: str) -> str: """Clean course title by removing any additional information after '**'. Args: course_title: Course title string. Returns: Cleaned course title string. """ # Use a regular expression to match only the course title before '**' cleaned_title = re.sub(r"\*\*.*$", "", course_title).strip() return cleaned_title
[docs] def remove_non_numeric(course_number: str) -> str: """Remove any non-digit characters from the course number. Args: course_number: Course number string. Returns: Cleaned course number string. """ # Remove any non-digit characters from the string cleaned_number = re.sub(r"\D", "", course_number) return cleaned_number
[docs] def get_sbu_cse_undergrad_course_offered_info(url: str) -> pd.DataFrame: """Scrape Stony Brook University's undergraduate CSE course offering webpage. Usage example: >>> url = "https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses" >>> df = get_sbu_cse_undergrad_course_offered_info(url=url) Args: url: URL of the Stony Brook University undergraduate course offering webpage. Returns: Pandas DataFrame containing the undergraduate course offering information. """ # Get the HTML content of the page response = requests.get(url) # Parse the HTML content soup = BeautifulSoup(response.text, "html.parser") # Find the table containing the course information table = soup.find("table", class_="views-table views-view-table cols-7") # Get the headers of the table headers: List[Any] = [header.text.strip() for header in table.find_all("th")] # Get table rows rows: List[Any] = [] # Iterate over each row in the table, skip the header row for row in table.find_all("tr")[1:]: columns: List[Any] = [] for idx, col in enumerate(row.find_all("td")): text = col.text.strip() # Check if the header for this column is a semester column if any(term in headers[idx] for term in ["Spring", "Summer", "Fall"]): # Process for presence of '✔' if "✔" in text: columns.append(1) else: columns.append(0) else: # Keep original text for non-semester columns columns.append(text) rows.append(columns) df = pd.DataFrame(rows, columns=headers) return df
[docs] def get_sbu_cse_grad_course_offered_info(url: str) -> pd.DataFrame: """Scrape Stony Brook University's CSE graduate course offering webpage. Usage example: >>> url = "https://www.cs.stonybrook.edu/students/Graduate-Studies/courses" >>> df = get_sbu_cse_grad_course_offered_info(url=url) Args: url: URL of the Stony Brook University graduate course offering webpage. Returns: Pandas DataFrame containing the graduate course offering information. """ # Get the HTML content of the page response = requests.get(url) # Parse the HTML content soup = BeautifulSoup(response.text, "html.parser") # Find the table containing the course information table = soup.find("table", class_="views-table views-view-table cols-6") # Get the headers of the table headers: List[Any] = [header.text.strip() for header in table.find_all("th")] # Get table rows rows: List[Any] = [] # Iterate over each row in the table, skip the header row for row in table.find_all("tr")[1:]: columns: List[Any] = [] for idx, col in enumerate(row.find_all("td")): text = col.text.strip() # Check if the header for this column is a semester column if any(term in headers[idx] for term in ["Spring", "Summer", "Fall"]): # Process for presence of '✔' if "✔" in text: columns.append(1) else: columns.append(0) else: # Keep original text for non-semester columns columns.append(text) rows.append(columns) df = pd.DataFrame(rows, columns=headers) return df
[docs] def get_sbu_cse_course_offered_info(undergrad_url: str, grad_url: str) -> pd.DataFrame: """Scrape Stony Brook University's CSE undergraduate and graduate course offering webpages. WARNING: - The URLs used in ``Usage example`` were (accessed and) current as of May 03 2024. - The tables located at each URL contain information: Spring 2023, Fall 2023, Spring 2024, and Fall 2024 -- this will need to be updated in this function in the future. Usage example: >>> undergrad_url = "https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses" >>> grad_url = "https://www.cs.stonybrook.edu/students/Graduate-Studies/courses" >>> df = get_sbu_cse_course_offered_info(undergrad_url=undergrad_url, grad_url=grad_url) Args: undergrad_url: URL of the Stony Brook University undergraduate course offering webpage. grad_url: URL of the Stony Brook University graduate course offering webpage. Returns: Pandas DataFrame containing the undergraduate and graduate course offering information. """ # Scrape undergraduate course information df1 = get_sbu_cse_undergrad_course_offered_info(url=undergrad_url) # Scrape graduate course information df2 = get_sbu_cse_grad_course_offered_info(url=grad_url) # Combine both dataframes df = pd.concat([df1, df2], axis=0, ignore_index=True).drop(["Summer 2024"], axis=1) # Rename columns df.rename( columns={ "Course Name": "CourseNumber", "Course Title": "CourseTitle", "Spring 2023": "spring1", "Fall 2023": "fall1", "Spring 2024": "spring2", "Fall 2024": "fall2", }, inplace=True, ) return df