Source code for src.kg.knowledge_graph

"""Knowledge graph module for this project.

.. autosummary::
    :nosignatures:

    KnowledgeBaseError
    KnowledgeGraphError
    KnowledgeBase
    KnowledgeGraph
    scrape_sbu_solar
    parse_requirements
    parse_prerequisites
    clean_course_title
    remove_non_numeric
    get_course_components
    get_sbu_cse_undergrad_course_offered_info
    get_sbu_cse_grad_course_offered_info
    get_sbu_cse_course_offered_info
"""

import os
import re
import time
import pandas as pd
import requests

from bs4 import BeautifulSoup
from dataclasses import dataclass
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

from typing import Any, Optional, List, Tuple, Union
from warnings import warn

from src.utils import util
from src.utils.util import timeit



[docs]
class KnowledgeBaseError(Exception):
    """Error type reserved for problems with a knowledge base."""




[docs]
class KnowledgeGraphError(Exception):
    """Error type reserved for problems with a knowledge graph."""




[docs]
@dataclass
class KnowledgeGraph:
    """Dataclass intended to encapsulate knowledge graphs.

    NOTE:
        - Only one knowledge graph file (or a non-empty data frame) needs to be specified.
        - A path whose extension does not match its field (e.g. ``json="file.txt"``) is discarded, i.e. the attribute becomes ``None``.
        - Accepted paths are converted to absolute paths.

    Example usage:
        >>> kg = KnowledgeGraph(json="path/to/file.json")
        >>> kg.json
        '/absolute/path/to/file.json'

    Raises:
        KnowledgeGraphError: Arises if neither a knowledge graph file nor a non-empty data frame is specified. Valid knowledge graph files include JSON, ERGO, RDF, OWL, CSV and LP files.
        FileNotFoundError: Arises if a specified file does not exist.

    Attributes:
        json: JSON knowledge graph file.
        ergo: ERGO knowledge graph file.
        rdf: RDF knowledge graph file.
        owl: OWL knowledge graph file.
        csv: CSV knowledge graph file.
        df: ``pandas`` data frame representation of the knowledge graph (``None`` if empty or not given).
        lp: Logic programming (``Clingo``) knowledge graph file.
    """

    json: str = ""
    ergo: str = ""
    rdf: str = ""
    owl: str = ""
    csv: str = ""
    df: Optional[pd.DataFrame] = None
    lp: str = ""

    def __post_init__(self):
        """Post-initialization function to verify knowledge graph file or representation.

        Raises:
            KnowledgeGraphError: Arises if the knowledge graph file or representation is not specified. Valid knowledge graph files include JSON, ERGO, RDF, OWL, CSV, LP files, and a valid representation includes a non-empty ``pandas data frame``.
            FileNotFoundError: Arises if the file does not exist.
        """
        # Each file attribute is named after the extension it must carry.
        file_fields = ("json", "ergo", "rdf", "owl", "csv", "lp")

        # Keep a path only when its extension matches the field; store it as
        # an absolute path, otherwise normalise the attribute to None.
        for name in file_fields:
            value = getattr(self, name)
            if value and value.lower().endswith(f".{name}"):
                setattr(self, name, os.path.abspath(value))
            else:
                setattr(self, name, None)

        # An empty data frame carries no information -- treat it as "not given".
        if self.df is not None and len(self.df) == 0:
            self.df = None

        # Check if knowledge graph file or representation is specified.
        # NOTE: ``self.df`` may be None here, so it must not be passed to len().
        has_file = any(getattr(self, name) for name in file_fields)
        if (not has_file) and (self.df is None):
            raise KnowledgeGraphError(
                "Knowledge graph file or representation must be specified."
            )

        # Check if file exists
        for name in file_fields:
            path = getattr(self, name)
            if path and (not os.path.exists(path)):
                raise FileNotFoundError(f"File not found: {path}")




[docs]
@dataclass
class KnowledgeBase:
    """Dataclass intended to encapsulate knowledge bases.

    NOTE:
        - Only one knowledge base file (or a URL) needs to be specified.
        - A path whose extension does not match its field is discarded, i.e. the attribute becomes ``None``.
        - Accepted paths are converted to absolute paths; the URL is stored unchanged.

    Example usage:
        >>> kb = KnowledgeBase(url="https://www.stonybrook.edu")
        >>> kb.url
        'https://www.stonybrook.edu'

    Raises:
        KnowledgeBaseError: Arises if the knowledge base file or representation is not specified. Valid knowledge base files include PDF, TXT, ERGO or LP files, and valid representations include a URL.
        FileNotFoundError: Arises if a specified file does not exist.

    Attributes:
        url: URL knowledge base website link.
        pdf: PDF knowledge base file.
        txt: TXT knowledge base file.
        ergo: ERGO knowledge base file.
        lp: Logic programming (``Clingo``) knowledge base file.
    """

    url: str = ""
    pdf: str = ""
    txt: str = ""
    ergo: str = ""
    lp: str = ""

    def __post_init__(self):
        """Post-initialization function to verify knowledge base file or representation.

        Raises:
            KnowledgeBaseError: Arises if the knowledge base file or representation is not specified. Valid knowledge base files include PDF, TXT, ERGO and LP files; a URL is also accepted.
            FileNotFoundError: Arises if the file does not exist.
        """
        # Each file attribute is named after the extension it must carry.
        file_fields = ("pdf", "txt", "ergo", "lp")

        # Keep a path only when its extension matches the field; store it as
        # an absolute path, otherwise normalise the attribute to None.
        for name in file_fields:
            value = getattr(self, name)
            if value and value.lower().endswith(f".{name}"):
                setattr(self, name, os.path.abspath(value))
            else:
                setattr(self, name, None)

        # Check if knowledge base file or representation is specified
        has_file = any(getattr(self, name) for name in file_fields)
        if (not self.url) and (not has_file):
            raise KnowledgeBaseError(
                "Knowledge base file or representation must be specified."
            )

        # Check if file exists (every file attribute, including ``lp``)
        for name in file_fields:
            path = getattr(self, name)
            if path and (not os.path.exists(path)):
                raise FileNotFoundError(f"File not found: {path}")




[docs]
def _element_text(driver: Any, element_id: str) -> str:
    """Return the text of the element with ID ``element_id``, or ``""`` if no such element exists."""
    try:
        return driver.find_element(By.ID, element_id).text
    except NoSuchElementException:
        return ""


def _scrape_course_catalog(
    driver: Any,
    major_three_letter_code: str,
    wait_time: int,
    verbose: bool,
) -> pd.DataFrame:
    """Navigate the already-opened catalog page and collect one row per course.

    Args:
        driver: ``Selenium WebDriver`` that has already loaded the catalog URL.
        major_three_letter_code: Upper-case major code (e.g. ``"CSE"``).
        wait_time: Maximum wait time (in seconds) for each click operation.
        verbose: Print progress to screen.

    Raises:
        ValueError: Arises if the course table is not displayed or is empty.

    Returns:
        Data frame with the catalog table columns plus the per-course details,
        with columns renamed to ``CourseNumber``, ``CourseTitle``, ``Credits`` and ``Prerequisites``.
    """
    # NOTE: If Major ID starts with 'A', skip this step.
    #
    # Click on the letter to navigate to the major
    nav_letter: str = major_three_letter_code[0]
    if nav_letter != "A":
        WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.LINK_TEXT, nav_letter))
        ).click()

    # NOTE: If major code is 'MAT', then partial
    #  link text is 'MAT - M' must be used to navigate
    #
    # Navigate to major
    if major_three_letter_code == "MAT":
        major_link_text: str = f"{major_three_letter_code} - M"
    else:
        major_link_text = major_three_letter_code
    WebDriverWait(driver, wait_time).until(
        EC.element_to_be_clickable((By.PARTIAL_LINK_TEXT, major_link_text))
    ).click()

    # Get table data
    time.sleep(wait_time // 2)  # Time to wait for javascript to load the table.

    # Keep only the tables that contain a "Course Nbr" link
    course_tables: List[Any] = []
    for candidate in driver.find_elements(By.TAG_NAME, "tbody"):
        try:
            candidate.find_element(By.PARTIAL_LINK_TEXT, "Course Nbr")
        except (AttributeError, NoSuchElementException):
            continue
        course_tables.append(candidate)

    if not course_tables:
        raise ValueError("Table is not displayed. Check the URL and major code.")
    table = course_tables[-1]  # Get the last table

    # Verify table
    if not table.is_displayed():
        raise ValueError("Table is not displayed. Check the URL and major code.")

    if not table.text:
        raise ValueError("Table is empty. Check the URL and major code.")

    # Extract headers
    headers = [header.text for header in table.find_elements(By.TAG_NAME, "th")]

    # Extract rows, skipping rows without table data cells
    rows = []
    for row in table.find_elements(By.TAG_NAME, "tr"):
        cells = [cell.text for cell in row.find_elements(By.TAG_NAME, "td")]
        if cells:
            rows.append(cells)

    # Create DataFrame
    df = pd.DataFrame(rows, columns=headers)

    # Clean course titles
    df["Course Title"] = [
        clean_course_title(title) for title in df["Course Title"].tolist()
    ]

    # Per-course details, keyed by the column name they are stored under.
    # The key order is the order in which the columns are appended.
    extra_columns = {
        "Career": [],
        "Units": [],
        "Grading Basis": [],
        "Enrollment Requirement": [],
        "Antirequisites": [],
        "Corequisites": [],
        "Course Components": [],
        "Academic Group": [],
        "Academic Organization": [],
        "Description": [],
    }
    course_numbers_with_three_letter_code: List[str] = []

    if verbose:
        print(f"\nScraping course information for {major_three_letter_code}...\n")

    # Get information for each course
    for course in df["Course Nbr"].tolist():

        if verbose:
            print(f"Processing course: {course}...")

        # Wait for the page to load and click on course number
        WebDriverWait(driver, wait_time).until(
            EC.element_to_be_clickable((By.LINK_TEXT, f"{course}"))
        ).click()

        # Use ID to find element -- it is unique.
        WebDriverWait(driver, wait_time).until(
            EC.presence_of_element_located(
                (By.ID, "win0divSSR_CRSE_OFF_VW_ACAD_CAREER$0")
            )
        ).click()

        career: str = _element_text(driver, "win0divSSR_CRSE_OFF_VW_ACAD_CAREER$0")

        # Course units: a float when the text is numeric, otherwise the raw
        # text (which is "" when the element is missing).
        units_text: str = _element_text(driver, "DERIVED_CRSECAT_UNITS_RANGE$0")
        units: Union[float, str]
        try:
            units = float(units_text)
        except ValueError:
            units = units_text

        # Grading basis
        grading_basis: str = _element_text(
            driver, "win0divSSR_CRSE_OFF_VW_GRADING_BASIS$0"
        )

        # Enrollment requirements (prerequisites, anti-requisites, and corequisites)
        enrollment_requirement, enrollment_anti_requisite, enrollment_corequisite = (
            parse_requirements(
                input_string=_element_text(driver, "DERIVED_CRSECAT_DESCR254A$0")
            )
        )

        # Course components
        course_components: Tuple[str, ...] = get_course_components(driver)

        academic_group: str = _element_text(driver, "ACAD_GROUP_TBL_DESCR$0")
        academic_organization: str = _element_text(
            driver, "win0divACAD_ORG_TBL_DESCR$0"
        )
        description: str = _element_text(driver, "SSR_CRSE_OFF_VW_DESCRLONG$0")

        # Course number with three letter code and no space, e.g. "CSE101"
        course_numbers_with_three_letter_code.append(
            f"{major_three_letter_code}{remove_non_numeric(course)}"
        )

        # Update lists
        extra_columns["Career"].append(career)
        extra_columns["Units"].append(units)
        extra_columns["Grading Basis"].append(grading_basis)
        extra_columns["Enrollment Requirement"].append(enrollment_requirement)
        extra_columns["Antirequisites"].append(enrollment_anti_requisite)
        extra_columns["Corequisites"].append(enrollment_corequisite)
        extra_columns["Course Components"].append(course_components)
        extra_columns["Academic Group"].append(academic_group)
        extra_columns["Academic Organization"].append(academic_organization)
        extra_columns["Description"].append(description)

        # Wait then click to go back to the course list
        WebDriverWait(driver, wait_time // 2).until(
            EC.element_to_be_clickable(
                (By.LINK_TEXT, "Return to Browse Course Catalog")
            )
        ).click()

    # Update DataFrame
    df["Course Nbr"] = course_numbers_with_three_letter_code
    for column_name, values in extra_columns.items():
        df.insert(len(df.columns), column_name, values)

    # Rename columns
    df.rename(
        columns={
            "Enrollment Requirement": "Prerequisites",
            "Units": "Credits",
            "Course Title": "CourseTitle",
            "Course Nbr": "CourseNumber",
        },
        inplace=True,
    )
    return df


def _add_semester_offerings(
    df: pd.DataFrame, major_three_letter_code: str
) -> pd.DataFrame:
    """Attach ``spring``/``fall`` offering flags, index by ``CourseNumber`` and drop unused columns.

    Args:
        df: Data frame produced by the catalog scrape (not modified; a copy is used).
        major_three_letter_code: Upper-case major code (e.g. ``"CSE"``).

    Returns:
        New data frame indexed by ``CourseNumber`` with ``spring`` and ``fall`` columns.
    """
    df = df.copy()

    if major_three_letter_code.upper() == "CSE":
        # Get course offering information for CSE
        offered = get_sbu_cse_course_offered_info(
            undergrad_url="https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses",  # Hard code as this is for CSE only
            grad_url="https://www.cs.stonybrook.edu/students/Graduate-Studies/courses",  # Hard code as this is for CSE only
        )

        combined_columns: List[str] = df.columns.tolist() + offered.columns.tolist()

        # NOTE(review): this concatenation pairs rows by position (row i of the
        #   catalog with row i of the offering table), not by course number.
        #   That only lines up if both sources list the same courses in the
        #   same order -- confirm against live data before relying on the flags.
        df = pd.concat([df, offered], axis=1, ignore_index=True).fillna(0)

        # Restore column names (ignore_index replaced them with integers)
        df.columns = combined_columns

        # Remove duplicate columns; copy so the assignment below acts on an
        # independent frame rather than a view of the concatenation result.
        df = df.loc[:, ~df.columns.duplicated()].copy()

        # Update course offering information for certain CSE courses
        #
        # TODO: Keep adding courses that are regularly offered but
        #       not scheduled here.
        #
        # The first column is presumably the course number -- verify against
        # the catalog table layout. ``str()`` guards against the integer 0
        # that ``fillna(0)`` places in rows that only exist in the offering table.
        always_offered = ("593", "600", "698", "487")
        mask = [
            any(code in str(first_cell) for code in always_offered)
            for first_cell in df.iloc[:, 0]
        ]
        df.iloc[mask, -4:] = 1
    else:
        # Non-CSE cases -- just assume that courses are offered all year round
        for column in ("spring1", "fall1", "spring2", "fall2"):
            df[column] = 1

    # Condense semester offering information to just one fall and spring column each
    df["spring"] = df[["spring1", "spring2"]].max(axis=1)
    df["fall"] = df[["fall1", "fall2"]].max(axis=1)

    # Replace index with the course number
    df = df.set_index("CourseNumber", drop=True)

    # Remove duplicate rows
    df = df[~df.index.duplicated(keep="first")]

    # Drop columns not needed by ErgoAI
    df = df.drop(
        columns=[
            "Academic Group",
            "Academic Organization",
            "Course Components",
            "Grading Basis",
            "spring1",
            "fall1",
            "spring2",
            "fall2",
        ]
    )
    return df


@timeit
def scrape_sbu_solar(
    url: Union[KnowledgeBase, str],
    major_three_letter_code: str,
    wait_time: int = 10,
    headless: bool = True,
    verbose: bool = False,
    output_filename: Optional[str] = None,
) -> KnowledgeGraph:
    """Scrape Stony Brook University's course catalog for a specific major's course information.
    This function scrapes Stony Brook University's course catalog and stores the information in a :class:`KnowledgeGraph` object.
    The course information includes course number, title, career, units, grading basis, enrollment requirements, anti-requisites, corequisites, course components, academic group, academic organization, and course description.
    Additionally, information to when courses are offered over a 4 semester span (specific only to CSE courses) is also included. This information is scraped from the CSE department's website, and is
    hardcoded for CSE courses only see (:func:`get_sbu_cse_course_offered_info`).

    WARNING:
        - This function uses a ``Selenium WebDriver`` and specific ``div`` IDs to scrape the course catalog.

    NOTE:
        - The browser is always closed, even if scraping fails part-way through.

    Usage example:
        >>> url = "https://prod.ps.stonybrook.edu/psc/csprodg/EMPLOYEE/CAMP/c/COMMUNITY_ACCESS.SSS_BROWSE_CATLG.GBL?"
        >>> kg = scrape_sbu_solar(
        ...        url=url,
        ...        major_three_letter_code="cse",
        ...        wait_time=10,
        ...        headless=True,
        ...        verbose=True,)

    Args:
        url: Input Stony Brook URL (or :class:`KnowledgeBase` object) to scrape.
        major_three_letter_code: Three letter code for the major (e.g. CSE for computer science).
        wait_time: Maximum wait time (in seconds) for each click operation. Defaults to 10.
        headless: Do not open browser. Defaults to True.
        verbose: Print output to screen. Defaults to False.
        output_filename: Output filename for the JSON file. Defaults to None.

    Raises:
        ValueError: Arises if the course table is not displayed, is empty, or if the wait time is less than 0 seconds.

    Returns:
        :class:`KnowledgeGraph` object containing course information that corresponds to an output JSON file.
    """
    # Verify output_filename -- ensure JSON extension
    if output_filename is not None:
        _path, _filename, _ = util.file_parts(output_filename)
        output_filename = os.path.join(_path, f"{_filename}.json")

    # Verify URL
    if isinstance(url, KnowledgeBase):
        url = url.url

    # Verify wait time is an integer and not negative
    wait_time = int(wait_time)
    if wait_time < 0:
        raise ValueError("Wait time must be greater than or equal to 0 seconds.")

    # Ensure major code is uppercase
    major_three_letter_code = major_three_letter_code.upper()

    # Headless option
    options = None
    if headless:
        options = webdriver.ChromeOptions()
        options.add_argument("headless")

    # Setup Selenium WebDriver; the finally clause guarantees the browser is
    # closed even when navigation or scraping raises.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        df = _scrape_course_catalog(
            driver=driver,
            major_three_letter_code=major_three_letter_code,
            wait_time=wait_time,
            verbose=verbose,
        )
    finally:
        driver.quit()

    # Add semester offering information and tidy up the frame
    df = _add_semester_offerings(df, major_three_letter_code)

    # Write to JSON file if requested
    if output_filename is not None:
        df.to_json(output_filename, orient="index", indent=4)

    # Create KnowledgeGraph object
    return KnowledgeGraph(df=df, json=output_filename)




[docs]
def parse_prerequisites(input_string: str) -> Union[str, List[List[str]]]:
    """Group the course codes found in a requirement string by semicolon-separated clause.

    The string is split on ``;``. Every clause contributes one sub-list holding all
    course codes of the form ``"AAA 123"`` (three capital letters, a space, three digits)
    found in it, so alternatives written in the same clause end up together while
    separate clauses yield separate sub-lists. Clauses without any course code
    (e.g. ``"CSE major"``) contribute nothing.

    WARNING:
        - This function is deprecated. Use :py:func:`~src.kg.knowledge_graph.parse_requirements` instead.

    Usage example:
        >>> input_string = "Prerequisite: CSE 216 or CSE 260; AMS 310; CSE major"
        >>> parse_prerequisites(input_string)
        [['CSE 216', 'CSE 260'], ['AMS 310']]

    Args:
        input_string: Input string containing major course requirements.

    Returns:
        List of lists of course code strings, or the string ``"NONE"`` when no course code is found.
    """
    warn(
        "``parse_prerequisites()`` is deprecated. Please use ``parse_requirements()`` instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    # NOTE: Course codes are assumed to always look like "AAA 123".
    code_regex = re.compile(r"\b([A-Z]{3} \d{3})\b")

    # One candidate group per semicolon-separated clause; empty groups are dropped.
    groups = [
        codes
        for codes in (code_regex.findall(clause) for clause in input_string.split(";"))
        if codes
    ]

    return groups if groups else "NONE"




[docs]
def parse_requirements(
    input_string: str,
) -> Tuple[
    Union[str, List[List[str]]],
    Union[str, List[List[str]]],
    Union[str, List[List[str]]],
]:
    """Parse major requirements from a string into a list of lists of course codes.
    This function is mainly used to separate disjunctions and conjunctions of course: prerequisites,
    anti-requisites and corequisites. Disjunctions are grouped together in the same sub-list, while conjunctions
    are separated into different sub-lists. Returns lists for prerequisites, anti-requisites, and corequisites.

    NOTE:
        - Disjunctive statements will appear in the same sub-list, while conjunctive statements (separated by ``;`` or ``" and "``) will appear in a separate sub-list.
        - Section keywords are matched case-insensitively; ``Co-requisite`` (with or without a trailing colon or plural ``s``) is treated as ``Corequisite:``.
        - Course codes are returned without spaces (e.g. ``"CSE216"``).
        - Use this function in place of :py:func:`~src.kg.knowledge_graph.parse_prerequisites`.

    Usage example:
        >>> input_string = "Prerequisite: CSE 216 or CSE 260; AMS 310; Anti-requisite: CSE 260. Corequisite: CSE 161."
        >>> parse_requirements(input_string)
        ([['CSE216', 'CSE260'], ['AMS310']], [['CSE260']], [['CSE161']])

    Args:
        input_string: Input string containing major course requirements.

    Returns:
        Tuple of course prerequisites, anti-requisites, and corequisites. Each entry is a list of lists of course codes, or the string ``"NONE"`` when that section has no course codes.
    """
    # Normalize spaces
    input_string = re.sub(r"\s+", " ", input_string.strip())

    # Normalize every spelling of "co-requisite" (any case, optional plural,
    # optional colon) to "Corequisite:" / "Corequisites:" so that exactly one
    # colon follows the keyword and the section pattern below can match it.
    input_string = re.sub(
        r"co-requisite(s?):?",
        r"Corequisite\1:",
        input_string,
        flags=re.IGNORECASE,
    )

    # Find each type of requirement by searching for specific keywords.
    # A section ends where the next section keyword (or the string) ends.
    prereq_match = re.search(
        r"Prerequisites?: (.*?)(?=(Anti-?requisite|Corequisite|$))",
        input_string,
        re.IGNORECASE,
    )
    antireq_match = re.search(
        r"Anti-?requisites?: (.*?)(?=(Prerequisite|Corequisite|$))",
        input_string,
        re.IGNORECASE,
    )
    coreq_match = re.search(
        r"Corequisites?: (.*?)(?=(Prerequisite|Anti-?requisite|$))",
        input_string,
        re.IGNORECASE,
    )

    def extract_courses(section: str) -> List[List[str]]:
        """Return one list of course codes per conjunctive part of ``section``."""
        if not section:
            return []
        # Split sections into parts based on semicolon or "and" as conjunctions
        parts = re.split(r";| and ", section)
        # For each part, find disjunctions or standalone courses and remove spaces
        courses = [
            re.findall(r"[A-Z]{2,4}\d{3,4}", part.replace(" ", "")) for part in parts
        ]
        # Filter out empty lists
        return [course for course in courses if course]

    # TODO: Remove prerequisite if it contains 'MAP'
    prerequisites = extract_courses(prereq_match.group(1) if prereq_match else "")
    anti_requisites = extract_courses(antireq_match.group(1) if antireq_match else "")
    corequisites = extract_courses(coreq_match.group(1) if coreq_match else "")

    # Sections without any course code are reported with the "NONE" sentinel
    return (
        prerequisites or "NONE",
        anti_requisites or "NONE",
        corequisites or "NONE",
    )




[docs]
def get_course_components(driver: "webdriver") -> Tuple[str, ...]:
    """Helper function to get course components. Course components may include more than one word.

    Elements with IDs ``DERIVED_CRSECAT_DESCR$0`` ... ``DERIVED_CRSECAT_DESCR$9`` are read in
    order; collection stops at the first missing ID, so at most 10 components are returned.

    Args:
        driver: (``Selenium WebDriver``) Input webdriver object.

    Returns:
        Tuple that consists of course components (empty if none are found).
    """
    course_components_list: List[str] = []
    for num in range(0, 10):
        try:
            course_components_list.append(
                driver.find_element(By.ID, f"DERIVED_CRSECAT_DESCR${num}").text
            )
        except NoSuchElementException:
            break

    # Return after the loop so a tuple is produced even when all 10 IDs exist
    return tuple(course_components_list)




[docs]
def clean_course_title(course_title: str) -> str:
    """Strip a trailing ``**...`` annotation and surrounding whitespace from a course title.

    Args:
        course_title: Course title string.

    Returns:
        The title with everything from the first '**' onwards removed, then stripped.
    """
    # Everything from '**' up to the end of the string is an annotation
    annotation_regex = re.compile(r"\*\*.*$")
    title_only = annotation_regex.sub("", course_title)
    return title_only.strip()




[docs]
def remove_non_numeric(course_number: str) -> str:
    """Keep only the digit characters of a course number.

    Args:
        course_number: Course number string.

    Returns:
        String made of the digits of ``course_number`` in their original order.
    """
    # Collect every digit and glue them back together
    digits = re.findall(r"\d", course_number)
    return "".join(digits)




[docs]
def get_sbu_cse_undergrad_course_offered_info(
    url: str, timeout: float = 30.0
) -> pd.DataFrame:
    """Scrape Stony Brook University's undergraduate CSE course offering webpage.

    Columns whose header contains ``Spring``, ``Summer`` or ``Fall`` are converted to
    ``1`` (cell contains '✔') or ``0`` (otherwise); all other columns keep their stripped text.

    Usage example:
        >>> url = "https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses"
        >>> df = get_sbu_cse_undergrad_course_offered_info(url=url)

    Args:
        url: URL of the Stony Brook University undergraduate course offering webpage.
        timeout: Maximum time (in seconds) to wait for the server. Defaults to 30.

    Raises:
        requests.HTTPError: Arises if the server answers with an error status code.
        ValueError: Arises if the course table cannot be found on the page.

    Returns:
        Pandas DataFrame containing the undergraduate course offering information.
    """
    # Get the HTML content of the page; fail fast instead of hanging forever
    # or parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table containing the course information
    table = soup.find("table", class_="views-table views-view-table cols-7")
    if table is None:
        raise ValueError(f"Course offering table not found at: {url}")

    # Get the headers of the table
    headers: List[Any] = [header.text.strip() for header in table.find_all("th")]

    # Decide once per column whether it is a semester column
    semester_terms = ("Spring", "Summer", "Fall")
    is_semester_column: List[bool] = [
        any(term in header for term in semester_terms) for header in headers
    ]

    # Iterate over each row in the table, skip the header row
    rows: List[Any] = []
    for row in table.find_all("tr")[1:]:
        columns: List[Any] = []
        for idx, col in enumerate(row.find_all("td")):
            text = col.text.strip()
            if is_semester_column[idx]:
                # Semester columns become an offered (1) / not offered (0) flag
                columns.append(1 if "✔" in text else 0)
            else:
                # Keep original text for non-semester columns
                columns.append(text)
        rows.append(columns)

    return pd.DataFrame(rows, columns=headers)




[docs]
def get_sbu_cse_grad_course_offered_info(
    url: str, timeout: float = 30.0
) -> pd.DataFrame:
    """Scrape Stony Brook University's CSE graduate course offering webpage.

    Columns whose header contains ``Spring``, ``Summer`` or ``Fall`` are converted to
    ``1`` (cell contains '✔') or ``0`` (otherwise); all other columns keep their stripped text.

    Usage example:
        >>> url = "https://www.cs.stonybrook.edu/students/Graduate-Studies/courses"
        >>> df = get_sbu_cse_grad_course_offered_info(url=url)

    Args:
        url: URL of the Stony Brook University graduate course offering webpage.
        timeout: Maximum time (in seconds) to wait for the server. Defaults to 30.

    Raises:
        requests.HTTPError: Arises if the server answers with an error status code.
        ValueError: Arises if the course table cannot be found on the page.

    Returns:
        Pandas DataFrame containing the graduate course offering information.
    """
    # Get the HTML content of the page; fail fast instead of hanging forever
    # or parsing an error page.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Find the table containing the course information
    table = soup.find("table", class_="views-table views-view-table cols-6")
    if table is None:
        raise ValueError(f"Course offering table not found at: {url}")

    # Get the headers of the table
    headers: List[Any] = [header.text.strip() for header in table.find_all("th")]

    # Decide once per column whether it is a semester column
    semester_terms = ("Spring", "Summer", "Fall")
    is_semester_column: List[bool] = [
        any(term in header for term in semester_terms) for header in headers
    ]

    # Iterate over each row in the table, skip the header row
    rows: List[Any] = []
    for row in table.find_all("tr")[1:]:
        columns: List[Any] = []
        for idx, col in enumerate(row.find_all("td")):
            text = col.text.strip()
            if is_semester_column[idx]:
                # Semester columns become an offered (1) / not offered (0) flag
                columns.append(1 if "✔" in text else 0)
            else:
                # Keep original text for non-semester columns
                columns.append(text)
        rows.append(columns)

    return pd.DataFrame(rows, columns=headers)




[docs]
def get_sbu_cse_course_offered_info(undergrad_url: str, grad_url: str) -> pd.DataFrame:
    """Build one course offering table from the CSE undergraduate and graduate webpages.

    Both pages are scraped, their rows are stacked (undergraduate first), the
    ``Summer 2024`` column is dropped and the remaining columns are renamed to
    ``CourseNumber``, ``CourseTitle``, ``spring1``, ``fall1``, ``spring2`` and ``fall2``.

    WARNING:
        - The URLs used in ``Usage example`` were (accessed and) current as of May 03 2024.
        - The semester column names (Spring 2023, Fall 2023, Spring 2024, Summer 2024 and Fall 2024) are hard-coded -- this will need to be updated in this function in the future.

    Usage example:
        >>> undergrad_url = "https://www.cs.stonybrook.edu/students/Undergraduate-Studies/csecourses"
        >>> grad_url = "https://www.cs.stonybrook.edu/students/Graduate-Studies/courses"
        >>> df = get_sbu_cse_course_offered_info(undergrad_url=undergrad_url, grad_url=grad_url)

    Args:
        undergrad_url: URL of the Stony Brook University undergraduate course offering webpage.
        grad_url: URL of the Stony Brook University graduate course offering webpage.

    Returns:
        Pandas DataFrame containing the undergraduate and graduate course offering information.
    """
    # Scrape both pages, undergraduate first
    undergrad_df = get_sbu_cse_undergrad_course_offered_info(url=undergrad_url)
    grad_df = get_sbu_cse_grad_course_offered_info(url=grad_url)

    # Stack the rows of both tables and discard the summer column
    stacked = pd.concat([undergrad_df, grad_df], axis=0, ignore_index=True)
    stacked = stacked.drop(columns=["Summer 2024"])

    # Map the website headers onto the names used by the rest of the module
    column_names = {
        "Course Name": "CourseNumber",
        "Course Title": "CourseTitle",
        "Spring 2023": "spring1",
        "Fall 2023": "fall1",
        "Spring 2024": "spring2",
        "Fall 2024": "fall2",
    }
    return stacked.rename(columns=column_names)