CSCE 590 Web Scraping - Selenium

CSCE 590 Web Scraping - Selenium • Topics • Login to SelfServiceCarolina example • Readings: February 21, 2017

Take-Home

Login to SSC using Selenium

# Script entry point: prompt for credentials, run the scripted login, then
# always shut the browser down.
# NOTE: in a runnable file this guard has to come after the definitions of
# init_driver() and login(); the slides simply present it first.
if __name__ == "__main__":
    # vipID and password are module-level names that login() reads directly.
    vipID = input("Enter your VIP ID:")
    password = getpass.getpass('Enter your Password:')
    driver = init_driver()
    try:
        login(driver, "Selenium")
        # Leave the final page visible for a moment before closing.
        time.sleep(5)
    finally:
        # Quit even when login() raises, so no browser process is left behind.
        driver.quit()

# Open chromedriver
def init_driver(chromedriver_path="E:/chromedriver_win32/chromedriver.exe",
                timeout=5):
    """Start Chrome and attach an explicit-wait helper to the driver.

    Args:
        chromedriver_path: Location of the chromedriver executable. Defaults
            to the path used in the original slides.
        timeout: Seconds that ``driver.wait.until(...)`` waits before raising.

    Returns:
        The Chrome WebDriver, with a ``WebDriverWait`` stored on ``driver.wait``.
    """
    # NOTE(review): newer Selenium releases may expect the executable path to
    # be supplied through a Service object - confirm against the installed
    # version.
    driver = webdriver.Chrome(chromedriver_path)
    driver.wait = WebDriverWait(driver, timeout)
    return driver

def _click_when_present(driver, by, value):
    """Wait for the element located by (by, value) to exist, then click it.

    Raises TimeoutException if the element does not appear within the
    timeout configured on ``driver.wait``.
    """
    driver.wait.until(EC.presence_of_element_located((by, value))).click()


def login(driver, query):
    """Sign in through my.sc.edu, run the class search, and save the results.

    The browser is walked through: the "Sign in to" link, the login form,
    the faculty main menu, the course-search link, and the advanced search
    (term 201608, campus COL, subject MATH). The results page is written to
    ``workfile.html`` (prettified) and the cells of its table rows, skipping
    the first two ``<tr>`` elements, are written to ``listing.csv``.

    Args:
        driver: WebDriver carrying a ``wait`` attribute (see ``init_driver``).
        query: Not used by this function; kept so existing calls still work.

    The credentials come from the module-level names ``vipID`` and
    ``password`` set by the ``__main__`` block.

    Returns:
        None. A timeout in a navigation step is reported with ``print`` and
        the next step is still attempted; a timeout on the results page ends
        the function because there is nothing to save.
    """
    driver.get("https://my.sc.edu/")
    print("MySC opened")

    # First page: follow the "Sign in to ..." link.
    # URL noted in the original slides:
    # https://ssb.onecarolina.sc.edu/BANP/twbkwbis.P_WWWLogin?pkg=twbkwbis.P_GenMenu%3Fname%3Dbmenu.P_MainMnu
    try:
        link = driver.wait.until(EC.presence_of_element_located(
            (By.PARTIAL_LINK_TEXT, "Sign in to")))
        print("Found link", link)
        link.click()
    except TimeoutException:
        print("we have a problem First Page")

    # Login Page
    try:
        user_box = driver.wait.until(EC.presence_of_element_located(
            (By.NAME, "username")))
        user_box.send_keys(vipID)
        passwd_box = driver.wait.until(EC.presence_of_element_located(
            (By.ID, "vipid-password")))
        passwd_box.send_keys(password)
        button = driver.wait.until(EC.element_to_be_clickable(
            (By.NAME, "submit")))
        print("Found submit button", button)
        button.click()
        print("Signed in successfully")
    except TimeoutException:
        print("we have a problem Login Page")

    # Faculty Page
    try:
        print("Signed in successfully-- Main Menu Page")
        _click_when_present(driver, By.ID, "bmenu--P_FacMainMnu___UID3")
    except TimeoutException:
        print("we have a problem Main Page")
    print("Made it to the Faculty Page")
    try:
        _click_when_present(driver, By.ID, "bwskfcls--p_sel_crse_search___UID6")
    except TimeoutException:
        print("we have a problem Faculty Page")

    # Term selection and advanced search. Every lookup goes through the
    # explicit wait so that a missing element raises the TimeoutException
    # handled below; an immediate find_element lookup would raise a different
    # exception and may run before the next page has loaded.
    try:
        _click_when_present(
            driver, By.XPATH,
            "//select[@id='term_input_id']/option[@value='201608']")
        _click_when_present(driver, By.ID, "id____UID7")   # value="Submit"
        _click_when_present(driver, By.ID, "id____UID6")   # value="Advanced Search"
        _click_when_present(
            driver, By.XPATH, "//select[@id='camp_id']/option[@value='COL']")
        _click_when_present(
            driver, By.XPATH, "//select[@id='subj_id']/option[@value='MATH']")
        _click_when_present(driver, By.ID, "id____UID5")   # section search button
    except TimeoutException:
        print("we have a problem Faculty Page")

    # Section search results.
    try:
        sections = driver.wait.until(EC.presence_of_element_located(
            (By.CLASS_NAME, "datadisplaytable")))
    except TimeoutException:
        print("we have a problem Section Search Results Page")
        return  # no results page, so there is nothing to parse or save
    print("Sections=", sections)
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # Now that we have the page as "soup" let's generate a csv file from it.
    with open('workfile.html', 'w', encoding='utf-8') as outfile:
        outfile.write(soup.prettify())

    # Text mode with newline='' is what the csv module expects on Python 3;
    # cells are written as str rather than UTF-8 encoded bytes.
    with open('listing.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for tr in soup.find_all('tr')[2:]:
            writer.writerow([td.text for td in tr.find_all('td')])
    return

Cleaning your data – Chapter 7

# 1-2grams.py
from urllib.request import urlopen
from bs4 import BeautifulSoup


def getNgrams(input, n):
    """Return every run of ``n`` consecutive tokens of ``input``.

    The text is split on single spaces only; no other cleaning is done.

    Args:
        input: Text to split.
        n: Number of tokens per n-gram.

    Returns:
        A list of n-grams, each a list of ``n`` tokens, in order of
        appearance. Empty when the text has fewer than ``n`` tokens.
    """
    words = input.split(' ')
    return [words[i:i + n] for i in range(len(words) - n + 1)]


# Fetch the article, extract the main content text and list its 2-grams.
# The response is closed as soon as it has been parsed.
with urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)") as html:
    bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
ngrams = getNgrams(content, 2)
print(ngrams)
print("2-grams count is: " + str(len(ngrams)))

# -2-clean2grams.py
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import string
from collections import OrderedDict


# Outline of the script: both functions are placeholders on this slide.
def cleanInput(input):
    """Placeholder for the text-cleaning step."""
    pass


def getNgrams(input, n):
    """Placeholder for the n-gram counting step."""
    pass

def cleanInput(input):
    """Normalise raw page text into a list of word tokens.

    Steps, in order: newline runs become a single space, bracketed numeric
    citation markers such as ``[12]`` are removed, runs of spaces are
    collapsed, non-ASCII characters are dropped, the text is split on
    spaces, and each token is stripped of leading/trailing punctuation.

    Args:
        input: Text to clean.

    Returns:
        The tokens longer than one character, plus the single-letter words
        "a" and "i" (any case), in their original order.
    """
    text = re.sub(r'\n+', " ", input)
    text = re.sub(r'\[[0-9]*\]', "", text)
    text = re.sub(r' +', " ", text)
    # Round-trip through bytes to discard anything outside ASCII.
    text = bytes(text, "UTF-8").decode("ascii", "ignore")

    cleaned = []
    for item in text.split(' '):
        item = item.strip(string.punctuation)
        # Single characters are noise unless they are the words "a" or "I".
        if len(item) > 1 or item.lower() in ('a', 'i'):
            cleaned.append(item)
    return cleaned

def getNgrams(input, n):
    """Count the n-grams of ``input`` after cleaning it with ``cleanInput``.

    Args:
        input: Raw text.
        n: Number of tokens per n-gram.

    Returns:
        A dict mapping each n-gram (tokens joined by a single space) to the
        number of times it occurs, in order of first appearance.
    """
    words = cleanInput(input)
    output = dict()
    for i in range(len(words) - n + 1):
        newNGram = " ".join(words[i:i + n])
        output[newNGram] = output.get(newNGram, 0) + 1
    return output

# Fetch the article, count its cleaned 2-grams and print them from most to
# least frequent. The response is closed as soon as it has been parsed.
with urlopen("http://en.wikipedia.org/wiki/Python_(programming_language)") as html:
    bsObj = BeautifulSoup(html, "html.parser")
content = bsObj.find("div", {"id": "mw-content-text"}).get_text()
ngrams = getNgrams(content, 2)
# Sort by count (the dict value), highest first, and keep that order.
ngrams = OrderedDict(sorted(ngrams.items(), key=lambda t: t[1], reverse=True))
print(ngrams)

CSCE 590 Web Scraping - Selenium

CSCE 590 Web Scraping - Selenium

Presentation Transcript

Data Mining | Web Scraping

Selenium Web Test Tool Training

Web Scraping Services

Selenium Web Driver & Web Services

Web Scraping, Data Scraping, Web Extraction, Data Extraction - USA

Data scraping services - worth web scraping services

Web Scraping

Web Scraping Google

CSCE 590 Web Scraping – NLTK IE

CSCE 590 Web Scraping Lecture 6

590 Web Scraping – Handling Images

590 Scraping – Social Web

CSCE 590 Web Scraping – NLTK

590 Scraping – NER shape features

CSCE 590 Web Scraping - NLTK

Web data scraping services

Best Web Scraping Service

Web Scraping Services