Part 1: Web Scraping with Python, Requests and BeautifulSoup
Preparations
Open a terminal and execute the following commands:
pip install requests
pip install beautifulsoup4
pip install pandas
pip install openpyxl
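To make sure everything installed correctly, you can run a quick optional import check from the same terminal:

python -c "import requests, bs4, pandas, openpyxl; print('All modules installed')"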
The script to scrape all the Pokemons
The following script scrapes all the Pokemons from the Pokemon webshop and stores the data in pokemons.xlsx and pokemons.csv.
web-scraping.py
#####################################################################################
############################### IMPORTING MODULES AND INITIALIZING VARIABLES ########
#####################################################################################
import requests
from bs4 import BeautifulSoup
import pandas as pd

current_page = 1
data = []

#####################################################################################
############################### LOOPING THROUGH ALL PAGES ###########################
#####################################################################################
while True:
    print("Currently scraping page: " + str(current_page))
    ############################### FETCHING THE PAGE ##############################
    url = "https://scrapeme.live/shop/page/" + str(current_page) + "/"
    page = requests.get(url)
    ############################### PARSING THE PAGE ###############################
    soup = BeautifulSoup(page.text, "html.parser")
    # stop as soon as the shop serves its "Page not found" page
    if soup.title and "Page not found" in soup.title.string:
        break
    all_items = soup.find_all("li", class_="product")
    ############################### LOOPING THROUGH EVERY ELEMENT ON PAGE ##########
    for item in all_items:
        row = {}
        ############################### FETCHING DATA POINTS OF THE ELEMENT #######
        row['Name'] = item.find("h2", class_="woocommerce-loop-product__title").text
        row['Price'] = item.find("span", class_="woocommerce-Price-amount amount").text.replace("£", "").replace(".00", "")
        row['URL'] = item.find("a").get("href")
        data.append(row)
    current_page += 1

############################### WRITING TO EXCEL / CSV #############################
df = pd.DataFrame(data)
df.to_excel("pokemons.xlsx", index=False)
df.to_csv("pokemons.csv", index=False)
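To sanity-check the result, you can read the file back with pandas and inspect the first rows. This is an optional check, not part of the script above:

import pandas as pd

df = pd.read_excel("pokemons.xlsx")
print(df.head())                      # first rows: Name, Price, URL
print(str(len(df)) + " Pokemons scraped")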
The script to scrape the details from the inner page of every Pokemon:
web-scraping-all-pages.py
#####################################################################################
############################### IMPORTING MODULES AND INITIALIZING VARIABLES ########
#####################################################################################
import requests
from bs4 import BeautifulSoup
import pandas as pd

pokemons = pd.read_excel("pokemons.xlsx")

#####################################################################################
############################### REQUESTING TO SCRAPE EVERY INDIVIDUAL PAGE ##########
#####################################################################################
pokemons.insert(3, "Stock", "")
pokemons.insert(4, "Short_description", "")
pokemons.insert(5, "SKU", "")

i = 0
for ind in pokemons.index:
    print("Currently scraping: " + pokemons['URL'][ind])
    url = pokemons['URL'][ind]
    page = requests.get(url)
    ############################### PARSING THE PAGE ###############################
    soup = BeautifulSoup(page.text, "html.parser")
    # writing individual fields to the dataframe; .loc avoids pandas'
    # chained-assignment pitfalls, so no warning needs to be suppressed
    pokemons.loc[ind, 'Stock'] = soup.find("p", class_="stock").text.replace(" in stock", "")
    pokemons.loc[ind, 'Short_description'] = soup.find("div", class_="woocommerce-product-details__short-description").text
    pokemons.loc[ind, 'SKU'] = soup.find("span", class_="sku").text
    i += 1
    # only the first 10 detail pages are scraped; remove this check to scrape them all
    if i == 10:
        break

# writing data to Excel and to a pipe-separated CSV
pokemons.to_excel("pokemons_complete.xlsx", index=False)
pokemons.to_csv("pokemons_complete.csv", index=False, sep="|")
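Note that the CSV is written with a pipe separator, so it has to be read back with the same sep argument. A minimal example:

import pandas as pd

# the detail file was written with sep="|", so pass the same separator when reading
details = pd.read_csv("pokemons_complete.csv", sep="|")
print(details[["Name", "Stock", "SKU"]].head())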
Part 2: Scraping with AgentQL
Preparations
You can find all the installation steps on the AgentQL website.
pip install pandas
pip install openpyxl
pip install agentql
agentql init
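As in Part 1, you can optionally verify that the Python packages import cleanly (assuming agentql init finished without errors):

python -c "import agentql, playwright, pandas; print('AgentQL setup looks OK')"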
The script to scrape the Pokemons from the website with AgentQL:
import agentql
from playwright.sync_api import sync_playwright
import pandas as pd

page_nr = 1
data = []

# use your own words to describe what you're looking for
QUERY = """
{
    products[] {
        name
        price
    }
}
"""

with sync_playwright() as playwright, playwright.chromium.launch(headless=False) as browser:
    page = agentql.wrap(browser.new_page())
    # scrape the first 4 pages of the shop
    while page_nr <= 4:
        page.goto("https://scrapeme.live/shop/page/" + str(page_nr) + "/")
        # query_data returns the structured data described in QUERY
        response = page.query_data(QUERY)
        for product in response['products']:
            data.append({"name": product['name'], "price": str(product['price']).replace("£", "").replace(".00", "")})
        page_nr += 1

df = pd.DataFrame(data)
df.to_excel("pokemons_agentql.xlsx", index=False)
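Because AgentQL returns the price as text, converting it to a number makes the output easier to work with. A small optional follow-up sketch:

import pandas as pd

df = pd.read_excel("pokemons_agentql.xlsx")
df["price"] = pd.to_numeric(df["price"])  # "£" and ".00" were already stripped in the scraper
print(df.sort_values("price", ascending=False).head())  # five most expensive Pokemons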