From 8690994b939626bb81b6f21049f38bb31a8aa21c Mon Sep 17 00:00:00 2001 From: rav4s Date: Wed, 20 Jan 2021 12:55:10 -0600 Subject: [PATCH] Added code to scrape iFixit smartphone website and generate lists of each metric --- scraper.py | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/scraper.py b/scraper.py index bd35e42..9f0f1ac 100644 --- a/scraper.py +++ b/scraper.py @@ -1 +1,53 @@ import time +from bs4 import BeautifulSoup +import requests +import re + +def get_device_manufacturers(device_names): + list_of_manufacturers_without_tags = [] + for i in device_names: + i = str(i) + i = i.replace('
\n','') + i = i.lstrip() + i = i.split('<')[0] + i = i.rstrip() + list_of_manufacturers_without_tags.append(i) + return list_of_manufacturers_without_tags + +def get_device_models(device_names, soup): + list_of_models_without_tags = [] + list_of_models = soup.findAll("span", {"class": "selected"}) + for i in list_of_models: + i = str(i) + i = i.replace('','').replace('','') + list_of_models_without_tags.append(i) + return list_of_models_without_tags + +def format_device_scores(device_scores, soup): + list_of_scores_without_tag = [] + list_of_scores = soup.findAll("h3") + for i in list_of_scores: + i = str(i) + i = i.replace('

','').replace('

','') + list_of_scores_without_tag.append(i) + return list_of_scores_without_tag + +def data_from_each(list_of_manufacturers_without_tags, list_of_models_without_tags, list_of_scores_without_tag): + j = 0 + for i in list_of_manufacturers_without_tags: + print(list_of_manufacturers_without_tags[j], list_of_models_without_tags[j], list_of_scores_without_tag[j]) + j = j + 1 + +def main(): + link = "https://www.ifixit.com/smartphone-repairability" + page = requests.get(link) + soup = BeautifulSoup(page.content, 'html.parser') + device_names = soup.findAll("div", {"class": "cell device-name"}) + device_scores = soup.findAll("div", {"class": "cell device-score"}) + list_of_manufacturers_without_tags = get_device_manufacturers(device_names) + list_of_models_without_tags = get_device_models(device_names, soup) + list_of_scores_without_tag = format_device_scores(device_scores, soup) + data_from_each(list_of_manufacturers_without_tags, list_of_models_without_tags, list_of_scores_without_tag) + +if __name__ == '__main__': + main()