source: phonelux_scrappers/scrappers/anhoch_scrapper.py@ dfd5d87

Last change on this file since dfd5d87 was b68ae8d, checked in by Marko <Marko@…>, 23 months ago

Created spring app, edited setec_scrapper

  • Property mode set to 100644
File size: 3.3 KB
Line 
1import time
2from datetime import datetime
3import psycopg2
4from selenium import webdriver
5from selenium.webdriver.common.by import By
6from selenium.webdriver.support.ui import WebDriverWait
7from selenium.webdriver.support import expected_conditions as EC
8
9import config_read
10from bs4 import BeautifulSoup
11import requests
12import unicodedata
13import sys
14
# Redirect every print() in this script to a log file so the scraped
# offers can be inspected after the run.
file_path = '../outputfile.txt'
sys.stdout = open(file_path, "w")

# Call to read the configuration file and connect to database.
# cinfo is positional: [database, host, user, password] — must match
# the order produced by config_read.get_databaseconfig.
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3]
)
# Shared cursor used by the (currently disabled) insert statements below.
cur = db_connection.cursor()
27
28
def scrape_function(driver1, i):
    """Scrape all phone offers from page ``i`` of the Anhoch category.

    The page content is rendered dynamically, so this polls the pagination
    widget until the active entry equals the expected page number ``i``,
    then extracts every listed phone offer and prints it to the log.

    :param driver1: selenium WebDriver with the Anhoch category URL loaded
    :param i: expected 1-based page number shown in the pagination widget
    """
    offer_shop = "Anhoch"  # offer shop label stored with every offer
    last_updated = datetime.now().date()
    is_validated = False

    # Poll until the dynamically rendered pagination shows page ``i``.
    # The original implementation recursed here, which risks hitting the
    # recursion limit if the page is slow to settle, and relied on
    # implicitly_wait(5) — which only affects element lookups, not
    # page_source, so it spun without pausing. A loop with a real sleep
    # is equivalent but safe.
    while True:
        anhoch_html = driver1.page_source
        soup1 = BeautifulSoup(anhoch_html, 'html.parser')
        active_li = soup1.find(
            'div', {'class': 'adjust-elems pagination pagination-centered'}
        ).find('li', {'class': 'active'})
        li_element = int(active_li.get_text().strip())
        print('page: ' + str(li_element))
        if li_element == i:
            break
        time.sleep(1)  # give the dynamic content time to update

    phones = soup1.find('section', {'id': 'main'}).find('div', {'class': 'span8'}) \
        .find('div', {'class': 'products'}).find_all('li')
    for phone in phones:
        offer_url = phone.find('a').get('href')
        image_url = phone.find('a').find('img').get('src')
        offer_name = phone.find('div', {'class': 'product-name'}).find('a').get_text().strip()
        price = int(phone.get('data-price'))
        brand = phone.find('div', {'class': 'product-price'}).find_all('div')[2].find('strong').get_text().strip()

        # Fetch the offer's detail page for the shop code and description.
        response2 = requests.get(offer_url)
        soup2 = BeautifulSoup(response2.content, 'html.parser')
        offer_shop_code = soup2.find('div', {'class': 'product-desc'}).get_text().strip().split('\n')[3]

        offer_description = soup2.find('div', {'class': 'description'}) \
            .find('div', {'class': 'tab-content'}).find('pre').get_text().strip()

        print(offer_name)
        print(brand)
        print()
        print()

        # NOTE(review): the database insert is deliberately disabled;
        # re-enable once the scraped fields are verified.
        # insert_script = 'INSERT INTO phone_offers (offer_shop, brand, offer_name , price, image_url, offer_url,' \
        #                 'offer_shop_code, offer_description, last_updated, is_validated)' \
        #                 ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
        # insert_value = (offer_shop, brand, offer_name, price, image_url, offer_url,
        #                 offer_shop_code, offer_description, last_updated, is_validated)
        # cur.execute(insert_script, insert_value)
        # db_connection.commit()
74
75
# Walk all 18 category pages; each page gets a fresh driver session.
for i in range(1, 19):
    anhoch_url = "https://www.anhoch.com/category/3017/smartfoni-i-mobilni-tel#page/" + str(i)
    # print(anhoch_url)

    # selenium is used because of the dynamic content of the page
    driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    try:
        driver1.get(anhoch_url)
        scrape_function(driver1, i)
    finally:
        # Always close the driver — even when scraping a page fails —
        # so the Safari instance can pair with another webdriver session.
        driver1.close()


# Release the database resources opened at module load.
cur.close()
db_connection.close()
Note: See TracBrowser for help on using the repository browser.