source: phonelux_scrappers/scrappers/anhoch_scrapper.py@ dfd5d87

Last change on this file since dfd5d87 was b68ae8d, checked in by Marko <Marko@…>, 23 months ago

Created spring app, edited setec_scrapper

  • Property mode set to 100644
File size: 3.3 KB
Line 
1import time
2from datetime import datetime
3import psycopg2
4from selenium import webdriver
5from selenium.webdriver.common.by import By
6from selenium.webdriver.support.ui import WebDriverWait
7from selenium.webdriver.support import expected_conditions as EC
8
9import config_read
10from bs4 import BeautifulSoup
11import requests
12import unicodedata
13import sys
14
# Redirect every print() in this script to a log file so the scraped
# offers can be inspected after the run.
file_path = '../outputfile.txt'
sys.stdout = open(file_path, "w")

# Call to read the configuration file and connect to database.
# cinfo is positional: [database, host, user, password] — must match
# the order produced by config_read.get_databaseconfig.
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3]
)
# Shared cursor used by the (currently disabled) insert statements below.
cur = db_connection.cursor()
27
28
def scrape_function(driver1, i):
    """Scrape all phone offers from page ``i`` of the Anhoch category.

    The page content is rendered dynamically, so this polls the pagination
    widget until the active entry equals the expected page number ``i``,
    then extracts every listed phone offer and prints it to the log.

    :param driver1: selenium WebDriver with the Anhoch category URL loaded
    :param i: expected 1-based page number shown in the pagination widget
    """
    offer_shop = "Anhoch"  # offer shop label stored with every offer
    last_updated = datetime.now().date()
    is_validated = False

    # Poll until the dynamically rendered pagination shows page ``i``.
    # The original implementation recursed here, which risks hitting the
    # recursion limit if the page is slow to settle, and relied on
    # implicitly_wait(5) — which only affects element lookups, not
    # page_source, so it spun without pausing. A loop with a real sleep
    # is equivalent but safe.
    while True:
        anhoch_html = driver1.page_source
        soup1 = BeautifulSoup(anhoch_html, 'html.parser')
        active_li = soup1.find(
            'div', {'class': 'adjust-elems pagination pagination-centered'}
        ).find('li', {'class': 'active'})
        li_element = int(active_li.get_text().strip())
        print('page: ' + str(li_element))
        if li_element == i:
            break
        time.sleep(1)  # give the dynamic content time to update

    phones = soup1.find('section', {'id': 'main'}).find('div', {'class': 'span8'}) \
        .find('div', {'class': 'products'}).find_all('li')
    for phone in phones:
        offer_url = phone.find('a').get('href')
        image_url = phone.find('a').find('img').get('src')
        offer_name = phone.find('div', {'class': 'product-name'}).find('a').get_text().strip()
        price = int(phone.get('data-price'))
        brand = phone.find('div', {'class': 'product-price'}).find_all('div')[2].find('strong').get_text().strip()

        # Fetch the offer's detail page for the shop code and description.
        response2 = requests.get(offer_url)
        soup2 = BeautifulSoup(response2.content, 'html.parser')
        offer_shop_code = soup2.find('div', {'class': 'product-desc'}).get_text().strip().split('\n')[3]

        offer_description = soup2.find('div', {'class': 'description'}) \
            .find('div', {'class': 'tab-content'}).find('pre').get_text().strip()

        print(offer_name)
        print(brand)
        print()
        print()

        # NOTE(review): the database insert is deliberately disabled;
        # re-enable once the scraped fields are verified.
        # insert_script = 'INSERT INTO phone_offers (offer_shop, brand, offer_name , price, image_url, offer_url,' \
        #                 'offer_shop_code, offer_description, last_updated, is_validated)' \
        #                 ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
        # insert_value = (offer_shop, brand, offer_name, price, image_url, offer_url,
        #                 offer_shop_code, offer_description, last_updated, is_validated)
        # cur.execute(insert_script, insert_value)
        # db_connection.commit()
74
75
# Walk all 18 category pages; each page gets a fresh driver session.
for i in range(1, 19):
    anhoch_url = "https://www.anhoch.com/category/3017/smartfoni-i-mobilni-tel#page/" + str(i)
    # print(anhoch_url)

    # selenium is used because of the dynamic content of the page
    driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    try:
        driver1.get(anhoch_url)
        scrape_function(driver1, i)
    finally:
        # Always close the driver — even when scraping a page fails —
        # so the Safari instance can pair with another webdriver session.
        driver1.close()


# Release the database resources opened at module load.
cur.close()
db_connection.close()
Note: See TracBrowser for help on using the repository browser.