source: phonelux_scrappers/scrappers/tehnomarket_scrapper.py@ dfd5d87

Last change on this file since dfd5d87 was b68ae8d, checked in by Marko <Marko@…>, 23 months ago

Created spring app, edited setec_scrapper

  • Property mode set to 100644
File size: 3.4 KB
import unicodedata
from datetime import datetime
import psycopg2
import config_read
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

import sys
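# All print output below goes to outputfile.txt, so each scraper run leaves a log file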
file_path = 'outputfile.txt'
sys.stdout = open(file_path, "w")

# Read the database configuration file and connect to the database
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3]
)
cur = db_connection.cursor()

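# Parses one listing page of Tehnomarket phones and inserts every offer into the phone_offers table;
# i is the page number that is expected to be active in the paginator.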
def scrape_function(driver1, i):
    offer_shop = "Tehnomarket"  # offer shop
    last_updated = datetime.now().date()
    is_validated = False

    # parse the page that Selenium currently has rendered
    tehnomarket_html = driver1.page_source
    soup1 = BeautifulSoup(tehnomarket_html, 'html.parser')
    active_li = soup1.find('div', {'class': 'adjust-elems pagination pagination-centered'}) \
        .find('li', {'class': 'active'})

    print('page: ' + active_li.get_text())

    # only scrape once the paginator actually shows the requested page number
    if int(active_li.get_text().strip()) == i:
        phones = soup1.find('ul', {'class': 'products products-display-grid thumbnails'}) \
            .find_all('li', {'class': 'span4 product-fix'})

        for phone in phones:
            offer_url = phone.find('a').get('href')
            offer_name = phone.find('div', {'class': 'product-name'}).get_text().strip()
            price = int(phone.find('div', {'class': 'product-price clearfix'}).find('strong')
                        .get_text().replace('ден.', '').replace(',', '').strip())

            # fetch the offer's detail page for the image, brand, shop code and specifications
            response2 = requests.get(offer_url)
            soup2 = BeautifulSoup(response2.content, 'html.parser')

            image = soup2.find('div', {'id': 'product_gallery'}).find('img')

            image_url = None
            if image is not None:
                image_url = image.get('src')

            details = soup2.find('div', {'class': 'product-desc'}).get_text().split('\n')

            brand = details[2].strip().capitalize()
            offer_shop_code = details[4].strip()

            specifications = []
            for info in soup2.find_all('span', {'class': 'info'}):
                specifications.append(info.get_text())

            print(brand)
            print(offer_name)
            print()
            print()

            offer_description = '\n'.join(specifications)

            insert_script = 'INSERT INTO phone_offers (offer_shop, brand, offer_name, price, image_url, offer_url,' \
                            ' offer_description, offer_shop_code, last_updated, is_validated)' \
                            ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
            insert_value = (offer_shop, brand, offer_name, price, image_url, offer_url, offer_description,
                            offer_shop_code, last_updated, is_validated)
            cur.execute(insert_script, insert_value)
            db_connection.commit()
    else:
        # the requested page has not rendered yet: set an implicit wait and re-parse
        driver1.implicitly_wait(30)
        scrape_function(driver1, i)


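# The phone listing is split across 5 pages (#page/1 .. #page/5); open each one with Selenium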
for i in range(1, 6):
    tehnomarket_url = 'https://tehnomarket.com.mk/category/4109/mobilni-telefoni#page/' + str(i)
    # print(tehnomarket_url)

    # selenium is used because of the dynamic content of the page
    driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    driver1.get(tehnomarket_url)

    scrape_function(driver1, i)
    # closing the driver so the safari instance can pair with another webdriver session
    driver1.close()

cur.close()
db_connection.close()