source: phonelux_scrappers/scrappers/tehnomarket_scrapper.py@ dfd5d87

Last change on this file since dfd5d87 was b68ae8d, checked in by Marko <Marko@…>, 23 months ago

Created spring app, edited setec_scrapper

  • Property mode set to 100644
File size: 3.4 KB
import unicodedata
from datetime import datetime
import psycopg2
import config_read
from bs4 import BeautifulSoup
from selenium import webdriver
import requests

import sys
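# All print output below goes to outputfile.txt, so each scraper run leaves a log file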
file_path = 'outputfile.txt'
sys.stdout = open(file_path, "w")

# Read the database configuration file and connect to the database
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3]
)
cur = db_connection.cursor()

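# Parses one listing page of Tehnomarket phones and inserts every offer into the phone_offers table;
# i is the page number that is expected to be active in the paginator.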
def scrape_function(driver1, i):
    offer_shop = "Tehnomarket"  # offer shop
    last_updated = datetime.now().date()
    is_validated = False

    # parse the page that Selenium currently has rendered
    tehnomarket_html = driver1.page_source
    soup1 = BeautifulSoup(tehnomarket_html, 'html.parser')
    active_li = soup1.find('div', {'class': 'adjust-elems pagination pagination-centered'}) \
        .find('li', {'class': 'active'})

    print('page: ' + active_li.get_text())

    # only scrape once the paginator actually shows the requested page number
    if int(active_li.get_text().strip()) == i:
        phones = soup1.find('ul', {'class': 'products products-display-grid thumbnails'}) \
            .find_all('li', {'class': 'span4 product-fix'})

        for phone in phones:
            offer_url = phone.find('a').get('href')
            offer_name = phone.find('div', {'class': 'product-name'}).get_text().strip()
            price = int(phone.find('div', {'class': 'product-price clearfix'}).find('strong')
                        .get_text().replace('ден.', '').replace(',', '').strip())

            # fetch the offer's detail page for the image, brand, shop code and specifications
            response2 = requests.get(offer_url)
            soup2 = BeautifulSoup(response2.content, 'html.parser')

            image = soup2.find('div', {'id': 'product_gallery'}).find('img')

            image_url = None
            if image is not None:
                image_url = image.get('src')

            details = soup2.find('div', {'class': 'product-desc'}).get_text().split('\n')

            brand = details[2].strip().capitalize()
            offer_shop_code = details[4].strip()

            specifications = []
            for info in soup2.find_all('span', {'class': 'info'}):
                specifications.append(info.get_text())

            print(brand)
            print(offer_name)
            print()
            print()

            offer_description = '\n'.join(specifications)

            insert_script = 'INSERT INTO phone_offers (offer_shop, brand, offer_name, price, image_url, offer_url,' \
                            ' offer_description, offer_shop_code, last_updated, is_validated)' \
                            ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s);'
            insert_value = (offer_shop, brand, offer_name, price, image_url, offer_url, offer_description,
                            offer_shop_code, last_updated, is_validated)
            cur.execute(insert_script, insert_value)
            db_connection.commit()
    else:
        # the requested page has not rendered yet: set an implicit wait and re-parse
        driver1.implicitly_wait(30)
        scrape_function(driver1, i)


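# The phone listing is split across 5 pages (#page/1 .. #page/5); open each one with Selenium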
for i in range(1, 6):
    tehnomarket_url = 'https://tehnomarket.com.mk/category/4109/mobilni-telefoni#page/' + str(i)
    # print(tehnomarket_url)

    # selenium is used because of the dynamic content of the page
    driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    driver1.get(tehnomarket_url)

    scrape_function(driver1, i)
    # closing the driver so the safari instance can pair with another webdriver session
    driver1.close()

cur.close()
db_connection.close()