source: phonelux_scrappers/scrappers/tehnomarket_scrapper.py@ 895cd87

Last change on this file since 895cd87 was 895cd87, checked in by Marko <Marko@…>, 21 months ago

Refactored code

  • Property mode set to 100644
File size: 5.6 KB
Line 
1import json
2import unicodedata
3from datetime import datetime
4import psycopg2
5import config_read
6from bs4 import BeautifulSoup
7from selenium import webdriver
8import requests
9import sys
10
11from classes.phoneoffer import PhoneOffer
12
# Redirect everything this script prints into a log file instead of the console.
# The encoding is pinned to UTF-8 because the scraped shop text is Cyrillic
# (the price label 'ден.' is stripped further down) and the platform default
# encoding is not guaranteed to be able to represent it.
file_path = 'outputfile.txt'
sys.stdout = open(file_path, "w", encoding="utf-8")
15
16
def scrape_function(driver1, i, new_offers, max_attempts=10, retry_delay=3.0):
    """Scrape listing page ``i`` of Tehnomarket and append its offers to ``new_offers``.

    The HTML currently held by the driver is parsed and the pagination widget
    is checked: scraping only starts once the entry marked ``active`` equals
    ``i``. Until then the page source is re-read, sleeping ``retry_delay``
    seconds between attempts.

    For every product tile on the listing, the product's own page is fetched
    with ``requests`` to collect the image URL, the shop code and the
    specification text, and one ``PhoneOffer`` is appended to ``new_offers``.

    Args:
        driver1: Selenium WebDriver that has already been pointed at the
            listing URL; only its ``page_source`` is read here.
        i: Page number that must be the active one in the pagination widget.
        new_offers: List that receives the scraped ``PhoneOffer`` objects
            (mutated in place).
        max_attempts: How many times the page source is inspected before
            giving up.
        retry_delay: Seconds to sleep between two inspections.

    Raises:
        RuntimeError: If page ``i`` never becomes the active page. Raising
            rather than returning quietly keeps the caller from continuing
            with an incomplete ``new_offers`` list.
    """
    # Function-scope import so the block does not depend on the file header.
    import time

    offer_shop = "Tehnomarket"  # offer shop
    last_updated = datetime.now().date()
    is_validated = False

    # Poll until the expected page is active. WebDriver.implicitly_wait() only
    # configures the element-lookup timeout and does not pause execution, so
    # an explicit sleep is used between attempts.
    for attempt in range(1, max_attempts + 1):
        soup1 = BeautifulSoup(driver1.page_source, 'html.parser')
        pagination = soup1.find('div', {'class': 'adjust-elems pagination pagination-centered'})
        active_li = pagination.find('li', {'class': 'active'}) if pagination is not None else None

        # A missing pagination widget is treated like "wrong page": retry.
        if active_li is not None:
            print('page: ' + active_li.get_text())
            if int(active_li.get_text().strip()) == i:
                break

        if attempt < max_attempts:
            time.sleep(retry_delay)
    else:
        raise RuntimeError('Tehnomarket listing page ' + str(i) + ' did not become active after '
                           + str(max_attempts) + ' attempts')

    phones = soup1.find('ul', {'class': 'products products-display-grid thumbnails'}).find_all(
        'li', {'class': 'span4 product-fix'})

    for phone in phones:
        offer_url = phone.find('a').get('href')
        offer_name = phone.find('div', {'class': 'product-name'}).get_text().strip()
        # The price text carries the currency label and thousands separators.
        price = int(phone.find('div', {'class': 'product-price clearfix'}).find('strong')
                    .get_text().replace('ден.', '').replace(',', '').strip())

        # The remaining fields are only available on the product's own page.
        response2 = requests.get(offer_url)
        soup2 = BeautifulSoup(response2.content, 'html.parser')

        # Both the gallery container and the image inside it may be absent.
        gallery = soup2.find('div', {'id': 'product_gallery'})
        image = gallery.find('img') if gallery is not None else None
        image_url = image.get('src') if image is not None else None

        # The shop code is read from the fifth line of the description text.
        details = soup2.find('div', {'class': 'product-desc'}).get_text().split('\n')
        offer_shop_code = details[4].strip()

        # No structured specifications are extracted for this shop; these
        # fields are passed to PhoneOffer as None.
        back_camera = None
        operating_system = None
        chipset = None
        battery = None
        ram_memory = None
        rom_memory = None
        cpu = None
        front_camera = None
        color = None

        # The free-text description is every 'info' span, one per line.
        specifications = [info.get_text() for info in soup2.find_all('span', {'class': 'info'})]
        offer_description = '\n'.join(specifications)

        new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
                                     color, front_camera, back_camera, chipset, battery,
                                     operating_system, cpu, image_url, offer_url, last_updated,
                                     is_validated, offer_description, offer_shop_code))
76
77
# Tehnomarket phone offers that are already in database.
# The response text is NFKD-normalized before it is parsed as JSON.
stored_response = requests.get('http://localhost:8080/phoneoffer/shop/tehnomarket')
offers = json.loads(unicodedata.normalize('NFKD', stored_response.text))

# JSON keys in the positional order they are handed to PhoneOffer.
# NOTE(review): this passes 19 values (id first) while scrape_function builds
# PhoneOffer from 18 positional values - confirm the constructor accepts both.
stored_offer_fields = (
    'id', 'offer_shop', 'offer_name', 'price',
    'ram_memory',
    'rom_memory', 'color', 'front_camera', 'back_camera',
    'chipset', 'battery', 'operating_system', 'cpu',
    'image_url',
    'offer_url', 'last_updated', 'is_validated',
    'offer_description',
    'offer_shop_code',
)

database_offers = [
    PhoneOffer(*[record[field] for field in stored_offer_fields])
    for record in offers
]
95
# Offers collected from the shop's listing pages during this run.
new_offers = []

for i in range(1, 6):
    tehnomarket_url = 'https://tehnomarket.com.mk/category/4109/mobilni-telefoni#page/' + str(i)

    # selenium is used because of the dynamic content of the page
    driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    try:
        driver1.get(tehnomarket_url)
        scrape_function(driver1, i, new_offers)
    finally:
        # Always end the WebDriver session, even when loading or scraping
        # fails, so the safari instance can pair with another webdriver
        # session on the next iteration or the next run.
        driver1.quit()
110
# Reconcile each scraped offer with the stored ones, matching on the shop code:
# unknown offers are added, known offers get their price updated when it differs.
for scraped in new_offers:
    stored_matches = [stored for stored in database_offers
                      if scraped.offer_shop_code == stored.offer_shop_code]

    if not stored_matches:
        print('ADDED')  # ADD OFFER
        print(scraped)
        requests.post('http://localhost:8080/phoneoffer/addoffer',
                      headers={'Content-type': 'application/json'},
                      data=json.dumps(scraped.__dict__, default=str))
        continue

    # Already in the database: only the price is checked. When several stored
    # offers share the code, the last one with a differing price is updated.
    repriced = [stored for stored in stored_matches if scraped.price != stored.price]
    if repriced:
        repriced_id = repriced[-1].offer_id
        print('PRICE CHANGED!')  # CHANGE PRICE
        print('offer id: ' + str(repriced_id))
        requests.put('http://localhost:8080/phoneoffer/' + str(repriced_id) + '/changeprice/' + str(scraped.price),
                     headers={'Content-type': 'application/json'})
140
print('------------------------------------')

# Delete every stored offer whose shop code was not found among the offers
# scraped in this run.
for stored in database_offers:
    still_listed = any(stored.offer_shop_code == scraped.offer_shop_code
                       for scraped in new_offers)
    if still_listed:
        continue

    print('OFFER DELETED')
    print(stored)
    # DELETE OFFER
    requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(stored.offer_id))
Note: See TracBrowser for help on using the repository browser.