source: phonelux_scrappers/scrappers/ledikom_scrapper.py@ 47f4eaf

Last change on this file since 47f4eaf was 47f4eaf, checked in by Marko <Marko@…>, 2 years ago

Final features implemented

  • Property mode set to 100644
File size: 8.2 KB
Line 
import json
import sys
import traceback
import unicodedata
from datetime import datetime
from urllib.parse import urljoin

import psycopg2
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

import config_read
from classes.phoneoffer import PhoneOffer

# Everything the script prints from here on (progress messages and the
# traceback of a failed run) goes to this log file instead of the console.
# The file is deliberately left open for the lifetime of the process.
# The encoding is explicit so that non-ASCII offer text can always be written,
# whatever the platform's default encoding is.
file_path = 'outputfile.txt'
sys.stdout = open(file_path, "w", encoding="utf-8")

# Values shared by every offer created during this run.
offer_shop = "Ledikom"  # offer shop
last_updated = datetime.now().date()  # date of this scraping run
is_validated = False  # freshly scraped offers start out unvalidated

# Read the database settings from the configuration file and connect to the
# database. The path is relative, so it is resolved against the current
# working directory of the process.
# `cinfo` is indexed positionally: 0 = database, 1 = host, 2 = user,
# 3 = password.
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3],
)
# Cursor used at the end of the run to record its outcome.
cur = db_connection.cursor()

# Brand listing pages on ledikom.mk that are scraped on every run.
ledikom_phone_urls = [
    'https://ledikom.mk/c/416/uredi/apple/iphone?limit=96',
    'https://ledikom.mk/c/421/uredi/samsung/telefoni?limit=96',
    'https://ledikom.mk/c/424/mobilni-telefoni/xiaomi/telefoni?limit=96',
    'https://ledikom.mk/c/430/uredi/huawei/telefoni?limit=96',
    'https://ledikom.mk/c/441/uredi/oneplus/telefoni?limit=96',
    'https://ledikom.mk/c/413/uredi/google/telefoni?limit=96',
    'https://ledikom.mk/c/411/uredi/honor/telefoni?limit=96',
    'https://ledikom.mk/c/460/uredi/nokia/telefoni?limit=96',
    'https://ledikom.mk/c/461/uredi/asus/telefoni?limit=96',
    'https://ledikom.mk/c/488/proizvodi/oppo/telefoni?limit=96',
]


def _fetch_page_source(url):
    """Return the HTML of *url* as rendered by a fresh Safari WebDriver session.

    Selenium is used because the page content is dynamic. The session is
    always ended with ``quit()``, even when loading the page raises, so the
    Safari instance can pair with the next WebDriver session.
    """
    # NOTE(review): `executable_path` is deprecated in Selenium 4 -- confirm
    # that the installed Selenium version still accepts it.
    driver = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    try:
        driver.get(url)
        return driver.page_source
    finally:
        driver.quit()


def _load_database_offers():
    """Return the Ledikom offers that are already in the database.

    The offers are fetched from the local phone-offer API, the response text
    is NFKD-normalised before JSON decoding, and each entry is turned into a
    PhoneOffer.
    """
    response = requests.get('http://localhost:8080/phoneoffer/shop/ledikom')
    offers = json.loads(unicodedata.normalize('NFKD', response.text))
    return [
        PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
                   offer['ram_memory'],
                   offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
                   offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
                   offer['image_url'],
                   offer['offer_url'], offer['last_updated'], offer['is_validated'],
                   offer['offer_description'],
                   offer['offer_shop_code'])
        for offer in offers
    ]


def _parse_price(price_text):
    """Convert a listing price string to an int.

    The currency marker ('ден.' or 'ден') and every '.' are removed before
    the remaining text is converted.
    """
    return int(price_text.replace('ден.', '')
               .replace('ден', '')
               .replace('.', '').strip())


def _joined_link_values(spec_row, use_title):
    """Return the comma-joined values of the <a> tags in one specification row.

    With ``use_title`` false the stripped link text is used, otherwise the
    link's ``title`` attribute.
    """
    links = spec_row.find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
    if use_title:
        return ','.join(link.get('title') for link in links)
    return ','.join(link.get_text().strip() for link in links)


def _scrape_offer_specs(offer_url, brand):
    """Return ``(color, rom_memory, ram_memory)`` scraped from an offer page.

    A value is None when the page has no specification row for it.
    """
    soup = BeautifulSoup(_fetch_page_source(offer_url), 'html.parser')
    specifications = soup.find('div', {'id': 'content'}).find('section', {'class': 'padding-section'}) \
        .find_all('div', {'class': 'container'})[1].find('div', {'class': 'col-md-7'}) \
        .find_all('div', {'class': 'row'})

    color = None
    rom_memory = None
    ram_memory = None

    # Rows are read by position: first row -> color (link text),
    # second row -> ROM (link title), third row -> RAM (link title).
    if len(specifications) >= 1:
        color = _joined_link_values(specifications[0], use_title=False)
    if len(specifications) >= 2:
        rom_memory = _joined_link_values(specifications[1], use_title=True)
    if len(specifications) >= 3:
        ram_memory = _joined_link_values(specifications[2], use_title=True)

    if 'Xiaomi' in brand:
        # Same rotation as the original pair of swaps: the first row becomes
        # ROM, the second RAM and the third the color.
        # Presumably Xiaomi pages list the rows in that order -- confirm
        # against a live page.
        color, rom_memory, ram_memory = ram_memory, color, rom_memory

    return color, rom_memory, ram_memory


def _scrape_listing(listing_url):
    """Return a PhoneOffer for every phone shown on one brand listing page.

    Each phone's own offer page is loaded as well to read its specification
    rows. An empty list is returned when the listing shows no phones.
    """
    soup = BeautifulSoup(_fetch_page_source(listing_url), 'html.parser')
    phones = soup.find('div', {'id': 'content'}) \
        .find('div', {'class': 'container'}).find('div', {'class': 'row'}).find('div', {'class': 'item-display'}) \
        .find_all('div', {'class': 'item-in-grid'})

    scraped = []
    for phone in phones:
        link = phone.find('a')
        # urljoin copes with both site-relative and absolute hrefs.
        offer_url = urljoin('https://ledikom.mk', link.get('href'))
        image_url = link.find('img').get('src')
        raw_name = phone.find('div', {'class': 'item-name'}).find('a').get_text().strip()
        offer_name = ' '.join(raw_name.split())  # collapse runs of whitespace
        brand = offer_name.split(' ')[0]
        price = _parse_price(phone.find('span', {'class': 'price'}).get_text())

        color, rom_memory, ram_memory = _scrape_offer_specs(offer_url, brand)

        # NOTE(review): this call passes one argument fewer (no id) than the
        # construction of database offers -- confirm that PhoneOffer's
        # signature supports both forms.
        scraped.append(PhoneOffer(
            offer_shop, offer_name, price, ram_memory, rom_memory, color,
            None, None, None, None, None, None,  # front_camera, back_camera, chipset, battery, operating_system, cpu
            image_url,
            offer_url, last_updated, is_validated,
            None, None))  # offer_description, offer_shop_code
    return scraped


def _sync_offers(new_offers, database_offers):
    """Bring the stored offers in line with the freshly scraped ones.

    Offers are matched by ``offer_name``:
    * a scraped offer with no stored match is added;
    * a scraped offer whose price differs from a stored match triggers a
      price change on that stored offer (the last differing one, if several
      stored offers share the name);
    * a stored offer with no scraped match is deleted.

    NOTE(review): the responses of the write requests are not checked, so a
    failed API call still leaves the run recorded as 'success'.
    """
    headers = {'Content-type': 'application/json'}

    # Index the stored offers by name once instead of rescanning the whole
    # list for every scraped offer.
    stored_by_name = {}
    for old_offer in database_offers:
        stored_by_name.setdefault(old_offer.offer_name, []).append(old_offer)

    for new_offer in new_offers:
        same_name = stored_by_name.get(new_offer.offer_name)
        if not same_name:
            print('ADDED')  # ADD OFFER
            print(new_offer)
            requests.post('http://localhost:8080/phoneoffer/addoffer',
                          headers=headers, data=json.dumps(new_offer.__dict__, default=str))
            continue

        # Already in database: update the price only if it has changed.
        repriced = [old_offer for old_offer in same_name if old_offer.price != new_offer.price]
        if repriced:
            offer_id = repriced[-1].offer_id
            print('PRICE CHANGED!')  # CHANGE PRICE
            print('offer id: ' + str(offer_id))
            requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
                         headers=headers)

    print('------------------------------------')

    scraped_names = {new_offer.offer_name for new_offer in new_offers}
    for old_offer in database_offers:
        if old_offer.offer_name not in scraped_names:
            print('OFFER DELETED')
            print(old_offer)
            # DELETE OFFER
            requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id))


def _record_run(status):
    """Insert the outcome of this run into ``scrapper_info`` and close the database.

    The cursor and the connection are closed even when the insert fails.
    """
    # 'recieved_at' is spelled exactly as in the original query; presumably
    # it is the real column name, so do not "correct" it here.
    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
                    ' VALUES (%s, %s, %s);'
    try:
        cur.execute(insert_script, (offer_shop, last_updated, status))
        db_connection.commit()
    finally:
        cur.close()
        db_connection.close()


try:
    # Ledikom phone offers that are already in database
    database_offers = _load_database_offers()

    new_offers = []
    for ledikom_url in ledikom_phone_urls:
        new_offers.extend(_scrape_listing(ledikom_url))

    _sync_offers(new_offers, database_offers)
except Exception:
    # Top-level boundary: log the traceback (to the redirected stdout) and
    # record the run as failed instead of crashing.
    traceback.print_exc()
    _record_run('failed')
else:
    _record_run('success')

Note: See TracBrowser for help on using the repository browser.