source: phonelux_scrappers/scrappers/neptun_scrapper.py@ 47f4eaf

Last change on this file since 47f4eaf was 47f4eaf, checked in by Marko <Marko@…>, 2 years ago

Final features implemented

  • Property mode set to 100644
File size: 8.0 KB
Line 
1import json
2import traceback
3import unicodedata
4from datetime import datetime
5import psycopg2
6import config_read
7from bs4 import BeautifulSoup
8from selenium import webdriver
9import requests
10
11import sys
12
13from classes.phoneoffer import PhoneOffer
14
# Everything this script print()s is redirected into this file. The handle is
# intentionally left open for the lifetime of the process.
file_path = 'outputfile.txt'
# Explicit UTF-8: the printed offers contain Cyrillic text (e.g. the
# 'Батерија:' specification labels), which the platform default encoding may
# not be able to represent.
sys.stdout = open(file_path, "w", encoding="utf-8")

offer_shop = "Neptun"  # shop name stored with every scraped offer and status row
last_updated = datetime.now().date()  # date stamp shared by all offers of this run
is_validated = False  # freshly scraped offers start out unvalidated
21
# Read the database settings from the shared configuration file, then open the
# PostgreSQL connection and the cursor used for the scrapper_info status row.
cinfo = config_read.get_databaseconfig("../postgresdb.config")
connection_settings = {
    'database': cinfo[0],
    'host': cinfo[1],
    'user': cinfo[2],
    'password': cinfo[3],
}
db_connection = psycopg2.connect(**connection_settings)
cur = db_connection.cursor()
31
# ---------------------------------------------------------------------------
# Neptun scraper workflow
#
# 1. Load the Neptun offers that are already stored (via the local REST API).
# 2. Scrape listing pages 1-10 and every offer's detail page with Selenium.
# 3. Sync: add new offers, update changed prices, delete vanished offers.
# 4. Record 'success' or 'failed' for this run in the scrapper_info table.
# ---------------------------------------------------------------------------

API_BASE_URL = 'http://localhost:8080/phoneoffer'
NEPTUN_BASE_URL = 'https://www.neptun.mk'
JSON_HEADERS = {'Content-type': 'application/json'}


def _fetch_page_source(url):
    """Return the rendered HTML of *url* using a fresh Safari WebDriver session.

    Selenium is used because the content of the page is rendered dynamically.
    """
    driver = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # Always end the session (also when loading fails) so the Safari
        # instance can pair with the next WebDriver session. quit() ends the
        # whole session, whereas close() only closes the current window.
        driver.quit()


def _parse_specifications(specifications):
    """Pick the known hardware fields out of the specification lines.

    Returns a dict with the keys battery, cpu, chipset, ram_memory,
    rom_memory and operating_system; a field that never appears stays None.
    When a label appears on several lines, the last occurrence wins.
    """
    parsed = dict.fromkeys(
        ('battery', 'cpu', 'chipset', 'ram_memory', 'rom_memory', 'operating_system'))

    for specification in specifications:
        if 'Батерија:' in specification:
            parsed['battery'] = specification.split('Батерија:')[1]

        if 'CPU:' in specification:
            parsed['cpu'] = specification.split('CPU:')[1]

        if 'Chipset:' in specification:
            parsed['chipset'] = specification.split('Chipset:')[1]

        # The long Macedonian labels end the handling of the line; the short
        # 'ROM:' / 'RAM:' / OS checks below are skipped for such lines.
        if 'RAM Меморија:' in specification:
            parsed['ram_memory'] = specification.split('RAM Меморија:')[1]
            continue

        if 'ROM Меморија:' in specification:
            parsed['rom_memory'] = specification.split('ROM Меморија:')[1]
            continue

        if 'ROM:' in specification:
            parsed['rom_memory'] = specification.split('ROM:')[1]

        if 'RAM:' in specification:
            parsed['ram_memory'] = specification.split('RAM:')[1]

        if 'iOS' in specification or 'Android' in specification:
            parsed['operating_system'] = specification

    return parsed


def _scrape_offer(phone):
    """Build a PhoneOffer from one product tile of a Neptun listing page.

    The tile supplies name, URL, image and price; the offer's detail page is
    fetched to obtain the shop code and the specification list.
    """
    link = phone.find('a')
    offer_url = NEPTUN_BASE_URL + link.get('href')
    offer_name = link.find('h2').get_text().replace('MOB.TEL.', '').strip()
    image_url = NEPTUN_BASE_URL + link.find('div', {'class': 'row'}).find('img').get('src')

    price_text = phone.find('div', {'class': 'col-sm-12 static'}) \
        .find('div', {'class': 'product-list-item__prices pt35'}) \
        .find('div', {'class': 'row'}) \
        .find('div', {'class': 'newPriceModel'}) \
        .find('span', {'class': 'product-price__amount--value ng-binding'}) \
        .get_text()
    # Dots are removed before the integer conversion.
    price = int(price_text.replace('.', ''))

    detail_soup = BeautifulSoup(_fetch_page_source(offer_url), 'html.parser')

    offer_shop_code = detail_soup.find('div', {'ng-if': 'showProductDetails'}) \
        .find('div', {'class': 'product-details-first-row'}) \
        .find('span', {'ng-bind': 'model.CodeNumber'}) \
        .get_text().strip()

    # The specifications are taken from the last <ul> of the details section.
    specifications_table = detail_soup.find('div', {'id': 'mainContainer'}) \
        .find('div', {'ng-if': 'showProductDetails'}) \
        .find_all('ul')[-1]
    offer_description = specifications_table.get_text(separator='\n').strip()
    specs = _parse_specifications(offer_description.split("\n"))

    # Color and camera details are not extracted by this scraper.
    color = None
    front_camera = None
    back_camera = None

    return PhoneOffer(offer_shop, offer_name, price, specs['ram_memory'], specs['rom_memory'],
                      color, front_camera, back_camera, specs['chipset'], specs['battery'],
                      specs['operating_system'], specs['cpu'],
                      image_url,
                      offer_url, last_updated, is_validated, offer_description, offer_shop_code)


def _record_run_status(status):
    """Insert this run's outcome into scrapper_info and release the database handles."""
    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
                    ' VALUES (%s, %s, %s);'
    try:
        cur.execute(insert_script, (offer_shop, last_updated, status))
        db_connection.commit()
    finally:
        # Close cursor and connection even if the insert itself fails.
        cur.close()
        db_connection.close()


try:
    # Neptun phone offers that are already in database
    offers = json.loads(unicodedata.normalize('NFKD', requests.get(API_BASE_URL + '/shop/neptun').text))

    database_offers = [
        PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
                   offer['ram_memory'],
                   offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
                   offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
                   offer['image_url'],
                   offer['offer_url'], offer['last_updated'], offer['is_validated'],
                   offer['offer_description'],
                   offer['offer_shop_code'])
        for offer in offers
    ]

    new_offers = []

    for page_number in range(1, 11):
        neptun_url = NEPTUN_BASE_URL + '/mobilni_telefoni.nspx?page=' + str(page_number)
        listing_soup = BeautifulSoup(_fetch_page_source(neptun_url), 'html.parser')

        phones = listing_soup.find('div', {'id': 'mainContainer'}) \
            .find('div', {'class': 'col-lg-9 col-md-9 col-sm-8 col-fix-main'}) \
            .find_all('div', {'class': 'ng-scope product-list-item-grid'})

        for phone in phones:
            new_offers.append(_scrape_offer(phone))

    # Phase 1: add offers that are not stored yet, update prices that changed.
    for new_offer in new_offers:
        stored_matches = [old_offer for old_offer in database_offers
                          if old_offer.offer_shop_code == new_offer.offer_shop_code]

        if not stored_matches:
            print('ADDED')  # ADD OFFER
            print(new_offer)
            requests.post(API_BASE_URL + '/addoffer',
                          headers=JSON_HEADERS, data=json.dumps(new_offer.__dict__, default=str))
            continue

        repriced = [old_offer for old_offer in stored_matches
                    if old_offer.price != new_offer.price]
        if repriced:
            # If several stored offers share the shop code, the last one with
            # a differing price is the one that gets updated.
            offer_id = repriced[-1].offer_id
            print('PRICE CHANGED!')  # CHANGE PRICE
            print('offer id: ' + str(offer_id))
            requests.put(API_BASE_URL + '/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
                         headers=JSON_HEADERS)

    # Separator in the output between the add/update phase and the delete phase.
    print('------------------------------------')

    # Phase 2: delete stored offers that no longer appear on the site.
    scraped_codes = {new_offer.offer_shop_code for new_offer in new_offers}
    for old_offer in database_offers:
        if old_offer.offer_shop_code not in scraped_codes:
            print('OFFER DELETED')
            print(old_offer)
            # DELETE OFFER
            requests.delete(API_BASE_URL + '/deleteoffer/' + str(old_offer.offer_id))
except Exception:
    # Top-level boundary: report the failure and record it for this run.
    traceback.print_exc()
    _record_run_status('failed')
else:
    _record_run_status('success')
Note: See TracBrowser for help on using the repository browser.