Changeset 47f4eaf for phonelux_scrappers/scrappers/neptun_scrapper.py
- Timestamp:
- 11/20/22 16:34:52 (2 years ago)
- Branches:
- master
- Parents:
- ffd50db
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
phonelux_scrappers/scrappers/neptun_scrapper.py
# phonelux_scrappers/scrappers/neptun_scrapper.py as of r47f4eaf (parent rffd50db),
# reconstructed from the changeset view.
import json
import traceback
import unicodedata
from datetime import datetime

# ... new lines 5-19 are not shown in the changeset view and are unchanged.
# The code below relies on names presumably defined there (requests,
# BeautifulSoup, webdriver, psycopg2, config_read, PhoneOffer, offer_shop,
# last_updated) -- verify against the full file.

is_validated = False


def _fetch_page_source(url):
    """Return the HTML of *url* as rendered by a Safari WebDriver session.

    selenium is used because of the dynamic content of the page.
    """
    driver = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    try:
        driver.get(url)
        return driver.page_source
    finally:
        # End the session even when loading fails, so the safari instance
        # can pair with another webdriver session.
        driver.quit()


def _parse_specifications(specifications):
    """Extract the known fields from the specification lines of an offer.

    Returns a dict with the keys battery, cpu, chipset, ram_memory,
    rom_memory and operating_system; a field that is not found stays None.
    """
    parsed = {
        'battery': None,
        'cpu': None,
        'chipset': None,
        'ram_memory': None,
        'rom_memory': None,
        'operating_system': None,
    }

    for specification in specifications:
        if 'Батерија:' in specification:
            parsed['battery'] = specification.split('Батерија:')[1]

        if 'CPU:' in specification:
            parsed['cpu'] = specification.split('CPU:')[1]

        if 'Chipset:' in specification:
            parsed['chipset'] = specification.split('Chipset:')[1]

        # The long labels are checked first and end the iteration for this
        # line, so the short 'ROM:' / 'RAM:' checks below never see it.
        if 'RAM Меморија:' in specification:
            parsed['ram_memory'] = specification.split('RAM Меморија:')[1]
            continue

        if 'ROM Меморија:' in specification:
            parsed['rom_memory'] = specification.split('ROM Меморија:')[1]
            continue

        if 'ROM:' in specification:
            parsed['rom_memory'] = specification.split('ROM:')[1]

        if 'RAM:' in specification:
            parsed['ram_memory'] = specification.split('RAM:')[1]

        if 'iOS' in specification or 'Android' in specification:
            parsed['operating_system'] = specification

    return parsed


def _scrape_offer(phone):
    """Build a PhoneOffer from one product element of a listing page.

    The name, image and price come from the listing element itself; the shop
    code and the specifications come from the offer's own page, which is
    fetched here.
    """
    link = phone.find('a')
    offer_url = 'https://www.neptun.mk' + link.get('href')
    offer_name = link.find('h2').get_text().replace('MOB.TEL.', '').strip()
    image_url = 'https://www.neptun.mk' + link.find('div', {'class': 'row'}).find('img').get('src')
    # '.' is stripped from the price text before it is converted to int.
    price = int(
        phone.find('div', {'class': 'col-sm-12 static'})
        .find('div', {'class': 'product-list-item__prices pt35'})
        .find('div', {'class': 'row'})
        .find('div', {'class': 'newPriceModel'})
        .find('span', {'class': 'product-price__amount--value ng-binding'})
        .get_text().replace('.', ''))

    details = BeautifulSoup(_fetch_page_source(offer_url), 'html.parser')

    offer_shop_code = details.find('div', {'ng-if': 'showProductDetails'}) \
        .find('div', {'class': 'product-details-first-row'}) \
        .find('span', {'ng-bind': 'model.CodeNumber'}).get_text().strip()

    # The specifications are taken from the last <ul> of the details section.
    specifications_table = details.find('div', {'id': 'mainContainer'}) \
        .find('div', {'ng-if': 'showProductDetails'}).find_all('ul')[-1]
    offer_description = specifications_table.get_text(separator='\n').strip()
    specs = _parse_specifications(offer_description.split('\n'))

    # These fields are never filled in by this scrapper.
    color = None
    front_camera = None
    back_camera = None

    return PhoneOffer(offer_shop, offer_name, price, specs['ram_memory'], specs['rom_memory'],
                      color, front_camera, back_camera, specs['chipset'], specs['battery'],
                      specs['operating_system'], specs['cpu'],
                      image_url,
                      offer_url, last_updated, is_validated, offer_description, offer_shop_code)


def _scrape_listing_page(page_number):
    """Return the offers found on one page of the Neptun phone listing."""
    neptun_url = 'https://www.neptun.mk/mobilni_telefoni.nspx?page=' + str(page_number)
    soup = BeautifulSoup(_fetch_page_source(neptun_url), 'html.parser')

    phones = soup.find('div', {'id': 'mainContainer'}) \
        .find('div', {'class': 'col-lg-9 col-md-9 col-sm-8 col-fix-main'}) \
        .find_all('div', {'class': 'ng-scope product-list-item-grid'})

    return [_scrape_offer(phone) for phone in phones]


def _load_database_offers():
    """Return the Neptun phone offers that are already in database."""
    response = requests.get('http://localhost:8080/phoneoffer/shop/neptun')
    offers = json.loads(unicodedata.normalize('NFKD', response.text))

    return [
        PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
                   offer['ram_memory'],
                   offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
                   offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
                   offer['image_url'],
                   offer['offer_url'], offer['last_updated'], offer['is_validated'],
                   offer['offer_description'],
                   offer['offer_shop_code'])
        for offer in offers
    ]


def _sync_offers(new_offers, database_offers):
    """Send the differences between scraped and stored offers to the local API.

    Offers are matched on offer_shop_code: a scraped offer with no stored
    match is added, a match with a different price gets a price change, and
    a stored offer that was not scraped is deleted.
    """
    headers = {'Content-type': 'application/json'}

    # Index the stored offers once instead of rescanning them per new offer.
    stored_by_code = {}
    for old_offer in database_offers:
        stored_by_code.setdefault(old_offer.offer_shop_code, []).append(old_offer)

    for new_offer in new_offers:
        matches = stored_by_code.get(new_offer.offer_shop_code)

        if matches:
            # if it's already in database, check PRICE and if it's changed, change it
            repriced = [old_offer for old_offer in matches if new_offer.price != old_offer.price]
            if repriced:
                # NOTE(review): indentation was lost in the changeset view; this
                # assumes offer_id was assigned inside the price check (last
                # differing match wins) -- confirm. Identical when shop codes
                # are unique.
                offer_id = repriced[-1].offer_id
                print('PRICE CHANGED!')  # CHANGE PRICE
                print('offer id: ' + str(offer_id))
                requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
                             headers=headers)
        else:
            print('ADDED')  # ADD OFFER
            print(new_offer)
            requests.post('http://localhost:8080/phoneoffer/addoffer',
                          headers=headers, data=json.dumps(new_offer.__dict__, default=str))

        # NOTE(review): assumed to be printed once per scraped offer; the
        # nesting level is not recoverable from the changeset view -- confirm.
        print('------------------------------------')

    scraped_codes = {new_offer.offer_shop_code for new_offer in new_offers}

    for old_offer in database_offers:
        if old_offer.offer_shop_code not in scraped_codes:
            print('OFFER DELETED')
            print(old_offer)
            # DELETE OFFER
            requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id))


def _record_run(status):
    """Insert one scrapper_info row with the outcome of this run and commit it."""
    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
                    ' VALUES (%s, %s, %s);'
    cur.execute(insert_script, (offer_shop, last_updated, status))
    db_connection.commit()


# Call to read the configuration file and connect to database
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3]
)
cur = db_connection.cursor()

try:
    try:
        database_offers = _load_database_offers()

        new_offers = []
        for i in range(1, 11):
            new_offers.extend(_scrape_listing_page(i))

        _sync_offers(new_offers, database_offers)
    except Exception:
        # Top-level boundary of the script: report the failure and record it.
        traceback.print_exc()
        _record_run('failed')
    else:
        _record_run('success')
finally:
    # Release the database resources on every exit path, including a failing
    # status insert.
    try:
        cur.close()
    finally:
        db_connection.close()
Note:
See TracChangeset
for help on using the changeset viewer.