Changeset 47f4eaf for phonelux_scrappers/scrappers/ledikom_scrapper.py
- Timestamp:
- 11/20/22 16:34:52 (2 years ago)
- Branches:
- master
- Parents:
- ffd50db
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
phonelux_scrappers/scrappers/ledikom_scrapper.py
rffd50db r47f4eaf 1 1 import json 2 import traceback 2 3 import unicodedata 3 4 from datetime import datetime … … 19 20 is_validated = False 20 21 21 # Ledikom phone offers that are already in database 22 23 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/ledikom').text)) 24 25 database_offers = [] 26 27 for offer in offers: 28 phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'], 29 offer['ram_memory'], 30 offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'], 31 offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'], 32 offer['image_url'], 33 offer['offer_url'], offer['last_updated'], offer['is_validated'], 34 offer['offer_description'], 35 offer['offer_shop_code']) 36 database_offers.append(phoneOffer) 37 38 new_offers = [] 39 40 ledikom_phone_urls = [ 41 'https://ledikom.mk/c/416/uredi/apple/iphone?limit=96', 42 'https://ledikom.mk/c/421/uredi/samsung/telefoni?limit=96', 43 'https://ledikom.mk/c/424/mobilni-telefoni/xiaomi/telefoni?limit=96', 44 'https://ledikom.mk/c/430/uredi/huawei/telefoni?limit=96', 45 'https://ledikom.mk/c/441/uredi/oneplus/telefoni?limit=96', 46 'https://ledikom.mk/c/413/uredi/google/telefoni?limit=96', 47 'https://ledikom.mk/c/411/uredi/honor/telefoni?limit=96', 48 'https://ledikom.mk/c/460/uredi/nokia/telefoni?limit=96', 49 'https://ledikom.mk/c/461/uredi/asus/telefoni?limit=96', 50 'https://ledikom.mk/c/488/proizvodi/oppo/telefoni?limit=96' 51 ] 52 53 for ledikom_url in ledikom_phone_urls: 54 55 # selenium is used because of the dynamic content of the page 56 driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver') 57 driver1.get(ledikom_url) 58 ledikom_html = driver1.page_source 59 60 # closing the driver so the safari instance can pair with another webdriver session 61 driver1.close() 62 63 soup1 = BeautifulSoup(ledikom_html, 'html.parser') 64 65 phones = soup1.find('div', {'id': 'content'}) \ 66 .find('div', {'class': 'container'}).find('div', {'class': 'row'}).find('div', {'class': 'item-display'}) \ 67 .find_all('div', {'class': 'item-in-grid'}) 68 69 if len(phones) == 0: 70 continue 71 72 for phone in phones: 73 offer_url = 'https://ledikom.mk' + phone.find('a').get('href') 74 image_url = phone.find('a').find('img').get('src') 75 temp_offer_name = phone.find('div', {'class': 'item-name'}).find('a').get_text().strip() 76 offer_name = ' '.join(temp_offer_name.split()) 77 brand = offer_name.split(' ')[0] 78 price = int(phone.find('span', {'class': 'price'}).get_text().replace('ден.', '') 79 .replace('ден', '') 80 .replace('.', '').strip()) 81 22 # Call to read the configuration file and connect to database 23 cinfo = config_read.get_databaseconfig("../postgresdb.config") 24 db_connection = psycopg2.connect( 25 database=cinfo[0], 26 host=cinfo[1], 27 user=cinfo[2], 28 password=cinfo[3] 29 ) 30 cur = db_connection.cursor() 31 32 try: 33 # Ledikom phone offers that are already in database 34 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/ledikom').text)) 35 36 database_offers = [] 37 38 for offer in offers: 39 phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'], 40 offer['ram_memory'], 41 offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'], 42 offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'], 43 offer['image_url'], 44 offer['offer_url'], offer['last_updated'], offer['is_validated'], 45 offer['offer_description'], 46 offer['offer_shop_code']) 47 database_offers.append(phoneOffer) 48 49 new_offers = [] 50 51 ledikom_phone_urls = [ 52 'https://ledikom.mk/c/416/uredi/apple/iphone?limit=96', 53 'https://ledikom.mk/c/421/uredi/samsung/telefoni?limit=96', 54 'https://ledikom.mk/c/424/mobilni-telefoni/xiaomi/telefoni?limit=96', 55 'https://ledikom.mk/c/430/uredi/huawei/telefoni?limit=96', 56 'https://ledikom.mk/c/441/uredi/oneplus/telefoni?limit=96', 57 'https://ledikom.mk/c/413/uredi/google/telefoni?limit=96', 58 'https://ledikom.mk/c/411/uredi/honor/telefoni?limit=96', 59 'https://ledikom.mk/c/460/uredi/nokia/telefoni?limit=96', 60 'https://ledikom.mk/c/461/uredi/asus/telefoni?limit=96', 61 'https://ledikom.mk/c/488/proizvodi/oppo/telefoni?limit=96' 62 ] 63 64 for ledikom_url in ledikom_phone_urls: 65 66 # selenium is used because of the dynamic content of the page 82 67 driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver') 83 driver1.get(offer_url) 84 # getting offer page html 85 offer_html = driver1.page_source 68 driver1.get(ledikom_url) 69 ledikom_html = driver1.page_source 70 71 # closing the driver so the safari instance can pair with another webdriver session 86 72 driver1.close() 87 73 88 soup2 = BeautifulSoup(offer_html, 'html.parser') 89 90 specifications = soup2.find('div', {'id': 'content'}).find('section', {'class': 'padding-section'}) \ 91 .find_all('div', {'class': 'container'})[1].find('div', {'class': 'col-md-7'}) \ 92 .find_all('div', {'class': 'row'}) 93 94 color = None 95 rom_memory = None 96 ram_memory = None 97 back_camera = None 98 operating_system = None 99 chipset = None 100 battery = None 101 cpu = None 102 front_camera = None 103 offer_shop_code = None 104 offer_description = None 105 106 if len(specifications) != 0: 107 colors_tags = specifications[0].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a') 108 temp_colors = [] 109 for color_tag in colors_tags: 110 temp_colors.append(color_tag.get_text().strip()) 111 color = ','.join(temp_colors) 112 113 if len(specifications) >= 2: 114 temp_rom = specifications[1].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a') 115 rom_list = [] 116 for rom in temp_rom: 117 rom_list.append(rom.get('title')) 118 rom_memory = ','.join(rom_list) 119 120 if len(specifications) >= 3: 121 temp_ram = specifications[2].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a') 122 ram_list = [] 123 for ram in temp_ram: 124 ram_list.append(ram.get('title')) 125 126 ram_memory = ','.join(ram_list) 127 128 if 'Xiaomi' in brand: 129 temp = color 130 color = rom_memory 131 rom_memory = temp 132 133 temp = ram_memory 134 ram_memory = color 135 color = temp 136 137 new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory, 138 color, front_camera, back_camera, chipset, battery, operating_system, cpu, 139 image_url, 140 offer_url, last_updated, is_validated, offer_description, offer_shop_code)) 141 142 for new_offer in new_offers: 143 flag = False 144 flag_price = False 145 offer_id = None 74 soup1 = BeautifulSoup(ledikom_html, 'html.parser') 75 76 phones = soup1.find('div', {'id': 'content'}) \ 77 .find('div', {'class': 'container'}).find('div', {'class': 'row'}).find('div', {'class': 'item-display'}) \ 78 .find_all('div', {'class': 'item-in-grid'}) 79 80 if len(phones) == 0: 81 continue 82 83 for phone in phones: 84 offer_url = 'https://ledikom.mk' + phone.find('a').get('href') 85 image_url = phone.find('a').find('img').get('src') 86 temp_offer_name = phone.find('div', {'class': 'item-name'}).find('a').get_text().strip() 87 offer_name = ' '.join(temp_offer_name.split()) 88 brand = offer_name.split(' ')[0] 89 price = int(phone.find('span', {'class': 'price'}).get_text().replace('ден.', '') 90 .replace('ден', '') 91 .replace('.', '').strip()) 92 93 driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver') 94 driver1.get(offer_url) 95 # getting offer page html 96 offer_html = driver1.page_source 97 driver1.close() 98 99 soup2 = BeautifulSoup(offer_html, 'html.parser') 100 101 specifications = soup2.find('div', {'id': 'content'}).find('section', {'class': 'padding-section'}) \ 102 .find_all('div', {'class': 'container'})[1].find('div', {'class': 'col-md-7'}) \ 103 .find_all('div', {'class': 'row'}) 104 105 color = None 106 rom_memory = None 107 ram_memory = None 108 back_camera = None 109 operating_system = None 110 chipset = None 111 battery = None 112 cpu = None 113 front_camera = None 114 offer_shop_code = None 115 offer_description = None 116 117 if len(specifications) != 0: 118 colors_tags = specifications[0].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a') 119 temp_colors = [] 120 for color_tag in colors_tags: 121 temp_colors.append(color_tag.get_text().strip()) 122 color = ','.join(temp_colors) 123 124 if len(specifications) >= 2: 125 temp_rom = specifications[1].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a') 126 rom_list = [] 127 for rom in temp_rom: 128 rom_list.append(rom.get('title')) 129 rom_memory = ','.join(rom_list) 130 131 if len(specifications) >= 3: 132 temp_ram = specifications[2].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a') 133 ram_list = [] 134 for ram in temp_ram: 135 ram_list.append(ram.get('title')) 136 137 ram_memory = ','.join(ram_list) 138 139 if 'Xiaomi' in brand: 140 temp = color 141 color = rom_memory 142 rom_memory = temp 143 144 temp = ram_memory 145 ram_memory = color 146 color = temp 147 148 new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory, 149 color, front_camera, back_camera, chipset, battery, operating_system, cpu, 150 image_url, 151 offer_url, last_updated, is_validated, offer_description, offer_shop_code)) 152 153 for new_offer in new_offers: 154 flag = False 155 flag_price = False 156 offer_id = None 157 158 for old_offer in database_offers: 159 160 if new_offer.offer_name == old_offer.offer_name: 161 flag = True 162 if new_offer.price != old_offer.price: 163 flag_price = True 164 offer_id = old_offer.offer_id 165 166 if flag: 167 # print('ALREADY IN DATABASE') 168 # print(new_offer) 169 # if it's already in database, check PRICE and if it's changed, change it !!!!!! 170 if flag_price: 171 print('PRICE CHANGED!') # CHANGE PRICE 172 print('offer id: ' + str(offer_id)) 173 headers = {'Content-type': 'application/json'} 174 requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price), 175 headers=headers) 176 else: 177 print('ADDED') # ADD OFFER 178 print(new_offer) 179 headers = {'Content-type': 'application/json'} 180 requests.post('http://localhost:8080/phoneoffer/addoffer', 181 headers=headers, data=json.dumps(new_offer.__dict__, default=str)) 182 183 print('------------------------------------') 146 184 147 185 for old_offer in database_offers: 148 149 if new_offer.offer_name == old_offer.offer_name: 150 flag = True 151 if new_offer.price != old_offer.price: 152 flag_price = True 153 offer_id = old_offer.offer_id 154 155 if flag: 156 # print('ALREADY IN DATABASE') 157 # print(new_offer) 158 # if it's already in database, check PRICE and if it's changed, change it !!!!!! 159 if flag_price: 160 print('PRICE CHANGED!') # CHANGE PRICE 161 print('offer id: ' + str(offer_id)) 162 headers = {'Content-type': 'application/json'} 163 requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price), 164 headers=headers) 165 else: 166 print('ADDED') # ADD OFFER 167 print(new_offer) 168 headers = {'Content-type': 'application/json'} 169 requests.post('http://localhost:8080/phoneoffer/addoffer', 170 headers=headers, data=json.dumps(new_offer.__dict__, default=str)) 171 172 print('------------------------------------') 173 174 for old_offer in database_offers: 175 flag = False 176 for new_offer in new_offers: 177 if old_offer.offer_name == new_offer.offer_name: 178 flag = True 179 180 if not flag: 181 print('OFFER DELETED') 182 print(old_offer) 183 # DELETE OFFER 184 requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id)) 186 flag = False 187 for new_offer in new_offers: 188 if old_offer.offer_name == new_offer.offer_name: 189 flag = True 190 191 if not flag: 192 print('OFFER DELETED') 193 print(old_offer) 194 # DELETE OFFER 195 requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id)) 196 except Exception: 197 traceback.print_exc() 198 insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \ 199 ' VALUES (%s, %s, %s);' 200 insert_value = (offer_shop, last_updated, 'failed') 201 cur.execute(insert_script, insert_value) 202 db_connection.commit() 203 cur.close() 204 db_connection.close() 205 else: 206 insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \ 207 ' VALUES (%s, %s, %s);' 208 insert_value = (offer_shop, last_updated, 'success') 209 cur.execute(insert_script, insert_value) 210 db_connection.commit() 211 cur.close() 212 db_connection.close() 213
Note:
See TracChangeset
for help on using the changeset viewer.