Changeset 47f4eaf for phonelux_scrappers/scrappers/mobelix_scrapper.py
- Timestamp: 11/20/22 16:34:52 (2 years ago)
- Branches: master
- Parents: ffd50db
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
phonelux_scrappers/scrappers/mobelix_scrapper.py
rffd50db r47f4eaf 3 3 import unicodedata 4 4 from datetime import datetime 5 5 import traceback 6 6 import psycopg2 7 7 import config_read … … 19 19 is_validated = False 20 20 21 # Mobelix phone offers that are already in database 22 23 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/mobelix').text)) 24 25 database_offers = [] 26 27 for offer in offers: 28 phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'], 29 offer['ram_memory'], 30 offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'], 31 offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'], 32 offer['image_url'], 33 offer['offer_url'], offer['last_updated'], offer['is_validated'], 34 offer['offer_description'], 35 offer['offer_shop_code']) 36 database_offers.append(phoneOffer) 37 38 new_offers = [] 39 40 for i in range(1, 17): 41 mobelix_url = "https://mobelix.com.mk/mk/mobilni-telefoni?page=" + str(i) 42 43 response1 = requests.get(mobelix_url) 44 soup1 = BeautifulSoup(response1.content, 'html.parser') 45 46 phones = soup1.find_all('div', {'class': 'p-2 rounded text-dark bg-white d-flex w-100'}) 47 48 for phone in phones: 49 offer_url = phone.find('a').get('href') 50 image_url = phone.find_all('div', {'class': 'col-12'})[0].find('img').get('src') 51 brand = phone.find_all('div', {'class': 'col-12'})[1].find('h5', {'class': 'mb-0'}).get_text().strip() 52 offer_name = phone.find_all('div', {'class': 'col-12'})[1] \ 53 .find('h3', {'class': 'h5 font-weight-normal'}).get_text().strip() 54 55 if 'Watch' in offer_name or 'Pad' in offer_name or 'Tab' in offer_name or 'Pods' in offer_name or 'Buds' in offer_name or 'HomePod' in offer_name: 56 continue 57 58 if brand not in offer_name: 59 offer_name = brand + " " + offer_name 60 61 temp_prices = phone.find_all('div', {'class': 'col-12'})[1] \ 62 .find('p', {'class': 'h5 price'}).get_text(separator='/').strip() 63 64 if 
len(temp_prices.split('/')) > 1: 65 price = int(float(temp_prices.split('/')[1].replace(',', '').replace('ден', '').strip())) 21 # Call to read the configuration file and connect to database 22 cinfo = config_read.get_databaseconfig("../postgresdb.config") 23 db_connection = psycopg2.connect( 24 database=cinfo[0], 25 host=cinfo[1], 26 user=cinfo[2], 27 password=cinfo[3] 28 ) 29 cur = db_connection.cursor() 30 31 try: 32 # Mobelix phone offers that are already in database 33 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/mobelix').text)) 34 35 database_offers = [] 36 37 for offer in offers: 38 phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'], 39 offer['ram_memory'], 40 offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'], 41 offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'], 42 offer['image_url'], 43 offer['offer_url'], offer['last_updated'], offer['is_validated'], 44 offer['offer_description'], 45 offer['offer_shop_code']) 46 database_offers.append(phoneOffer) 47 48 new_offers = [] 49 50 for i in range(1, 17): 51 mobelix_url = "https://mobelix.com.mk/mk/mobilni-telefoni?page=" + str(i) 52 53 response1 = requests.get(mobelix_url) 54 soup1 = BeautifulSoup(response1.content, 'html.parser') 55 56 phones = soup1.find_all('div', {'class': 'p-2 rounded text-dark bg-white d-flex w-100'}) 57 58 for phone in phones: 59 offer_url = phone.find('a').get('href') 60 image_url = phone.find_all('div', {'class': 'col-12'})[0].find('img').get('src') 61 brand = phone.find_all('div', {'class': 'col-12'})[1].find('h5', {'class': 'mb-0'}).get_text().strip() 62 offer_name = phone.find_all('div', {'class': 'col-12'})[1] \ 63 .find('h3', {'class': 'h5 font-weight-normal'}).get_text().strip() 64 65 if 'Watch' in offer_name or 'Pad' in offer_name or 'Tab' in offer_name or 'Pods' in offer_name or 'Buds' in offer_name or 'HomePod' in 
offer_name: 66 continue 67 68 if brand not in offer_name: 69 offer_name = brand + " " + offer_name 70 71 temp_prices = phone.find_all('div', {'class': 'col-12'})[1] \ 72 .find('p', {'class': 'h5 price'}).get_text(separator='/').strip() 73 74 if len(temp_prices.split('/')) > 1: 75 price = int(float(temp_prices.split('/')[1].replace(',', '').replace('ден', '').strip())) 76 else: 77 price = int(float(temp_prices.split('/')[0].replace(',', '').replace('ден', '').strip())) 78 79 response2 = requests.get(offer_url) 80 soup2 = BeautifulSoup(response2.content, 'html.parser') 81 82 colors_divs = soup2.find('div', {'class': 'color-wrapper mt-2 mb-1'}) \ 83 .find_all('div', {'class': 'color-box d-inline-block'}) # color div tags 84 85 temp_colors = [] 86 for div in colors_divs: 87 temp_colors.append(div.get('title')) 88 89 color = ",".join(temp_colors) # available colors for offer 90 91 tables = soup2.find('div', {'class': 'mobelix-specs table-white bordered-table'}).find_all('table') 92 93 operating_system = None 94 chipset = None 95 battery = None 96 ram_memory = None 97 rom_memory = None 98 front_camera = '' 99 back_camera = '' 100 cpu = None 101 offer_shop_code = None 102 offer_description = None 103 104 for table in tables: 105 for cell in table.find_all('td'): 106 if cell.get('data-spec') is None: 107 continue 108 109 if cell.get('data-spec') == 'os': 110 operating_system = unicodedata.normalize('NFKD', cell.get_text().strip()) 111 112 if cell.get('data-spec') == 'chipset': 113 chipset = unicodedata.normalize('NFKD', cell.get_text().strip()) 114 115 if cell.get('data-spec') == 'cpu': 116 cpu = unicodedata.normalize('NFKD', cell.get_text().strip()) 117 118 if cell.get('data-spec') == 'internalmemory': 119 temp_rom = [] 120 temp_ram = [] 121 temp_internalmemory = unicodedata.normalize('NFKD', cell.get_text().strip()) 122 for internalmemory in temp_internalmemory.split(','): 123 temp_rom.append(internalmemory.strip().split(' ')[0]) 124 if len(internalmemory.strip().split(' 
')) > 1: 125 temp_ram.append(internalmemory.strip().split(' ')[1]) 126 rom_memory = ','.join(temp_rom) 127 ram_memory = ','.join(temp_ram) 128 129 if cell.get('data-spec') == 'cam1modules' or cell.get('data-spec') == 'cam1features' or cell.get( 130 'data-spec') == 'cam1video': 131 back_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n' 132 133 if cell.get('data-spec') == 'cam2modules' or cell.get('data-spec') == 'cam2features' or cell.get( 134 'data-spec') == 'cam2video': 135 front_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n' 136 137 if cell.get('data-spec') == 'batdescription1': 138 battery = unicodedata.normalize('NFKD', cell.get_text().strip()) 139 140 if front_camera == 'No': 141 front_camera = None 142 143 if back_camera == 'No': 144 back_camera = None 145 146 new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory, 147 color, front_camera, back_camera, chipset, battery, operating_system, cpu, 148 image_url, 149 offer_url, last_updated, is_validated, offer_description, offer_shop_code)) 150 151 152 for new_offer in new_offers: 153 flag = False 154 flag_price = False 155 offer_id = None 156 157 for old_offer in database_offers: 158 159 if new_offer.offer_name == old_offer.offer_name: 160 flag = True 161 if new_offer.price != old_offer.price: 162 flag_price = True 163 offer_id = old_offer.offer_id 164 165 if flag: 166 # print('ALREADY IN DATABASE') 167 # print(new_offer) 168 # if it's already in database, check PRICE and if it's changed, change it !!!!!! 
169 if flag_price: 170 print('PRICE CHANGED!') # CHANGE PRICE 171 print('offer id: ' + str(offer_id)) 172 headers = {'Content-type': 'application/json'} 173 requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price), 174 headers=headers) 66 175 else: 67 price = int(float(temp_prices.split('/')[0].replace(',', '').replace('ден', '').strip())) 68 69 response2 = requests.get(offer_url) 70 soup2 = BeautifulSoup(response2.content, 'html.parser') 71 72 colors_divs = soup2.find('div', {'class': 'color-wrapper mt-2 mb-1'}) \ 73 .find_all('div', {'class': 'color-box d-inline-block'}) # color div tags 74 75 temp_colors = [] 76 for div in colors_divs: 77 temp_colors.append(div.get('title')) 78 79 color = ",".join(temp_colors) # available colors for offer 80 81 tables = soup2.find('div', {'class': 'mobelix-specs table-white bordered-table'}).find_all('table') 82 83 operating_system = None 84 chipset = None 85 battery = None 86 ram_memory = None 87 rom_memory = None 88 front_camera = '' 89 back_camera = '' 90 cpu = None 91 offer_shop_code = None 92 offer_description = None 93 94 for table in tables: 95 for cell in table.find_all('td'): 96 if cell.get('data-spec') is None: 97 continue 98 99 if cell.get('data-spec') == 'os': 100 operating_system = unicodedata.normalize('NFKD', cell.get_text().strip()) 101 102 if cell.get('data-spec') == 'chipset': 103 chipset = unicodedata.normalize('NFKD', cell.get_text().strip()) 104 105 if cell.get('data-spec') == 'cpu': 106 cpu = unicodedata.normalize('NFKD', cell.get_text().strip()) 107 108 if cell.get('data-spec') == 'internalmemory': 109 temp_rom = [] 110 temp_ram = [] 111 temp_internalmemory = unicodedata.normalize('NFKD', cell.get_text().strip()) 112 for internalmemory in temp_internalmemory.split(','): 113 temp_rom.append(internalmemory.strip().split(' ')[0]) 114 if len(internalmemory.strip().split(' ')) > 1: 115 temp_ram.append(internalmemory.strip().split(' ')[1]) 116 rom_memory = 
','.join(temp_rom) 117 ram_memory = ','.join(temp_ram) 118 119 if cell.get('data-spec') == 'cam1modules' or cell.get('data-spec') == 'cam1features' or cell.get( 120 'data-spec') == 'cam1video': 121 back_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n' 122 123 if cell.get('data-spec') == 'cam2modules' or cell.get('data-spec') == 'cam2features' or cell.get( 124 'data-spec') == 'cam2video': 125 front_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n' 126 127 if cell.get('data-spec') == 'batdescription1': 128 battery = unicodedata.normalize('NFKD', cell.get_text().strip()) 129 130 if front_camera == 'No': 131 front_camera = None 132 133 if back_camera == 'No': 134 back_camera = None 135 136 new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory, 137 color, front_camera, back_camera, chipset, battery, operating_system, cpu, 138 image_url, 139 offer_url, last_updated, is_validated, offer_description, offer_shop_code)) 140 141 142 for new_offer in new_offers: 143 flag = False 144 flag_price = False 145 offer_id = None 176 print('ADDED') # ADD OFFER 177 print(new_offer) 178 headers = {'Content-type': 'application/json'} 179 requests.post('http://localhost:8080/phoneoffer/addoffer', 180 headers=headers, data=json.dumps(new_offer.__dict__, default=str)) 181 182 print('------------------------------------') 146 183 147 184 for old_offer in database_offers: 148 149 if new_offer.offer_name == old_offer.offer_name: 150 flag = True 151 if new_offer.price != old_offer.price: 152 flag_price = True 153 offer_id = old_offer.offer_id 154 155 if flag: 156 # print('ALREADY IN DATABASE') 157 # print(new_offer) 158 # if it's already in database, check PRICE and if it's changed, change it !!!!!! 
159 if flag_price: 160 print('PRICE CHANGED!') # CHANGE PRICE 161 print('offer id: ' + str(offer_id)) 162 headers = {'Content-type': 'application/json'} 163 requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price), 164 headers=headers) 165 else: 166 print('ADDED') # ADD OFFER 167 print(new_offer) 168 headers = {'Content-type': 'application/json'} 169 requests.post('http://localhost:8080/phoneoffer/addoffer', 170 headers=headers, data=json.dumps(new_offer.__dict__, default=str)) 171 172 print('------------------------------------') 173 174 for old_offer in database_offers: 175 flag = False 176 for new_offer in new_offers: 177 if old_offer.offer_name == new_offer.offer_name: 178 flag = True 179 180 if not flag: 181 print('OFFER DELETED') 182 print(old_offer) 183 # DELETE OFFER 184 requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id)) 185 flag = False 186 for new_offer in new_offers: 187 if old_offer.offer_name == new_offer.offer_name: 188 flag = True 189 190 if not flag: 191 print('OFFER DELETED') 192 print(old_offer) 193 # DELETE OFFER 194 requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id)) 195 except Exception: 196 traceback.print_exc() 197 insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \ 198 ' VALUES (%s, %s, %s);' 199 insert_value = (offer_shop, last_updated, 'failed') 200 cur.execute(insert_script, insert_value) 201 db_connection.commit() 202 cur.close() 203 db_connection.close() 204 else: 205 insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \ 206 ' VALUES (%s, %s, %s);' 207 insert_value = (offer_shop, last_updated, 'success') 208 cur.execute(insert_script, insert_value) 209 db_connection.commit() 210 cur.close() 211 db_connection.close() 212
Note: See TracChangeset for help on using the changeset viewer.