Changeset 47f4eaf for phonelux_scrappers/scrappers/mobigo_scrapper.py
- Timestamp: 11/20/22 16:34:52 (2 years ago)
- Branches: master
- Parents: ffd50db
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
phonelux_scrappers/scrappers/mobigo_scrapper.py
rffd50db r47f4eaf 1 1 import json 2 import traceback 2 3 import unicodedata 3 4 from datetime import datetime … … 18 19 is_validated = False 19 20 20 # Mobi Go phone offers that are already in database 21 22 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/mobigo').text)) 23 24 database_offers = [] 25 26 for offer in offers: 27 phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'], 28 offer['ram_memory'], 29 offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'], 30 offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'], 31 offer['image_url'], 32 offer['offer_url'], offer['last_updated'], offer['is_validated'], 33 offer['offer_description'], 34 offer['offer_shop_code']) 35 database_offers.append(phoneOffer) 36 37 new_offers = [] 38 39 40 for i in range(1, 6): 41 mobigo_url = "https://mobigo.mk/page/" + str(i) + "/" 42 43 response1 = requests.get(mobigo_url) 44 45 soup1 = BeautifulSoup(response1.content, 'html.parser') 46 47 phone_sections = soup1.find_all('ul', {'class': 'recent-posts'}) 48 phones = phone_sections[len(phone_sections) - 1].find_all('li') 49 50 for phone in phones: 51 offer_url = phone.find('div', {'class', 'post-thumb'}).find('a').get('href') # offer url 52 image_url = phone.find('div', {'class', 'post-thumb'}).find('a').find('img').get('src') # image url 53 offer_name = phone.find('div', {'class', 'post-content'}).find_all('h2')[0].get_text().strip() # offer_name 54 55 if "Watch" in offer_name or "Tab" in offer_name: # if the product is watch or tablet, continue 56 continue 57 58 price = int(float(phone.find('div', {'class', 'post-content'}).find_all('h2')[1] \ 59 .get_text().replace('ден.', '').replace('.', '').strip())) # price 60 61 response2 = requests.get(offer_url) 62 soup2 = BeautifulSoup(response2.content, 'html.parser') 63 64 brand = soup2.find('a', {'rel': 'category tag'}).get_text().strip() # brand 
65 66 if brand not in offer_name: 67 offer_name = brand + " " + offer_name 68 69 specifications = soup2.find('table', {'id': 'singlet'}).find_all('tr') 70 71 ram_memory = None 72 rom_memory = None 73 battery = None 74 back_camera = None 75 front_camera = None 76 chipset = None 77 operating_system = None 78 cpu = None 79 offer_shop_code = None 80 offer_description = None 81 color = None 82 83 for specification in specifications: 84 if specification.find('td') == None: 21 # Call to read the configuration file and connect to database 22 cinfo = config_read.get_databaseconfig("../postgresdb.config") 23 db_connection = psycopg2.connect( 24 database=cinfo[0], 25 host=cinfo[1], 26 user=cinfo[2], 27 password=cinfo[3] 28 ) 29 cur = db_connection.cursor() 30 31 try: 32 # Mobi Go phone offers that are already in database 33 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/mobigo').text)) 34 35 database_offers = [] 36 37 for offer in offers: 38 phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'], 39 offer['ram_memory'], 40 offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'], 41 offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'], 42 offer['image_url'], 43 offer['offer_url'], offer['last_updated'], offer['is_validated'], 44 offer['offer_description'], 45 offer['offer_shop_code']) 46 database_offers.append(phoneOffer) 47 48 new_offers = [] 49 50 51 for i in range(1, 6): 52 mobigo_url = "https://mobigo.mk/page/" + str(i) + "/" 53 54 response1 = requests.get(mobigo_url) 55 56 soup1 = BeautifulSoup(response1.content, 'html.parser') 57 58 phone_sections = soup1.find_all('ul', {'class': 'recent-posts'}) 59 phones = phone_sections[len(phone_sections) - 1].find_all('li') 60 61 for phone in phones: 62 offer_url = phone.find('div', {'class', 'post-thumb'}).find('a').get('href') # offer url 63 image_url = phone.find('div', {'class', 
'post-thumb'}).find('a').find('img').get('src') # image url 64 offer_name = phone.find('div', {'class', 'post-content'}).find_all('h2')[0].get_text().strip() # offer_name 65 66 if "Watch" in offer_name or "Tab" in offer_name: # if the product is watch or tablet, continue 85 67 continue 86 68 87 # operating system 88 if specification.find('td').get_text() == "Платформа": 89 if specification.find('i').get_text() != "/": 90 operating_system = specification.find('i').get_text().strip() 91 else: 92 operating_system = None 93 94 # chipset 95 if specification.find('td').get_text() == "Chipset": 96 if specification.find('i').get_text() != "/": 97 chipset = specification.find('i').get_text().strip() 98 else: 99 chipset = None 100 101 # ram and rom memory 102 if specification.find('td').get_text() == "Меморија": 103 if specification.find('i').get_text() != "/": 104 rom_memory = specification.find('i').get_text().replace(',', '').split(' ')[0].strip() 105 ram_memory = specification.find('i').get_text().replace(',', '').split(' ')[1].strip() 106 else: 107 rom_memory = None 108 ram_memory = None 109 110 # back camera 111 if specification.find('td').get_text() == "Главна Камера": 112 if specification.find('i').get_text() != "/": 113 back_camera = specification.find('i').get_text().strip() 114 else: 115 back_camera = None 116 117 # front camera 118 if specification.find('td').get_text() == "Селфи Камера": 119 if specification.find('i').get_text() != "/": 120 front_camera = specification.find('i').get_text().strip() 121 else: 122 front_camera = None 123 124 # battery 125 if specification.find('td').get_text() == "Батерија": 126 if specification.find('i').get_text() != "/": 127 battery = specification.find('i').get_text().strip() 128 else: 129 battery = None 130 131 new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory, 132 color, front_camera, back_camera, chipset, battery, operating_system, cpu, 133 image_url, 134 offer_url, last_updated, 
is_validated, offer_description, offer_shop_code)) 135 136 137 for new_offer in new_offers: 138 flag = False 139 flag_price = False 140 offer_id = None 69 price = int(float(phone.find('div', {'class', 'post-content'}).find_all('h2')[1] \ 70 .get_text().replace('ден.', '').replace('.', '').strip())) # price 71 72 response2 = requests.get(offer_url) 73 soup2 = BeautifulSoup(response2.content, 'html.parser') 74 75 brand = soup2.find('a', {'rel': 'category tag'}).get_text().strip() # brand 76 77 if brand not in offer_name: 78 offer_name = brand + " " + offer_name 79 80 specifications = soup2.find('table', {'id': 'singlet'}).find_all('tr') 81 82 ram_memory = None 83 rom_memory = None 84 battery = None 85 back_camera = None 86 front_camera = None 87 chipset = None 88 operating_system = None 89 cpu = None 90 offer_shop_code = None 91 offer_description = None 92 color = None 93 94 for specification in specifications: 95 if specification.find('td') == None: 96 continue 97 98 # operating system 99 if specification.find('td').get_text() == "Платформа": 100 if specification.find('i').get_text() != "/": 101 operating_system = specification.find('i').get_text().strip() 102 else: 103 operating_system = None 104 105 # chipset 106 if specification.find('td').get_text() == "Chipset": 107 if specification.find('i').get_text() != "/": 108 chipset = specification.find('i').get_text().strip() 109 else: 110 chipset = None 111 112 # ram and rom memory 113 if specification.find('td').get_text() == "Меморија": 114 if specification.find('i').get_text() != "/": 115 rom_memory = specification.find('i').get_text().replace(',', '').split(' ')[0].strip() 116 ram_memory = specification.find('i').get_text().replace(',', '').split(' ')[1].strip() 117 else: 118 rom_memory = None 119 ram_memory = None 120 121 # back camera 122 if specification.find('td').get_text() == "Главна Камера": 123 if specification.find('i').get_text() != "/": 124 back_camera = specification.find('i').get_text().strip() 125 
else: 126 back_camera = None 127 128 # front camera 129 if specification.find('td').get_text() == "Селфи Камера": 130 if specification.find('i').get_text() != "/": 131 front_camera = specification.find('i').get_text().strip() 132 else: 133 front_camera = None 134 135 # battery 136 if specification.find('td').get_text() == "Батерија": 137 if specification.find('i').get_text() != "/": 138 battery = specification.find('i').get_text().strip() 139 else: 140 battery = None 141 142 new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory, 143 color, front_camera, back_camera, chipset, battery, operating_system, cpu, 144 image_url, 145 offer_url, last_updated, is_validated, offer_description, offer_shop_code)) 146 147 148 for new_offer in new_offers: 149 flag = False 150 flag_price = False 151 offer_id = None 152 153 for old_offer in database_offers: 154 155 if new_offer.offer_name == old_offer.offer_name: 156 flag = True 157 if new_offer.price != old_offer.price: 158 flag_price = True 159 offer_id = old_offer.offer_id 160 161 if flag: 162 print('ALREADY IN DATABASE') 163 print(new_offer) 164 # if it's already in database, check PRICE and if it's changed, change it !!!!!! 
165 if flag_price: 166 print('PRICE CHANGED!') # CHANGE PRICE 167 print('offer id: ' + str(offer_id)) 168 headers = {'Content-type': 'application/json'} 169 requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price), 170 headers=headers) 171 else: 172 print('ADDED') # ADD OFFER 173 print(new_offer) 174 headers = {'Content-type': 'application/json'} 175 requests.post('http://localhost:8080/phoneoffer/addoffer', 176 headers=headers, data=json.dumps(new_offer.__dict__, default=str)) 177 178 print('------------------------------------') 141 179 142 180 for old_offer in database_offers: 143 144 if new_offer.offer_name == old_offer.offer_name: 145 flag = True 146 if new_offer.price != old_offer.price: 147 flag_price = True 148 offer_id = old_offer.offer_id 149 150 if flag: 151 print('ALREADY IN DATABASE') 152 print(new_offer) 153 # if it's already in database, check PRICE and if it's changed, change it !!!!!! 154 if flag_price: 155 print('PRICE CHANGED!') # CHANGE PRICE 156 print('offer id: ' + str(offer_id)) 157 headers = {'Content-type': 'application/json'} 158 requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price), 159 headers=headers) 160 else: 161 print('ADDED') # ADD OFFER 162 print(new_offer) 163 headers = {'Content-type': 'application/json'} 164 requests.post('http://localhost:8080/phoneoffer/addoffer', 165 headers=headers, data=json.dumps(new_offer.__dict__, default=str)) 166 167 print('------------------------------------') 168 169 for old_offer in database_offers: 170 flag = False 171 for new_offer in new_offers: 172 if old_offer.offer_name == new_offer.offer_name: 173 flag = True 174 175 if not flag: 176 print('OFFER DELETED') 177 print(old_offer) 178 # DELETE OFFER 179 requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id)) 181 flag = False 182 for new_offer in new_offers: 183 if old_offer.offer_name == new_offer.offer_name: 
184 flag = True 185 186 if not flag: 187 print('OFFER DELETED') 188 print(old_offer) 189 # DELETE OFFER 190 requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id)) 191 except Exception: 192 traceback.print_exc() 193 insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \ 194 ' VALUES (%s, %s, %s);' 195 insert_value = (offer_shop, last_updated, 'failed') 196 cur.execute(insert_script, insert_value) 197 db_connection.commit() 198 cur.close() 199 db_connection.close() 200 else: 201 insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \ 202 ' VALUES (%s, %s, %s);' 203 insert_value = (offer_shop, last_updated, 'success') 204 cur.execute(insert_script, insert_value) 205 db_connection.commit() 206 cur.close() 207 db_connection.close()
Note: See TracChangeset for help on using the changeset viewer.