source: phonelux_scrappers/scrappers/mobelix_scrapper.py@ 47f4eaf

Last change on this file since 47f4eaf was 47f4eaf, checked in by Marko <Marko@…>, 2 years ago

Final features implemented

  • Property mode set to 100644
File size: 8.7 KB
Line 
1import json
2import sys
3import unicodedata
4from datetime import datetime
5import traceback
6import psycopg2
7import config_read
8from bs4 import BeautifulSoup
9import requests
10
11# import sys
12from classes.phoneoffer import PhoneOffer
13
# Everything the script prints is redirected into this log file.
# The encoding is pinned to UTF-8 so that printing non-ASCII offer text
# (for example Cyrillic) cannot fail with UnicodeEncodeError on systems whose
# default locale encoding is not UTF-8.
file_path = 'outputfile.txt'
sys.stdout = open(file_path, "w", encoding="utf-8")

offer_shop = "Mobelix"  # offer shop
last_updated = datetime.now().date()  # date stamped on offers and on the scrapper_info row
is_validated = False  # passed to every newly scraped PhoneOffer

# Call to read the configuration file and connect to database.
# cinfo is indexed positionally: [0] database, [1] host, [2] user, [3] password.
cinfo = config_read.get_databaseconfig("../postgresdb.config")
db_connection = psycopg2.connect(
    database=cinfo[0],
    host=cinfo[1],
    user=cinfo[2],
    password=cinfo[3]
)
cur = db_connection.cursor()
30
# Seconds to wait on any single HTTP call before giving up; without a timeout
# `requests` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Listing entries whose name contains one of these substrings are skipped.
NON_PHONE_KEYWORDS = ('Watch', 'Pad', 'Tab', 'Pods', 'Buds', 'HomePod')

# Scrape the Mobelix phone listing, then sync it with the offers the local
# phoneoffer API already holds:
#   * offer name not known to the API        -> POST the new offer
#   * offer name known but price differs     -> PUT the new price
#   * API offer whose name was not scraped   -> DELETE it
# Any exception aborts the run and is recorded as 'failed' in scrapper_info;
# otherwise 'success' is recorded. All API writes happen only after the whole
# scrape has finished.
try:
    # Mobelix phone offers that are already in database
    offers_response = requests.get('http://localhost:8080/phoneoffer/shop/mobelix',
                                   timeout=REQUEST_TIMEOUT)
    offers = json.loads(unicodedata.normalize('NFKD', offers_response.text))

    database_offers = [
        PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
                   offer['ram_memory'],
                   offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
                   offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
                   offer['image_url'],
                   offer['offer_url'], offer['last_updated'], offer['is_validated'],
                   offer['offer_description'],
                   offer['offer_shop_code'])
        for offer in offers
    ]

    new_offers = []

    # The listing is paginated; pages 1 through 16 are fetched.
    for i in range(1, 17):
        mobelix_url = "https://mobelix.com.mk/mk/mobilni-telefoni?page=" + str(i)

        response1 = requests.get(mobelix_url, timeout=REQUEST_TIMEOUT)
        soup1 = BeautifulSoup(response1.content, 'html.parser')

        phones = soup1.find_all('div', {'class': 'p-2 rounded text-dark bg-white d-flex w-100'})

        for phone in phones:
            offer_url = phone.find('a').get('href')

            # Of the card's 'col-12' divs, [0] holds the image and [1] holds
            # the brand, the offer name and the price.
            columns = phone.find_all('div', {'class': 'col-12'})
            image_url = columns[0].find('img').get('src')
            details = columns[1]
            brand = details.find('h5', {'class': 'mb-0'}).get_text().strip()
            offer_name = details.find('h3', {'class': 'h5 font-weight-normal'}).get_text().strip()

            if any(keyword in offer_name for keyword in NON_PHONE_KEYWORDS):
                continue

            if brand not in offer_name:
                offer_name = brand + " " + offer_name

            # The price paragraph's text nodes are joined with '/'. When more
            # than one piece is present the second one is used.
            # NOTE(review): presumably the second piece is the discounted
            # price - confirm against the live page.
            temp_prices = details.find('p', {'class': 'h5 price'}).get_text(separator='/').strip()
            price_parts = temp_prices.split('/')
            price_text = price_parts[1] if len(price_parts) > 1 else price_parts[0]
            price = int(float(price_text.replace(',', '').replace('ден', '').strip()))

            # Colors and specifications come from the offer's own page.
            response2 = requests.get(offer_url, timeout=REQUEST_TIMEOUT)
            soup2 = BeautifulSoup(response2.content, 'html.parser')

            colors_divs = soup2.find('div', {'class': 'color-wrapper mt-2 mb-1'}) \
                .find_all('div', {'class': 'color-box d-inline-block'})  # color div tags

            color = ",".join(div.get('title') for div in colors_divs)  # available colors for offer

            tables = soup2.find('div', {'class': 'mobelix-specs table-white bordered-table'}).find_all('table')

            operating_system = None
            chipset = None
            battery = None
            ram_memory = None
            rom_memory = None
            front_camera = ''
            back_camera = ''
            cpu = None
            offer_shop_code = None
            offer_description = None

            for table in tables:
                for cell in table.find_all('td'):
                    spec = cell.get('data-spec')
                    if spec is None:
                        continue

                    spec_text = unicodedata.normalize('NFKD', cell.get_text().strip())

                    if spec == 'os':
                        operating_system = spec_text
                    elif spec == 'chipset':
                        chipset = spec_text
                    elif spec == 'cpu':
                        cpu = spec_text
                    elif spec == 'internalmemory':
                        # Comma-separated variants; within each variant the
                        # first space-separated token goes to ROM and the
                        # second (if any) to RAM.
                        temp_rom = []
                        temp_ram = []
                        for internalmemory in spec_text.split(','):
                            memory_parts = internalmemory.strip().split(' ')
                            temp_rom.append(memory_parts[0])
                            if len(memory_parts) > 1:
                                temp_ram.append(memory_parts[1])
                        rom_memory = ','.join(temp_rom)
                        ram_memory = ','.join(temp_ram)
                    elif spec in ('cam1modules', 'cam1features', 'cam1video'):
                        back_camera += spec_text + '\n'
                    elif spec in ('cam2modules', 'cam2features', 'cam2video'):
                        front_camera += spec_text + '\n'
                    elif spec == 'batdescription1':
                        battery = spec_text

            # Every camera cell is appended with a trailing newline, so the
            # accumulated text must be stripped before comparing; a plain
            # == 'No' comparison could never match.
            if front_camera.strip() == 'No':
                front_camera = None

            if back_camera.strip() == 'No':
                back_camera = None

            # NOTE(review): unlike the database offers above, this call passes
            # no leading id - confirm PhoneOffer accepts both argument forms.
            new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
                                         color, front_camera, back_camera, chipset, battery, operating_system, cpu,
                                         image_url,
                                         offer_url, last_updated, is_validated, offer_description, offer_shop_code))

    # Index the stored offers by name so each scraped offer is matched with a
    # lookup instead of a scan over every stored offer.
    # NOTE(review): stored names went through NFKD normalization (the whole
    # API response is normalized) while scraped names did not - verify that
    # names compare equal when they contain characters NFKD rewrites.
    old_offers_by_name = {}
    for old_offer in database_offers:
        old_offers_by_name.setdefault(old_offer.offer_name, []).append(old_offer)

    headers = {'Content-type': 'application/json'}

    for new_offer in new_offers:
        matching_offers = old_offers_by_name.get(new_offer.offer_name)

        if matching_offers is None:
            print('ADDED')  # ADD OFFER
            print(new_offer)
            requests.post('http://localhost:8080/phoneoffer/addoffer',
                          headers=headers, data=json.dumps(new_offer.__dict__, default=str),
                          timeout=REQUEST_TIMEOUT)
            continue

        # Already in database: update the price only if it changed. When
        # several stored offers share the name, the last one whose price
        # differs is the one updated.
        repriced_offers = [old_offer for old_offer in matching_offers
                           if new_offer.price != old_offer.price]
        if repriced_offers:
            offer_id = repriced_offers[-1].offer_id
            print('PRICE CHANGED!')  # CHANGE PRICE
            print('offer id: ' + str(offer_id))
            requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
                         headers=headers, timeout=REQUEST_TIMEOUT)

    print('------------------------------------')

    # Stored offers whose name no longer appears in the scrape are removed.
    # NOTE(review): if the scrape yields no offers at all, every stored
    # Mobelix offer is deleted here - consider whether that is intended.
    new_offer_names = {new_offer.offer_name for new_offer in new_offers}
    for old_offer in database_offers:
        if old_offer.offer_name not in new_offer_names:
            print('OFFER DELETED')
            print(old_offer)
            # DELETE OFFER
            requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id),
                            timeout=REQUEST_TIMEOUT)
except Exception:
    traceback.print_exc()
    scrape_status = 'failed'
else:
    scrape_status = 'success'

# Record the outcome of this run, then release the database resources even if
# the insert itself fails.
try:
    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
                    ' VALUES (%s, %s, %s);'
    insert_value = (offer_shop, last_updated, scrape_status)
    cur.execute(insert_script, insert_value)
    db_connection.commit()
finally:
    cur.close()
    db_connection.close()
212
Note: See TracBrowser for help on using the repository browser.