source: phonelux_scrappers/scrappers/mobigo_scrapper.py@ 895cd87

Last change on this file since 895cd87 was 895cd87, checked in by Marko <Marko@…>, 21 months ago

Refactored code

  • Property mode set to 100644
File size: 6.7 KB
Line 
1import json
2import unicodedata
3from datetime import datetime
4
5import psycopg2
6import config_read
7from bs4 import BeautifulSoup
8import requests
9import sys
10
11from classes.phoneoffer import PhoneOffer
12
# --- Run configuration --------------------------------------------------------
# Everything the script prints is redirected to this log file instead of the
# console.
file_path = 'outputfile.txt'

# Open the log explicitly as UTF-8. The pages this script scrapes use Cyrillic
# text, so printed offers may contain it; with a non-UTF-8 platform default
# encoding (e.g. cp1252 on Windows) print() would raise UnicodeEncodeError.
# The handle is kept in a named variable so it stays referenced for the whole
# run.
output_file = open(file_path, "w", encoding="utf-8")
sys.stdout = output_file

offer_shop = "Mobi Go"  # offer shop
last_updated = datetime.now().date()  # date stamped on every offer scraped in this run
is_validated = False  # passed as is_validated for every scraped offer
19
# Mobi Go phone offers that are already in database

# A timeout keeps the script from hanging forever when the local API does not
# answer, and raise_for_status() stops the run on an HTTP error instead of
# feeding an error body into json.loads() and the loop below.
offers_response = requests.get('http://localhost:8080/phoneoffer/shop/mobigo', timeout=30)
offers_response.raise_for_status()

# The response text is NFKD-normalized before it is parsed as JSON.
offers = json.loads(unicodedata.normalize('NFKD', offers_response.text))

database_offers = []

# Wrap every JSON record in a PhoneOffer; the argument order below is the
# positional order the constructor is called with throughout this script.
for offer in offers:
    phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
                            offer['ram_memory'],
                            offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
                            offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
                            offer['image_url'],
                            offer['offer_url'], offer['last_updated'], offer['is_validated'],
                            offer['offer_description'],
                            offer['offer_shop_code'])
    database_offers.append(phoneOffer)
36
new_offers = []

# Label text of a specification-table row -> name of the specification that
# the row's <i> cell provides.
_SPEC_LABELS = {
    "Платформа": "operating_system",
    "Chipset": "chipset",
    "Главна Камера": "back_camera",
    "Селфи Камера": "front_camera",
    "Батерија": "battery",
}
# The memory row is special: its value holds ROM first and RAM second.
_MEMORY_LABEL = "Меморија"
# Seconds to wait for mobigo.mk before giving up instead of hanging forever.
_REQUEST_TIMEOUT = 30


def _parse_specifications(rows):
    """Collect the known specifications from the rows of a product table.

    Each row is expected to carry its label in a <td> and its value in an <i>.
    A value of "/" means "not specified" and is stored as None.

    :param rows: the <tr> elements of the specification table
    :return: dict with the keys operating_system, chipset, rom_memory,
             ram_memory, back_camera, front_camera and battery; a key is None
             when the page does not provide that specification
    """
    specs = dict.fromkeys(list(_SPEC_LABELS.values()) + ["rom_memory", "ram_memory"])

    for row in rows:
        label_cell = row.find('td')
        if label_cell is None:
            continue

        label = label_cell.get_text()
        if label != _MEMORY_LABEL and label not in _SPEC_LABELS:
            continue

        value_cell = row.find('i')
        if value_cell is None:
            # A labelled row without a value cell leaves the specification
            # untouched instead of raising AttributeError.
            continue

        raw_value = value_cell.get_text()
        is_missing = raw_value == "/"

        if label == _MEMORY_LABEL:
            # Split on any whitespace so a stray double space cannot shift the
            # tokens; a lone token leaves RAM unset instead of raising
            # IndexError.
            parts = [] if is_missing else raw_value.replace(',', '').split()
            specs["rom_memory"] = parts[0] if len(parts) > 0 else None
            specs["ram_memory"] = parts[1] if len(parts) > 1 else None
        else:
            specs[_SPEC_LABELS[label]] = None if is_missing else raw_value.strip()

    return specs


# Walk listing pages 1-5 and build one PhoneOffer per phone found on them.
for i in range(1, 6):
    mobigo_url = "https://mobigo.mk/page/" + str(i) + "/"

    response1 = requests.get(mobigo_url, timeout=_REQUEST_TIMEOUT)

    soup1 = BeautifulSoup(response1.content, 'html.parser')

    # The phones are taken from the last 'recent-posts' list on the page. A
    # page without such a list still raises IndexError on purpose: carrying on
    # with an incomplete scrape would make the later clean-up pass delete
    # offers that are still on sale.
    phone_sections = soup1.find_all('ul', {'class': 'recent-posts'})
    phones = phone_sections[-1].find_all('li')

    for phone in phones:
        # attrs must be a dict; the previous set literal {'class', 'post-thumb'}
        # only worked by accident and also matched class="class".
        thumb_link = phone.find('div', {'class': 'post-thumb'}).find('a')
        offer_url = thumb_link.get('href')  # offer url
        image_url = thumb_link.find('img').get('src')  # image url

        # First <h2> is the offer name, second <h2> is the price.
        headings = phone.find('div', {'class': 'post-content'}).find_all('h2')
        offer_name = headings[0].get_text().strip()  # offer_name

        if "Watch" in offer_name or "Tab" in offer_name:  # if the product is watch or tablet, continue
            continue

        # Drop the currency suffix and every '.' from the price text before
        # converting it to a whole number.
        price_text = headings[1].get_text().replace('ден.', '').replace('.', '').strip()
        price = int(float(price_text))  # price

        response2 = requests.get(offer_url, timeout=_REQUEST_TIMEOUT)
        soup2 = BeautifulSoup(response2.content, 'html.parser')

        brand = soup2.find('a', {'rel': 'category tag'}).get_text().strip()  # brand

        if brand not in offer_name:
            offer_name = brand + " " + offer_name

        specifications = soup2.find('table', {'id': 'singlet'}).find_all('tr')
        specs = _parse_specifications(specifications)

        ram_memory = specs["ram_memory"]
        rom_memory = specs["rom_memory"]
        battery = specs["battery"]
        back_camera = specs["back_camera"]
        front_camera = specs["front_camera"]
        chipset = specs["chipset"]
        operating_system = specs["operating_system"]

        # Not read from the page; always sent as None.
        cpu = None
        offer_shop_code = None
        offer_description = None
        color = None

        # NOTE(review): this call passes 18 positional arguments while the
        # offers loaded from the database are built with 19 (a leading id) -
        # confirm that PhoneOffer's constructor supports both forms.
        new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
                                     color, front_camera, back_camera, chipset, battery, operating_system, cpu,
                                     image_url,
                                     offer_url, last_updated, is_validated, offer_description, offer_shop_code))
135
136
# Push the scraped offers to the backend: offers whose name is not stored yet
# are added, offers that are already stored only get their price updated when
# it differs.
json_headers = {'Content-type': 'application/json'}

for scraped_offer in new_offers:
    # Stored offers that carry exactly the same name as the scraped one.
    same_name = [stored for stored in database_offers
                 if scraped_offer.offer_name == stored.offer_name]

    if not same_name:
        print('ADDED')  # ADD OFFER
        print(scraped_offer)
        requests.post('http://localhost:8080/phoneoffer/addoffer',
                      headers=json_headers,
                      data=json.dumps(scraped_offer.__dict__, default=str))
        continue

    print('ALREADY IN DATABASE')
    print(scraped_offer)

    # if it's already in database, check PRICE and if it's changed, change it
    repriced = [stored for stored in same_name if scraped_offer.price != stored.price]
    if repriced:
        # When several stored offers share the name, the last one whose price
        # differs is the one that gets updated.
        target_id = repriced[-1].offer_id
        print('PRICE CHANGED!')  # CHANGE PRICE
        print('offer id: ' + str(target_id))
        requests.put('http://localhost:8080/phoneoffer/' + str(target_id) + '/changeprice/'
                     + str(scraped_offer.price),
                     headers=json_headers)
166
print('------------------------------------')

# Clean-up pass: every stored offer whose name was not found in this scrape is
# deleted from the backend.
for stored_offer in database_offers:
    still_listed = any(stored_offer.offer_name == scraped_offer.offer_name
                       for scraped_offer in new_offers)
    if still_listed:
        continue

    print('OFFER DELETED')
    print(stored_offer)
    # DELETE OFFER
    requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(stored_offer.offer_id))
Note: See TracBrowser for help on using the repository browser.