source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java@ 0a7426e

Last change on this file since 0a7426e was 0a7426e, checked in by Kristijan <kristijanzafirovski26@…>, 3 days ago

Added checking for changes - backend

  • Property mode set to 100644
File size: 17.4 KB
Line 
1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
4import org.openqa.selenium.WebDriver;
5import org.openqa.selenium.WebElement;
6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
15
16import javax.xml.crypto.Data;
17import java.io.File;
18import java.io.IOException;
19import java.sql.Connection;
20import java.sql.DriverManager;
21import java.sql.PreparedStatement;
22import java.sql.SQLException;
23import java.text.ParseException;
24import java.text.SimpleDateFormat;
25import java.util.*;
26import java.util.concurrent.ConcurrentLinkedQueue;
27import java.util.concurrent.CountDownLatch;
28
29public class ScraperThread extends Thread {
/** Base URL of the site this thread scrapes; it also selects the site-specific parsing branch. */
private final String url;
/** Queue handed in by the creator of this thread; every newly stored option is added to it. */
private final ConcurrentLinkedQueue<Option> uniqueOptions;
/** Latch handed in by the creator of this thread; {@link #run()} counts it down when it ends. */
private final CountDownLatch latch;
/** Options already accepted by this thread, used to skip duplicates within this thread's crawl. */
private final Set<Option> optionSet;

/**
 * Creates a scraper for a single site.
 *
 * @param url          base URL of the site to scrape
 * @param optionsQueue queue that receives every newly discovered option
 * @param latch        latch that is counted down when this thread finishes
 */
public ScraperThread(String url, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
    this.url = url;
    this.uniqueOptions = optionsQueue;
    this.latch = latch;
    this.optionSet = new HashSet<>();
}

/**
 * Selenium driver used for every page load of this thread. It stays {@code null} until
 * the driver has been initialised. Kept public because code outside this file may read it.
 */
public WebDriver driver;
43
44 private void initializeWebDriver() {
45 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
46 ChromeOptions options = new ChromeOptions();
47 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
48 options.addArguments("--headless");
49 options.addArguments("--disable-gpu");
50 options.addArguments("--remote-allow-origins=*");
51 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
52 driver = new ChromeDriver(options);
53 }
54
55 private void closeWebDriver() {
56 if (driver != null) {
57 driver.quit();
58 }
59 }
60
61 private void connectToWeb(String queryUrl, int numPeople) {
62 driver.get(queryUrl);
63
64 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
65 switch (url) {
66 case "https://booking.escapetravel.mk/":
67 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
68 try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
69 break;
70 case "https://magelantravel.mk/":
71 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
72 break;
73 }
74
75 String pageSource = driver.getPageSource();
76 System.out.println("Connected to " + queryUrl);
77 Document doc = Jsoup.parse(pageSource);
78 Element parentDiv;
79 Elements childDivs;
80
81 switch (url) {
82 case "https://booking.escapetravel.mk/":
83 parentDiv = doc.selectFirst("#hotels-container");
84 if (parentDiv != null) {
85 childDivs = parentDiv.select("a.hotel-item");
86 for (Element div : childDivs) {
87 String data = div.outerHtml();
88 Option option = optionParser(data,numPeople);
89 if (option != null) {
90 Option existingOption = DatabaseUtil.findOption(option);
91 if (existingOption != null) {
92 if (existingOption.equals(option)) {
93 option.setPriceChanged(true);
94 option.setNewPrice(option.getPrice());
95 }
96 DatabaseUtil.updateOptionInDatabase(option);
97 } else if (optionSet.add(option)) {
98 uniqueOptions.add(option);
99 option.setId(DatabaseUtil.saveOptionToDatabase(option));
100 scrapeOptionInfo(option);
101 System.out.println("Parsed " + option);
102 }
103 }
104 }
105 } else {
106 System.out.println("Parent div not found");
107 }
108 break;
109 case "https://magelantravel.mk/":
110 parentDiv = doc.selectFirst("div.sodrzina");
111 if (parentDiv != null) {
112 childDivs = parentDiv.select("div.destinacija");
113 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
114 System.out.println("Filtered childDivs size: " + childDivs.size());
115 for (Element div : childDivs) {
116 String data = div.outerHtml();
117 Option newOption = optionParser(data,numPeople);
118 if (newOption != null) {
119 if (optionSet.add(newOption)) {
120 uniqueOptions.add(newOption);
121
122 newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
123 scrapeOptionInfo(newOption);
124 System.out.println("Parsed " + newOption);
125 }
126 }
127 }
128
129 } else {
130 System.out.println("Parent div not found");
131 }
132 break;
133 default:
134 System.out.println("URL not recognized for parsing.");
135 }
136 }
137 private void scrapeOptionInfo(Option option) {
138 String url = option.getLink();
139 if(url.contains("magelantravel.mk")) {
140 System.out.println("Scraping info for " + option.getHotelName());
141 String[] dates = option.getDateRange().split(" - ");
142 url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
143
144 driver.get(url);
145 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
146 String pageSource = driver.getPageSource();
147 Document doc = Jsoup.parse(pageSource);
148 Elements roomOptions = doc.select(".tblroom > tbody > tr");
149 for (Element roomOption : roomOptions) {
150 String type = roomOption.select("a.tblroom-type").text();
151
152 String board = roomOption.select(".rezervacija-objekt").text();
153 if(board.length() > 2) {
154 board = board.substring(0, 2);
155 }
156 if(board.isEmpty() || type.isEmpty()) continue;
157
158 Elements amenityElement = roomOption.select(".objekt-opis");
159 String amenity = (amenityElement != null ? amenityElement.text() : "");
160 System.out.println(amenity + " " + board + " " + type );
161 String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
162 float price;
163 if (!priceText.isEmpty()) {
164 price = Float.parseFloat(priceText);
165 }else continue;
166
167 //Check for changes
168 int odId = checkForChanges(option.getId(), type, board,amenity,price);
169 if(odId != 0) { //true = changes found - update details
170 DatabaseUtil.updateOptionDetails(odId,type,board,amenity,price);
171 }else{ //false = not found / no changes - save regular
172 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenity, price);
173 }
174 }
175 }
176 else if(url.contains("booking.escapetravel.mk")){
177 System.out.println("Scraping info for " + url);
178
179 driver.get(url);
180 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
181 String pageSource = driver.getPageSource();
182 Document doc = Jsoup.parse(pageSource);
183 Elements roomOptions = doc.select("#hotel-rooms-container .hotel-room-row");
184 for(Element roomOption : roomOptions){
185 String type = roomOption.select("td.align-middle").first().text();
186 String board = roomOption.select("td.align-middle.text-primary.lead").text();
187 if (board.isEmpty() || type.isEmpty()) continue;
188 String priceText = roomOption.select("td.align-middle.text-end .text-success.d-block.lead").text().replace("€", "").trim();
189 float price;
190 if (!priceText.isEmpty()) {
191 price = Float.parseFloat(priceText.replace(",", ""));
192 } else continue;
193
194 Elements amenityElements = doc.select("div.row > div.col-6.col-md-3.col-xl-2");
195 StringBuilder amenities = new StringBuilder();
196 for (Element amenityElement : amenityElements) {
197 amenities.append(amenityElement.text()).append(", ");
198 }
199 if (!amenities.isEmpty()) {
200 amenities.setLength(amenities.length() - 2);
201 }
202 System.out.println(type + board + price + amenities);
203 int odId = checkForChanges(option.getId(), type, board,amenities.toString(),price);
204 if(odId != 0) { //true = changes found - update details
205 DatabaseUtil.updateOptionDetails(odId,type,board,amenities.toString(),price);
206 }else{ //false = not found / no changes - save regular
207 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenities.toString(), price);
208 }
209 }
210
211 }
212 }
213 private int checkForChanges(int id, String type, String board, String amenities, float price){ //return true for changes, false for no changes
214 try {
215 List<Option> pooled = DatabaseUtil.poolOptionDetails(id);
216 if (pooled.isEmpty()) { //not saved = no changes - save regular
217 return 0;
218 }else{ //got the options saved details
219 for(Option o : pooled){
220 if(o.getType().equals(type) && o.getBoard().equals(board)){//for the room and board check amenity and price changes (Assumption type of room and board do not change)
221 if((!o.getAmenities().equals(amenities)) || o.getPrice() != price){
222 return o.getDetail_id(); //Change
223 }
224 }
225 }
226 }
227 }catch(SQLException e){
228 e.printStackTrace();
229 }
230 return 0; //no changes detected
231 }
232 private Option optionParser(String data, int numPeople){
233 Document doc = Jsoup.parse(data);
234 Option created = new Option();
235 switch (url) {
236 case "https://magelantravel.mk/":
237 created = parseMagelan(doc);
238 created.setNumPeople(numPeople);
239 break;
240 case "https://booking.escapetravel.mk/":
241 created = parseEscapeTravel(doc);
242 created.setNumPeople(numPeople);
243 break;
244 default:
245 System.out.println("URL not recognized for parsing.");
246 break;
247 }
248 if (created.isEmpty()) {
249 return null;
250 }
251 return created;
252 }
253
254 private Option parseMagelan(Document doc) {
255 Option created = new Option();
256 Element linkElement = doc.selectFirst("div.ponuda-sredina");
257 int id = Integer.parseInt(linkElement.attr("data-id"));
258 int turop = Integer.parseInt(linkElement.attr("data-turop"));
259 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
260 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
261 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
262 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
263 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
264 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
265 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
266 created.setCountry(countryElement != null ? countryElement.text() : null);
267 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
268 created.setDateRange(dateElement != null ? dateElement.text() : null);
269 return created;
270 }
271 private Option parseEscapeTravel(Document doc) {
272 Option created = new Option();
273 Element card = doc.selectFirst("a.hotel-item");
274 String link = card.attr("href");
275 created.setLink(link);
276 created.setImgSrc(card.attr("data-picture"));
277 created.setHotelName(card.attr("data-title"));
278 Element countryP = doc.selectFirst("p.text-info");
279 String country = countryP.text().replaceAll("leto hoteli", "");
280 created.setCountry(country);
281 String[] queryParams = link.split("[?&]");
282 String startDateStr = null;
283 int nights = 0;
284 for (String param : queryParams) {
285 if (param.startsWith("Date=")) {
286 startDateStr = param.split("=")[1];
287 }
288 if (param.startsWith("Nights=")) {
289 nights = Integer.parseInt(param.split("=")[1]);
290 }
291 }
292 if (startDateStr != null && nights > 0)
293 {
294 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
295 try {
296 Date startDate = dateFormat.parse(startDateStr);
297
298 Calendar calendar = Calendar.getInstance();
299 calendar.setTime(startDate);
300 calendar.add(Calendar.DAY_OF_YEAR, nights);
301 Date endDate = calendar.getTime();
302 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
303 created.setDateRange(dateRange);
304 }catch (ParseException e){
305 e.printStackTrace();
306 }
307 }
308 return created;
309 }
310
311 @Override
312 public void run() {
313 System.out.println("Thread started for url: " + url);
314 initializeWebDriver();
315 if ("https://magelantravel.mk/".equals(url)) {
316 ObjectMapper mapper = new ObjectMapper();
317 try {
318 ClassLoader classLoader = getClass().getClassLoader();
319 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
320 JsonNode countries = root.get("countries");
321 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
322 Calendar calendar = Calendar.getInstance();
323 calendar.add(Calendar.DAY_OF_YEAR, 1);
324
325 for (int i = 0; i < 90; i++) { // next three months
326 String date = dateFormat.format(calendar.getTime());
327 for (JsonNode countryNode : countries) {
328 String country = countryNode.asText();
329 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
330 for(int lugje = 1; lugje <= 4; lugje++) {
331 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
332 connectToWeb(queryUrl,lugje);
333 }
334 }
335 }
336 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
337 }
338
339 } catch (IOException e) {
340 e.printStackTrace();
341 }
342 } else if ("https://booking.escapetravel.mk/".equals(url)) {
343 ObjectMapper mapper = new ObjectMapper();
344 try {
345 ClassLoader classLoader = getClass().getClassLoader();
346 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
347 JsonNode countries = root.get("countries");
348 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
349 Calendar calendar = Calendar.getInstance();
350 calendar.add(Calendar.DAY_OF_YEAR, 1);
351
352 for (int i = 0; i < 90; i++) { // next three months
353 String date = dateFormat.format(calendar.getTime());
354 for (JsonNode countryNode : countries) {
355 String country = countryNode.asText();
356 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
357 for(int lugje = 1; lugje <= 4; lugje++) {
358 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
359 connectToWeb(queryUrl,lugje);
360 }
361 }
362 }
363 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
364 }
365 } catch (IOException e) {
366 e.printStackTrace();
367 }
368 } else {
369 // Handle other URLs
370 }
371 closeWebDriver();
372 latch.countDown();
373 }
374
375}
Note: See TracBrowser for help on using the repository browser.