source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java

Last change on this file was df7f390, checked in by Kristijan <kristijanzafirovski26@…>, 2 days ago

Added frontend functionality for changes and refactored code

  • Property mode set to 100644
File size: 16.4 KB
Rev Line
[c164f8f]1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
[d4d8f61]4import org.openqa.selenium.WebDriver;
[c164f8f]5import org.openqa.selenium.WebElement;
[d4d8f61]6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
[c164f8f]12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
[d4d8f61]15
[0a7426e]16import javax.xml.crypto.Data;
[c164f8f]17import java.io.File;
18import java.io.IOException;
19import java.sql.Connection;
20import java.sql.DriverManager;
21import java.sql.PreparedStatement;
22import java.sql.SQLException;
23import java.text.ParseException;
24import java.text.SimpleDateFormat;
25import java.util.*;
[d4d8f61]26import java.util.concurrent.ConcurrentLinkedQueue;
27import java.util.concurrent.CountDownLatch;
28
29public class ScraperThread extends Thread {
// Base URL of the site this thread scrapes; it is compared against the two known site URLs
// to choose wait conditions, selectors and the query format.
30 private String url;
// Latch that run() counts down as its last statement.
31 private CountDownLatch latch;
32
/**
 * Creates a scraper thread for a single travel site.
 *
 * @param url   base URL of the site to scrape; {@code run()} only crawls when this is
 *              {@code https://magelantravel.mk/} or {@code https://booking.escapetravel.mk/}
 * @param latch latch that {@code run()} counts down as its final step
 */
public ScraperThread(String url, CountDownLatch latch) {
    this.latch = latch;
    this.url = url;
}
37
// Browser session used for every page load in this thread; assigned in initializeWebDriver()
// and quit in closeWebDriver().
[1c51912]38 public WebDriver driver;
[c164f8f]39
40 private void initializeWebDriver() {
41 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
[d4d8f61]42 ChromeOptions options = new ChromeOptions();
[c164f8f]43 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
44 options.addArguments("--headless");
[d4d8f61]45 options.addArguments("--disable-gpu");
[c164f8f]46 options.addArguments("--remote-allow-origins=*");
47 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
48 driver = new ChromeDriver(options);
49 }
50
51 private void closeWebDriver() {
52 if (driver != null) {
53 driver.quit();
54 }
55 }
56
[53bad7e]57 private void connectToWeb(String queryUrl, int numPeople) {
[c164f8f]58 driver.get(queryUrl);
59
60 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
61 switch (url) {
62 case "https://booking.escapetravel.mk/":
63 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
[53bad7e]64 try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
[c164f8f]65 break;
66 case "https://magelantravel.mk/":
67 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
68 break;
69 }
70
71 String pageSource = driver.getPageSource();
72 System.out.println("Connected to " + queryUrl);
73 Document doc = Jsoup.parse(pageSource);
74 Element parentDiv;
75 Elements childDivs;
76
77 switch (url) {
78 case "https://booking.escapetravel.mk/":
79 parentDiv = doc.selectFirst("#hotels-container");
80 if (parentDiv != null) {
81 childDivs = parentDiv.select("a.hotel-item");
82 for (Element div : childDivs) {
83 String data = div.outerHtml();
[df7f390]84 Option option = optionParser(data, numPeople);
[c164f8f]85 if (option != null) {
[df7f390]86 option.setId(DatabaseUtil.saveOptionToDatabase(option));
87 scrapeOptionInfo(option);
88 System.out.println("Parsed " + option);
[d4d8f61]89 }
90 }
[c164f8f]91 } else {
92 System.out.println("Parent div not found");
93 }
94 break;
95 case "https://magelantravel.mk/":
96 parentDiv = doc.selectFirst("div.sodrzina");
97 if (parentDiv != null) {
98 childDivs = parentDiv.select("div.destinacija");
99 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
100 System.out.println("Filtered childDivs size: " + childDivs.size());
101 for (Element div : childDivs) {
102 String data = div.outerHtml();
[df7f390]103 Option newOption = optionParser(data, numPeople);
[c164f8f]104 if (newOption != null) {
[df7f390]105 newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
106 scrapeOptionInfo(newOption);
107 System.out.println("Parsed " + newOption);
[d4d8f61]108 }
109 }
[df7f390]110 } else {
[c164f8f]111 System.out.println("Parent div not found");
112 }
113 break;
114 default:
115 System.out.println("URL not recognized for parsing.");
[d4d8f61]116 }
[df7f390]117
[d4d8f61]118 }
[1c51912]119 private void scrapeOptionInfo(Option option) {
120 String url = option.getLink();
121 if(url.contains("magelantravel.mk")) {
122 System.out.println("Scraping info for " + option.getHotelName());
123 String[] dates = option.getDateRange().split(" - ");
124 url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
[d4d8f61]125
[1c51912]126 driver.get(url);
127 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
128 String pageSource = driver.getPageSource();
129 Document doc = Jsoup.parse(pageSource);
130 Elements roomOptions = doc.select(".tblroom > tbody > tr");
131 for (Element roomOption : roomOptions) {
132 String type = roomOption.select("a.tblroom-type").text();
[c164f8f]133
[1c51912]134 String board = roomOption.select(".rezervacija-objekt").text();
[cd64b06]135 if(board.length() > 2) {
136 board = board.substring(0, 2);
[1c51912]137 }
[cd64b06]138 if(board.isEmpty() || type.isEmpty()) continue;
139
[1c51912]140 Elements amenityElement = roomOption.select(".objekt-opis");
141 String amenity = (amenityElement != null ? amenityElement.text() : "");
142 System.out.println(amenity + " " + board + " " + type );
143 String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
144 float price;
145 if (!priceText.isEmpty()) {
146 price = Float.parseFloat(priceText);
147 }else continue;
[c164f8f]148
[0a7426e]149 //Check for changes
150 int odId = checkForChanges(option.getId(), type, board,amenity,price);
151 if(odId != 0) { //true = changes found - update details
152 DatabaseUtil.updateOptionDetails(odId,type,board,amenity,price);
153 }else{ //false = not found / no changes - save regular
154 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenity, price);
155 }
[1c51912]156 }
157 }
[cd64b06]158 else if(url.contains("booking.escapetravel.mk")){
159 System.out.println("Scraping info for " + url);
160
161 driver.get(url);
162 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
163 String pageSource = driver.getPageSource();
164 Document doc = Jsoup.parse(pageSource);
165 Elements roomOptions = doc.select("#hotel-rooms-container .hotel-room-row");
166 for(Element roomOption : roomOptions){
167 String type = roomOption.select("td.align-middle").first().text();
168 String board = roomOption.select("td.align-middle.text-primary.lead").text();
169 if (board.isEmpty() || type.isEmpty()) continue;
170 String priceText = roomOption.select("td.align-middle.text-end .text-success.d-block.lead").text().replace("€", "").trim();
171 float price;
172 if (!priceText.isEmpty()) {
173 price = Float.parseFloat(priceText.replace(",", ""));
174 } else continue;
175
176 Elements amenityElements = doc.select("div.row > div.col-6.col-md-3.col-xl-2");
177 StringBuilder amenities = new StringBuilder();
178 for (Element amenityElement : amenityElements) {
179 amenities.append(amenityElement.text()).append(", ");
180 }
181 if (!amenities.isEmpty()) {
182 amenities.setLength(amenities.length() - 2);
183 }
184 System.out.println(type + board + price + amenities);
[0a7426e]185 int odId = checkForChanges(option.getId(), type, board,amenities.toString(),price);
186 if(odId != 0) { //true = changes found - update details
187 DatabaseUtil.updateOptionDetails(odId,type,board,amenities.toString(),price);
188 }else{ //false = not found / no changes - save regular
189 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenities.toString(), price);
190 }
[cd64b06]191 }
192
193 }
[1c51912]194 }
[0a7426e]195 private int checkForChanges(int id, String type, String board, String amenities, float price){ //return true for changes, false for no changes
196 try {
197 List<Option> pooled = DatabaseUtil.poolOptionDetails(id);
198 if (pooled.isEmpty()) { //not saved = no changes - save regular
199 return 0;
200 }else{ //got the options saved details
201 for(Option o : pooled){
202 if(o.getType().equals(type) && o.getBoard().equals(board)){//for the room and board check amenity and price changes (Assumption type of room and board do not change)
203 if((!o.getAmenities().equals(amenities)) || o.getPrice() != price){
204 return o.getDetail_id(); //Change
205 }
206 }
207 }
208 }
209 }catch(SQLException e){
210 e.printStackTrace();
211 }
212 return 0; //no changes detected
213 }
[1c51912]214 private Option optionParser(String data, int numPeople){
[d4d8f61]215 Document doc = Jsoup.parse(data);
216 Option created = new Option();
217 switch (url) {
[c164f8f]218 case "https://magelantravel.mk/":
219 created = parseMagelan(doc);
[53bad7e]220 created.setNumPeople(numPeople);
[d4d8f61]221 break;
222 case "https://booking.escapetravel.mk/":
223 created = parseEscapeTravel(doc);
[53bad7e]224 created.setNumPeople(numPeople);
[d4d8f61]225 break;
226 default:
227 System.out.println("URL not recognized for parsing.");
228 break;
229 }
230 if (created.isEmpty()) {
231 return null;
232 }
233 return created;
234 }
235
[c164f8f]236 private Option parseMagelan(Document doc) {
[d4d8f61]237 Option created = new Option();
[c164f8f]238 Element linkElement = doc.selectFirst("div.ponuda-sredina");
239 int id = Integer.parseInt(linkElement.attr("data-id"));
240 int turop = Integer.parseInt(linkElement.attr("data-turop"));
241 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
242 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
243 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
244 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
245 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
[d4d8f61]246 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
[c164f8f]247 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
[d4d8f61]248 created.setCountry(countryElement != null ? countryElement.text() : null);
[c164f8f]249 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
250 created.setDateRange(dateElement != null ? dateElement.text() : null);
[d4d8f61]251 return created;
252 }
253 private Option parseEscapeTravel(Document doc) {
254 Option created = new Option();
[c164f8f]255 Element card = doc.selectFirst("a.hotel-item");
256 String link = card.attr("href");
257 created.setLink(link);
258 created.setImgSrc(card.attr("data-picture"));
259 created.setHotelName(card.attr("data-title"));
260 Element countryP = doc.selectFirst("p.text-info");
[53bad7e]261 String country = countryP.text().replaceAll("leto hoteli", "");
262 created.setCountry(country);
[c164f8f]263 String[] queryParams = link.split("[?&]");
264 String startDateStr = null;
265 int nights = 0;
266 for (String param : queryParams) {
267 if (param.startsWith("Date=")) {
268 startDateStr = param.split("=")[1];
269 }
270 if (param.startsWith("Nights=")) {
271 nights = Integer.parseInt(param.split("=")[1]);
272 }
273 }
274 if (startDateStr != null && nights > 0)
275 {
276 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
277 try {
278 Date startDate = dateFormat.parse(startDateStr);
[d4d8f61]279
[c164f8f]280 Calendar calendar = Calendar.getInstance();
281 calendar.setTime(startDate);
282 calendar.add(Calendar.DAY_OF_YEAR, nights);
283 Date endDate = calendar.getTime();
284 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
285 created.setDateRange(dateRange);
286 }catch (ParseException e){
287 e.printStackTrace();
288 }
289 }
[d4d8f61]290 return created;
291 }
292
293 @Override
[c164f8f]294 public void run() {
295 System.out.println("Thread started for url: " + url);
296 initializeWebDriver();
297 if ("https://magelantravel.mk/".equals(url)) {
298 ObjectMapper mapper = new ObjectMapper();
299 try {
300 ClassLoader classLoader = getClass().getClassLoader();
301 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
302 JsonNode countries = root.get("countries");
303 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
304 Calendar calendar = Calendar.getInstance();
305 calendar.add(Calendar.DAY_OF_YEAR, 1);
306
307 for (int i = 0; i < 90; i++) { // next three months
308 String date = dateFormat.format(calendar.getTime());
309 for (JsonNode countryNode : countries) {
310 String country = countryNode.asText();
311 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
[53bad7e]312 for(int lugje = 1; lugje <= 4; lugje++) {
313 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
314 connectToWeb(queryUrl,lugje);
315 }
[c164f8f]316 }
317 }
318 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
319 }
320
321 } catch (IOException e) {
322 e.printStackTrace();
323 }
324 } else if ("https://booking.escapetravel.mk/".equals(url)) {
325 ObjectMapper mapper = new ObjectMapper();
326 try {
327 ClassLoader classLoader = getClass().getClassLoader();
328 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
[53bad7e]329 JsonNode countries = root.get("countries");
[c164f8f]330 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
331 Calendar calendar = Calendar.getInstance();
332 calendar.add(Calendar.DAY_OF_YEAR, 1);
333
334 for (int i = 0; i < 90; i++) { // next three months
335 String date = dateFormat.format(calendar.getTime());
336 for (JsonNode countryNode : countries) {
337 String country = countryNode.asText();
338 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
[53bad7e]339 for(int lugje = 1; lugje <= 4; lugje++) {
340 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
341 connectToWeb(queryUrl,lugje);
342 }
[c164f8f]343 }
[d4d8f61]344 }
[c164f8f]345 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
346 }
347 } catch (IOException e) {
348 e.printStackTrace();
[d4d8f61]349 }
[c164f8f]350 } else {
351 // Handle other URLs
[d4d8f61]352 }
[c164f8f]353 closeWebDriver();
354 latch.countDown();
[d4d8f61]355 }
[c164f8f]356
357}
Note: See TracBrowser for help on using the repository browser.