source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java

Last change on this file was df7f390, checked in by Kristijan <kristijanzafirovski26@…>, 2 days ago

Added frontend functionality for changes and refactored code

  • Property mode set to 100644
File size: 16.4 KB
Line 
1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
4import org.openqa.selenium.WebDriver;
5import org.openqa.selenium.WebElement;
6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
15
16import javax.xml.crypto.Data;
17import java.io.File;
18import java.io.IOException;
19import java.sql.Connection;
20import java.sql.DriverManager;
21import java.sql.PreparedStatement;
22import java.sql.SQLException;
23import java.text.ParseException;
24import java.text.SimpleDateFormat;
25import java.util.*;
26import java.util.concurrent.ConcurrentLinkedQueue;
27import java.util.concurrent.CountDownLatch;
28
29public class ScraperThread extends Thread {
30 private String url;
31 private CountDownLatch latch;
32
33 public ScraperThread(String url, CountDownLatch latch) {
34 this.url = url;
35 this.latch = latch;
36 }
37
38 public WebDriver driver;
39
40 private void initializeWebDriver() {
41 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
42 ChromeOptions options = new ChromeOptions();
43 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
44 options.addArguments("--headless");
45 options.addArguments("--disable-gpu");
46 options.addArguments("--remote-allow-origins=*");
47 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
48 driver = new ChromeDriver(options);
49 }
50
51 private void closeWebDriver() {
52 if (driver != null) {
53 driver.quit();
54 }
55 }
56
57 private void connectToWeb(String queryUrl, int numPeople) {
58 driver.get(queryUrl);
59
60 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
61 switch (url) {
62 case "https://booking.escapetravel.mk/":
63 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
64 try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
65 break;
66 case "https://magelantravel.mk/":
67 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
68 break;
69 }
70
71 String pageSource = driver.getPageSource();
72 System.out.println("Connected to " + queryUrl);
73 Document doc = Jsoup.parse(pageSource);
74 Element parentDiv;
75 Elements childDivs;
76
77 switch (url) {
78 case "https://booking.escapetravel.mk/":
79 parentDiv = doc.selectFirst("#hotels-container");
80 if (parentDiv != null) {
81 childDivs = parentDiv.select("a.hotel-item");
82 for (Element div : childDivs) {
83 String data = div.outerHtml();
84 Option option = optionParser(data, numPeople);
85 if (option != null) {
86 option.setId(DatabaseUtil.saveOptionToDatabase(option));
87 scrapeOptionInfo(option);
88 System.out.println("Parsed " + option);
89 }
90 }
91 } else {
92 System.out.println("Parent div not found");
93 }
94 break;
95 case "https://magelantravel.mk/":
96 parentDiv = doc.selectFirst("div.sodrzina");
97 if (parentDiv != null) {
98 childDivs = parentDiv.select("div.destinacija");
99 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
100 System.out.println("Filtered childDivs size: " + childDivs.size());
101 for (Element div : childDivs) {
102 String data = div.outerHtml();
103 Option newOption = optionParser(data, numPeople);
104 if (newOption != null) {
105 newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
106 scrapeOptionInfo(newOption);
107 System.out.println("Parsed " + newOption);
108 }
109 }
110 } else {
111 System.out.println("Parent div not found");
112 }
113 break;
114 default:
115 System.out.println("URL not recognized for parsing.");
116 }
117
118 }
119 private void scrapeOptionInfo(Option option) {
120 String url = option.getLink();
121 if(url.contains("magelantravel.mk")) {
122 System.out.println("Scraping info for " + option.getHotelName());
123 String[] dates = option.getDateRange().split(" - ");
124 url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
125
126 driver.get(url);
127 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
128 String pageSource = driver.getPageSource();
129 Document doc = Jsoup.parse(pageSource);
130 Elements roomOptions = doc.select(".tblroom > tbody > tr");
131 for (Element roomOption : roomOptions) {
132 String type = roomOption.select("a.tblroom-type").text();
133
134 String board = roomOption.select(".rezervacija-objekt").text();
135 if(board.length() > 2) {
136 board = board.substring(0, 2);
137 }
138 if(board.isEmpty() || type.isEmpty()) continue;
139
140 Elements amenityElement = roomOption.select(".objekt-opis");
141 String amenity = (amenityElement != null ? amenityElement.text() : "");
142 System.out.println(amenity + " " + board + " " + type );
143 String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
144 float price;
145 if (!priceText.isEmpty()) {
146 price = Float.parseFloat(priceText);
147 }else continue;
148
149 //Check for changes
150 int odId = checkForChanges(option.getId(), type, board,amenity,price);
151 if(odId != 0) { //true = changes found - update details
152 DatabaseUtil.updateOptionDetails(odId,type,board,amenity,price);
153 }else{ //false = not found / no changes - save regular
154 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenity, price);
155 }
156 }
157 }
158 else if(url.contains("booking.escapetravel.mk")){
159 System.out.println("Scraping info for " + url);
160
161 driver.get(url);
162 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
163 String pageSource = driver.getPageSource();
164 Document doc = Jsoup.parse(pageSource);
165 Elements roomOptions = doc.select("#hotel-rooms-container .hotel-room-row");
166 for(Element roomOption : roomOptions){
167 String type = roomOption.select("td.align-middle").first().text();
168 String board = roomOption.select("td.align-middle.text-primary.lead").text();
169 if (board.isEmpty() || type.isEmpty()) continue;
170 String priceText = roomOption.select("td.align-middle.text-end .text-success.d-block.lead").text().replace("€", "").trim();
171 float price;
172 if (!priceText.isEmpty()) {
173 price = Float.parseFloat(priceText.replace(",", ""));
174 } else continue;
175
176 Elements amenityElements = doc.select("div.row > div.col-6.col-md-3.col-xl-2");
177 StringBuilder amenities = new StringBuilder();
178 for (Element amenityElement : amenityElements) {
179 amenities.append(amenityElement.text()).append(", ");
180 }
181 if (!amenities.isEmpty()) {
182 amenities.setLength(amenities.length() - 2);
183 }
184 System.out.println(type + board + price + amenities);
185 int odId = checkForChanges(option.getId(), type, board,amenities.toString(),price);
186 if(odId != 0) { //true = changes found - update details
187 DatabaseUtil.updateOptionDetails(odId,type,board,amenities.toString(),price);
188 }else{ //false = not found / no changes - save regular
189 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenities.toString(), price);
190 }
191 }
192
193 }
194 }
195 private int checkForChanges(int id, String type, String board, String amenities, float price){ //return true for changes, false for no changes
196 try {
197 List<Option> pooled = DatabaseUtil.poolOptionDetails(id);
198 if (pooled.isEmpty()) { //not saved = no changes - save regular
199 return 0;
200 }else{ //got the options saved details
201 for(Option o : pooled){
202 if(o.getType().equals(type) && o.getBoard().equals(board)){//for the room and board check amenity and price changes (Assumption type of room and board do not change)
203 if((!o.getAmenities().equals(amenities)) || o.getPrice() != price){
204 return o.getDetail_id(); //Change
205 }
206 }
207 }
208 }
209 }catch(SQLException e){
210 e.printStackTrace();
211 }
212 return 0; //no changes detected
213 }
214 private Option optionParser(String data, int numPeople){
215 Document doc = Jsoup.parse(data);
216 Option created = new Option();
217 switch (url) {
218 case "https://magelantravel.mk/":
219 created = parseMagelan(doc);
220 created.setNumPeople(numPeople);
221 break;
222 case "https://booking.escapetravel.mk/":
223 created = parseEscapeTravel(doc);
224 created.setNumPeople(numPeople);
225 break;
226 default:
227 System.out.println("URL not recognized for parsing.");
228 break;
229 }
230 if (created.isEmpty()) {
231 return null;
232 }
233 return created;
234 }
235
236 private Option parseMagelan(Document doc) {
237 Option created = new Option();
238 Element linkElement = doc.selectFirst("div.ponuda-sredina");
239 int id = Integer.parseInt(linkElement.attr("data-id"));
240 int turop = Integer.parseInt(linkElement.attr("data-turop"));
241 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
242 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
243 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
244 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
245 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
246 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
247 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
248 created.setCountry(countryElement != null ? countryElement.text() : null);
249 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
250 created.setDateRange(dateElement != null ? dateElement.text() : null);
251 return created;
252 }
253 private Option parseEscapeTravel(Document doc) {
254 Option created = new Option();
255 Element card = doc.selectFirst("a.hotel-item");
256 String link = card.attr("href");
257 created.setLink(link);
258 created.setImgSrc(card.attr("data-picture"));
259 created.setHotelName(card.attr("data-title"));
260 Element countryP = doc.selectFirst("p.text-info");
261 String country = countryP.text().replaceAll("leto hoteli", "");
262 created.setCountry(country);
263 String[] queryParams = link.split("[?&]");
264 String startDateStr = null;
265 int nights = 0;
266 for (String param : queryParams) {
267 if (param.startsWith("Date=")) {
268 startDateStr = param.split("=")[1];
269 }
270 if (param.startsWith("Nights=")) {
271 nights = Integer.parseInt(param.split("=")[1]);
272 }
273 }
274 if (startDateStr != null && nights > 0)
275 {
276 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
277 try {
278 Date startDate = dateFormat.parse(startDateStr);
279
280 Calendar calendar = Calendar.getInstance();
281 calendar.setTime(startDate);
282 calendar.add(Calendar.DAY_OF_YEAR, nights);
283 Date endDate = calendar.getTime();
284 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
285 created.setDateRange(dateRange);
286 }catch (ParseException e){
287 e.printStackTrace();
288 }
289 }
290 return created;
291 }
292
293 @Override
294 public void run() {
295 System.out.println("Thread started for url: " + url);
296 initializeWebDriver();
297 if ("https://magelantravel.mk/".equals(url)) {
298 ObjectMapper mapper = new ObjectMapper();
299 try {
300 ClassLoader classLoader = getClass().getClassLoader();
301 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
302 JsonNode countries = root.get("countries");
303 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
304 Calendar calendar = Calendar.getInstance();
305 calendar.add(Calendar.DAY_OF_YEAR, 1);
306
307 for (int i = 0; i < 90; i++) { // next three months
308 String date = dateFormat.format(calendar.getTime());
309 for (JsonNode countryNode : countries) {
310 String country = countryNode.asText();
311 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
312 for(int lugje = 1; lugje <= 4; lugje++) {
313 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
314 connectToWeb(queryUrl,lugje);
315 }
316 }
317 }
318 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
319 }
320
321 } catch (IOException e) {
322 e.printStackTrace();
323 }
324 } else if ("https://booking.escapetravel.mk/".equals(url)) {
325 ObjectMapper mapper = new ObjectMapper();
326 try {
327 ClassLoader classLoader = getClass().getClassLoader();
328 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
329 JsonNode countries = root.get("countries");
330 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
331 Calendar calendar = Calendar.getInstance();
332 calendar.add(Calendar.DAY_OF_YEAR, 1);
333
334 for (int i = 0; i < 90; i++) { // next three months
335 String date = dateFormat.format(calendar.getTime());
336 for (JsonNode countryNode : countries) {
337 String country = countryNode.asText();
338 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
339 for(int lugje = 1; lugje <= 4; lugje++) {
340 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
341 connectToWeb(queryUrl,lugje);
342 }
343 }
344 }
345 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
346 }
347 } catch (IOException e) {
348 e.printStackTrace();
349 }
350 } else {
351 // Handle other URLs
352 }
353 closeWebDriver();
354 latch.countDown();
355 }
356
357}
Note: See TracBrowser for help on using the repository browser.