source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java@ cd64b06

Last change on this file since cd64b06 was cd64b06, checked in by Kristijan <kristijanzafirovski26@…>, 3 weeks ago

Added info scraping for escape travel

  • Property mode set to 100644
File size: 16.3 KB
Line 
1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
4import org.openqa.selenium.WebDriver;
5import org.openqa.selenium.WebElement;
6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
15
16import java.io.File;
17import java.io.IOException;
18import java.sql.Connection;
19import java.sql.DriverManager;
20import java.sql.PreparedStatement;
21import java.sql.SQLException;
22import java.text.ParseException;
23import java.text.SimpleDateFormat;
24import java.util.*;
25import java.util.concurrent.ConcurrentLinkedQueue;
26import java.util.concurrent.CountDownLatch;
27
28public class ScraperThread extends Thread {
29 private String url;
30 private ConcurrentLinkedQueue<Option> uniqueOptions;
31 private CountDownLatch latch;
32 private Set<Option> optionSet;
33
34 public ScraperThread(String url, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
35 this.url = url;
36 this.uniqueOptions = optionsQueue;
37 this.latch = latch;
38 this.optionSet = new HashSet<>();
39 }
40
41 public WebDriver driver;
42
43 private void initializeWebDriver() {
44 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
45 ChromeOptions options = new ChromeOptions();
46 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
47 options.addArguments("--headless");
48 options.addArguments("--disable-gpu");
49 options.addArguments("--remote-allow-origins=*");
50 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
51 driver = new ChromeDriver(options);
52 }
53
54 private void closeWebDriver() {
55 if (driver != null) {
56 driver.quit();
57 }
58 }
59
60 private void connectToWeb(String queryUrl, int numPeople) {
61 driver.get(queryUrl);
62
63 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
64 switch (url) {
65 case "https://booking.escapetravel.mk/":
66 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
67 try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
68 break;
69 case "https://magelantravel.mk/":
70 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
71 break;
72 }
73
74 String pageSource = driver.getPageSource();
75 System.out.println("Connected to " + queryUrl);
76 Document doc = Jsoup.parse(pageSource);
77 Element parentDiv;
78 Elements childDivs;
79
80 switch (url) {
81 case "https://booking.escapetravel.mk/":
82 parentDiv = doc.selectFirst("#hotels-container");
83 if (parentDiv != null) {
84 childDivs = parentDiv.select("a.hotel-item");
85 for (Element div : childDivs) {
86 String data = div.outerHtml();
87 Option option = optionParser(data,numPeople);
88 if (option != null) {
89 Option existingOption = DatabaseUtil.findOption(option);
90 if (existingOption != null) {
91 if (existingOption.equals(option)) {
92 option.setPriceChanged(true);
93 option.setNewPrice(option.getPrice());
94 }
95 DatabaseUtil.updateOptionInDatabase(option);
96 } else if (optionSet.add(option)) {
97 uniqueOptions.add(option);
98 option.setId(DatabaseUtil.saveOptionToDatabase(option));
99 scrapeOptionInfo(option);
100 System.out.println("Parsed " + option);
101 }
102 }
103 }
104 } else {
105 System.out.println("Parent div not found");
106 }
107 break;
108 case "https://magelantravel.mk/":
109 parentDiv = doc.selectFirst("div.sodrzina");
110 if (parentDiv != null) {
111 childDivs = parentDiv.select("div.destinacija");
112 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
113 System.out.println("Filtered childDivs size: " + childDivs.size());
114 for (Element div : childDivs) {
115 String data = div.outerHtml();
116 Option newOption = optionParser(data,numPeople);
117 if (newOption != null) {
118 if (optionSet.add(newOption)) {
119 uniqueOptions.add(newOption);
120
121 newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
122 scrapeOptionInfo(newOption);
123 System.out.println("Parsed " + newOption);
124 }
125 }
126 }
127
128 } else {
129 System.out.println("Parent div not found");
130 }
131 break;
132 default:
133 System.out.println("URL not recognized for parsing.");
134 }
135 }
136 private void scrapeOptionInfo(Option option) {
137 String url = option.getLink();
138 if(url.contains("magelantravel.mk")) {
139 System.out.println("Scraping info for " + option.getHotelName());
140 String[] dates = option.getDateRange().split(" - ");
141 url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
142
143 driver.get(url);
144 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
145 String pageSource = driver.getPageSource();
146 Document doc = Jsoup.parse(pageSource);
147 Elements roomOptions = doc.select(".tblroom > tbody > tr");
148 for (Element roomOption : roomOptions) {
149 String type = roomOption.select("a.tblroom-type").text();
150
151 String board = roomOption.select(".rezervacija-objekt").text();
152 if(board.length() > 2) {
153 board = board.substring(0, 2);
154 }
155 if(board.isEmpty() || type.isEmpty()) continue;
156
157 Elements amenityElement = roomOption.select(".objekt-opis");
158 String amenity = (amenityElement != null ? amenityElement.text() : "");
159 System.out.println(amenity + " " + board + " " + type );
160 String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
161 float price;
162 if (!priceText.isEmpty()) {
163 price = Float.parseFloat(priceText);
164 }else continue;
165
166 DatabaseUtil.saveOptionDetails(option.getId(), type,board,amenity, price);
167 }
168 }
169 else if(url.contains("booking.escapetravel.mk")){
170 System.out.println("Scraping info for " + url);
171
172 driver.get(url);
173 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
174 String pageSource = driver.getPageSource();
175 Document doc = Jsoup.parse(pageSource);
176 Elements roomOptions = doc.select("#hotel-rooms-container .hotel-room-row");
177 for(Element roomOption : roomOptions){
178 String type = roomOption.select("td.align-middle").first().text();
179 String board = roomOption.select("td.align-middle.text-primary.lead").text();
180 if (board.isEmpty() || type.isEmpty()) continue;
181 String priceText = roomOption.select("td.align-middle.text-end .text-success.d-block.lead").text().replace("€", "").trim();
182 float price;
183 if (!priceText.isEmpty()) {
184 price = Float.parseFloat(priceText.replace(",", ""));
185 } else continue;
186
187 Elements amenityElements = doc.select("div.row > div.col-6.col-md-3.col-xl-2");
188 StringBuilder amenities = new StringBuilder();
189 for (Element amenityElement : amenityElements) {
190 amenities.append(amenityElement.text()).append(", ");
191 }
192 if (!amenities.isEmpty()) {
193 amenities.setLength(amenities.length() - 2);
194 }
195 System.out.println(type + board + price + amenities);
196 DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenities.toString(), price);
197 }
198
199 }
200 }
201 private Option optionParser(String data, int numPeople){
202 Document doc = Jsoup.parse(data);
203 Option created = new Option();
204 switch (url) {
205 case "https://magelantravel.mk/":
206 created = parseMagelan(doc);
207 created.setNumPeople(numPeople);
208 break;
209 case "https://booking.escapetravel.mk/":
210 created = parseEscapeTravel(doc);
211 created.setNumPeople(numPeople);
212 break;
213 default:
214 System.out.println("URL not recognized for parsing.");
215 break;
216 }
217 if (created.isEmpty()) {
218 return null;
219 }
220 //scrapeOptionInfo(created);
221 return created;
222 }
223
224 private Option parseMagelan(Document doc) {
225 Option created = new Option();
226 Element linkElement = doc.selectFirst("div.ponuda-sredina");
227 int id = Integer.parseInt(linkElement.attr("data-id"));
228 int turop = Integer.parseInt(linkElement.attr("data-turop"));
229 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
230 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
231 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
232 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
233 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
234 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
235 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
236 created.setCountry(countryElement != null ? countryElement.text() : null);
237 //Element priceElement = doc.selectFirst("div.ponuda-cena");
238 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
239 created.setDateRange(dateElement != null ? dateElement.text() : null);
240 /*float price = Float.parseFloat(priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0");
241 created.setPrice(price);*/
242 return created;
243 }
244 private Option parseEscapeTravel(Document doc) {
245 Option created = new Option();
246 Element card = doc.selectFirst("a.hotel-item");
247 String link = card.attr("href");
248 created.setLink(link);
249 created.setImgSrc(card.attr("data-picture"));
250 created.setHotelName(card.attr("data-title"));
251 Element countryP = doc.selectFirst("p.text-info");
252 String country = countryP.text().replaceAll("leto hoteli", "");
253 created.setCountry(country);
254 /*Element priceElem = doc.selectFirst("span.hotel-price");
255 String priceText = priceElem.text();
256 float price = 0;
257 if(!priceText.isEmpty()) {
258 price = Float.parseFloat(priceText.replace("€", ""));
259 }
260 created.setPrice(price);*/
261 String[] queryParams = link.split("[?&]");
262 String startDateStr = null;
263 int nights = 0;
264 for (String param : queryParams) {
265 if (param.startsWith("Date=")) {
266 startDateStr = param.split("=")[1];
267 }
268 if (param.startsWith("Nights=")) {
269 nights = Integer.parseInt(param.split("=")[1]);
270 }
271 }
272 if (startDateStr != null && nights > 0)
273 {
274 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
275 try {
276 Date startDate = dateFormat.parse(startDateStr);
277
278 Calendar calendar = Calendar.getInstance();
279 calendar.setTime(startDate);
280 calendar.add(Calendar.DAY_OF_YEAR, nights);
281 Date endDate = calendar.getTime();
282 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
283 created.setDateRange(dateRange);
284 }catch (ParseException e){
285 e.printStackTrace();
286 }
287 }
288 return created;
289 }
290
291 @Override
292 public void run() {
293 System.out.println("Thread started for url: " + url);
294 initializeWebDriver();
295 if ("https://magelantravel.mk/".equals(url)) {
296 ObjectMapper mapper = new ObjectMapper();
297 try {
298 ClassLoader classLoader = getClass().getClassLoader();
299 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
300 JsonNode countries = root.get("countries");
301 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
302 Calendar calendar = Calendar.getInstance();
303 calendar.add(Calendar.DAY_OF_YEAR, 1);
304
305 for (int i = 0; i < 90; i++) { // next three months
306 String date = dateFormat.format(calendar.getTime());
307 for (JsonNode countryNode : countries) {
308 String country = countryNode.asText();
309 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
310 for(int lugje = 1; lugje <= 4; lugje++) {
311 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
312 connectToWeb(queryUrl,lugje);
313 }
314 }
315 }
316 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
317 }
318
319 } catch (IOException e) {
320 e.printStackTrace();
321 }
322 } else if ("https://booking.escapetravel.mk/".equals(url)) {
323 ObjectMapper mapper = new ObjectMapper();
324 try {
325 ClassLoader classLoader = getClass().getClassLoader();
326 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
327 JsonNode countries = root.get("countries");
328 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
329 Calendar calendar = Calendar.getInstance();
330 calendar.add(Calendar.DAY_OF_YEAR, 1);
331
332 for (int i = 0; i < 90; i++) { // next three months
333 String date = dateFormat.format(calendar.getTime());
334 for (JsonNode countryNode : countries) {
335 String country = countryNode.asText();
336 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
337 for(int lugje = 1; lugje <= 4; lugje++) {
338 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
339 connectToWeb(queryUrl,lugje);
340 }
341 }
342 }
343 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
344 }
345 } catch (IOException e) {
346 e.printStackTrace();
347 }
348 } else {
349 // Handle other URLs
350 }
351 closeWebDriver();
352 latch.countDown();
353 }
354
355}
Note: See TracBrowser for help on using the repository browser.