source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java@ 1c51912

Last change on this file since 1c51912 was 1c51912, checked in by Kristijan <kristijanzafirovski26@…>, 5 days ago

Added details for magelan

  • Property mode set to 100644
File size: 14.6 KB
RevLine 
[c164f8f]1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
[d4d8f61]4import org.openqa.selenium.WebDriver;
[c164f8f]5import org.openqa.selenium.WebElement;
[d4d8f61]6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
[c164f8f]12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
[d4d8f61]15
[c164f8f]16import java.io.File;
17import java.io.IOException;
18import java.sql.Connection;
19import java.sql.DriverManager;
20import java.sql.PreparedStatement;
21import java.sql.SQLException;
22import java.text.ParseException;
23import java.text.SimpleDateFormat;
24import java.util.*;
[d4d8f61]25import java.util.concurrent.ConcurrentLinkedQueue;
26import java.util.concurrent.CountDownLatch;
27
28public class ScraperThread extends Thread {
/** Base URL of the travel site handled by this thread; it also selects the site-specific branches in this class. */
private final String url;

/** Queue supplied by the caller; every newly discovered option is added to it. */
private final ConcurrentLinkedQueue<Option> uniqueOptions;

/** Latch supplied by the caller; counted down once at the end of {@link #run()}. */
private final CountDownLatch latch;

/** Options this thread has already accepted, used to skip duplicates within one thread. */
private final Set<Option> optionSet;

/**
 * Creates a scraper for a single travel site.
 *
 * @param url          base URL of the site to scrape
 * @param optionsQueue queue that receives every new option found by this thread
 * @param latch        latch counted down when {@link #run()} finishes
 */
public ScraperThread(String url, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
    this.url = url;
    this.uniqueOptions = optionsQueue;
    this.latch = latch;
    this.optionSet = new HashSet<>();
}

/**
 * Browser session used for all page loads of this thread. It is assigned in
 * {@code initializeWebDriver()} and quit in {@code closeWebDriver()}.
 * The field stays public so that existing code reading it keeps compiling.
 */
public WebDriver driver;
[c164f8f]42
43 private void initializeWebDriver() {
44 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
[d4d8f61]45 ChromeOptions options = new ChromeOptions();
[c164f8f]46 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
47 options.addArguments("--headless");
[d4d8f61]48 options.addArguments("--disable-gpu");
[c164f8f]49 options.addArguments("--remote-allow-origins=*");
50 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
51 driver = new ChromeDriver(options);
52 }
53
54 private void closeWebDriver() {
55 if (driver != null) {
56 driver.quit();
57 }
58 }
59
[53bad7e]60 private void connectToWeb(String queryUrl, int numPeople) {
[c164f8f]61 driver.get(queryUrl);
62
63 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
64 switch (url) {
65 case "https://booking.escapetravel.mk/":
66 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
[53bad7e]67 try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
[c164f8f]68 break;
69 case "https://magelantravel.mk/":
70 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
71 break;
72 }
73
74 String pageSource = driver.getPageSource();
75 System.out.println("Connected to " + queryUrl);
76 Document doc = Jsoup.parse(pageSource);
77 Element parentDiv;
78 Elements childDivs;
79
80 switch (url) {
81 case "https://booking.escapetravel.mk/":
82 parentDiv = doc.selectFirst("#hotels-container");
83 if (parentDiv != null) {
84 childDivs = parentDiv.select("a.hotel-item");
85 for (Element div : childDivs) {
86 String data = div.outerHtml();
[53bad7e]87 Option option = optionParser(data,numPeople);
[c164f8f]88 if (option != null) {
89 Option existingOption = DatabaseUtil.findOption(option);
90 if (existingOption != null) {
[1c51912]91 if (existingOption.equals(option)) {
[c164f8f]92 option.setPriceChanged(true);
93 option.setNewPrice(option.getPrice());
[d4d8f61]94 }
[c164f8f]95 DatabaseUtil.updateOptionInDatabase(option);
96 } else if (optionSet.add(option)) {
97 uniqueOptions.add(option);
98 DatabaseUtil.saveOptionToDatabase(option);
99 System.out.println("Parsed " + option);
[d4d8f61]100 }
101 }
102 }
[c164f8f]103 } else {
104 System.out.println("Parent div not found");
105 }
106 break;
107 case "https://magelantravel.mk/":
108 parentDiv = doc.selectFirst("div.sodrzina");
109 if (parentDiv != null) {
110 childDivs = parentDiv.select("div.destinacija");
111 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
112 System.out.println("Filtered childDivs size: " + childDivs.size());
113 for (Element div : childDivs) {
114 String data = div.outerHtml();
[53bad7e]115 Option newOption = optionParser(data,numPeople);
[c164f8f]116 if (newOption != null) {
[1c51912]117 if (optionSet.add(newOption)) {
[c164f8f]118 uniqueOptions.add(newOption);
[1c51912]119
120 newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
121 scrapeOptionInfo(newOption);
[c164f8f]122 System.out.println("Parsed " + newOption);
[d4d8f61]123 }
124 }
125 }
[c164f8f]126
127 } else {
128 System.out.println("Parent div not found");
129 }
130 break;
131 default:
132 System.out.println("URL not recognized for parsing.");
[d4d8f61]133 }
134 }
[1c51912]135 private void scrapeOptionInfo(Option option) {
136 String url = option.getLink();
137 if(url.contains("magelantravel.mk")) {
138 System.out.println("Scraping info for " + option.getHotelName());
139 String[] dates = option.getDateRange().split(" - ");
140 url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
[d4d8f61]141
[1c51912]142 driver.get(url);
143 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
144 String pageSource = driver.getPageSource();
145 Document doc = Jsoup.parse(pageSource);
146 Elements roomOptions = doc.select(".tblroom > tbody > tr");
147 for (Element roomOption : roomOptions) {
148 String type = roomOption.select("a.tblroom-type").text();
[c164f8f]149
[1c51912]150 String board = roomOption.select(".rezervacija-objekt").text();
151 if(board.length() > 2){
152 board = board.substring(0,2);
153 }
154 if(board.isEmpty() || type.isEmpty()){
155 continue;
156 }
157 Elements amenityElement = roomOption.select(".objekt-opis");
158 String amenity = (amenityElement != null ? amenityElement.text() : "");
159 System.out.println(amenity + " " + board + " " + type );
160 String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
161 float price;
162 if (!priceText.isEmpty()) {
163 price = Float.parseFloat(priceText);
164 }else continue;
[c164f8f]165
[1c51912]166 DatabaseUtil.saveOptionDetails(option.getId(), type,board,amenity, price);
167 }
168 }
169 }
170 private Option optionParser(String data, int numPeople){
[d4d8f61]171 Document doc = Jsoup.parse(data);
172 Option created = new Option();
173 switch (url) {
[c164f8f]174 case "https://magelantravel.mk/":
175 created = parseMagelan(doc);
[53bad7e]176 created.setNumPeople(numPeople);
[d4d8f61]177 break;
178 case "https://booking.escapetravel.mk/":
179 created = parseEscapeTravel(doc);
[53bad7e]180 created.setNumPeople(numPeople);
[d4d8f61]181 break;
182 default:
183 System.out.println("URL not recognized for parsing.");
184 break;
185 }
186 if (created.isEmpty()) {
187 return null;
188 }
[1c51912]189 //scrapeOptionInfo(created);
[d4d8f61]190 return created;
191 }
192
[c164f8f]193 private Option parseMagelan(Document doc) {
[d4d8f61]194 Option created = new Option();
[c164f8f]195 Element linkElement = doc.selectFirst("div.ponuda-sredina");
196 int id = Integer.parseInt(linkElement.attr("data-id"));
197 int turop = Integer.parseInt(linkElement.attr("data-turop"));
198 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
199 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
200 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
201 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
202 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
[d4d8f61]203 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
[c164f8f]204 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
[d4d8f61]205 created.setCountry(countryElement != null ? countryElement.text() : null);
[1c51912]206 //Element priceElement = doc.selectFirst("div.ponuda-cena");
[c164f8f]207 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
208 created.setDateRange(dateElement != null ? dateElement.text() : null);
[1c51912]209 /*float price = Float.parseFloat(priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0");
210 created.setPrice(price);*/
[d4d8f61]211 return created;
212 }
213 private Option parseEscapeTravel(Document doc) {
214 Option created = new Option();
[c164f8f]215 Element card = doc.selectFirst("a.hotel-item");
216 String link = card.attr("href");
217 created.setLink(link);
218 created.setImgSrc(card.attr("data-picture"));
219 created.setHotelName(card.attr("data-title"));
220 Element countryP = doc.selectFirst("p.text-info");
[53bad7e]221 String country = countryP.text().replaceAll("leto hoteli", "");
222 created.setCountry(country);
[1c51912]223 /*Element priceElem = doc.selectFirst("span.hotel-price");
[c164f8f]224 String priceText = priceElem.text();
225 float price = 0;
226 if(!priceText.isEmpty()) {
227 price = Float.parseFloat(priceText.replace("€", ""));
228 }
[1c51912]229 created.setPrice(price);*/
[c164f8f]230 String[] queryParams = link.split("[?&]");
231 String startDateStr = null;
232 int nights = 0;
233 for (String param : queryParams) {
234 if (param.startsWith("Date=")) {
235 startDateStr = param.split("=")[1];
236 }
237 if (param.startsWith("Nights=")) {
238 nights = Integer.parseInt(param.split("=")[1]);
239 }
240 }
241 if (startDateStr != null && nights > 0)
242 {
243 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
244 try {
245 Date startDate = dateFormat.parse(startDateStr);
[d4d8f61]246
[c164f8f]247 Calendar calendar = Calendar.getInstance();
248 calendar.setTime(startDate);
249 calendar.add(Calendar.DAY_OF_YEAR, nights);
250 Date endDate = calendar.getTime();
251 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
252 created.setDateRange(dateRange);
253 }catch (ParseException e){
254 e.printStackTrace();
255 }
256 }
[d4d8f61]257 return created;
258 }
259
260 @Override
[c164f8f]261 public void run() {
262 System.out.println("Thread started for url: " + url);
263 initializeWebDriver();
264 if ("https://magelantravel.mk/".equals(url)) {
265 ObjectMapper mapper = new ObjectMapper();
266 try {
267 ClassLoader classLoader = getClass().getClassLoader();
268 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
269 JsonNode countries = root.get("countries");
270 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
271 Calendar calendar = Calendar.getInstance();
272 calendar.add(Calendar.DAY_OF_YEAR, 1);
273
274 for (int i = 0; i < 90; i++) { // next three months
275 String date = dateFormat.format(calendar.getTime());
276 for (JsonNode countryNode : countries) {
277 String country = countryNode.asText();
278 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
[53bad7e]279 for(int lugje = 1; lugje <= 4; lugje++) {
280 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
281 connectToWeb(queryUrl,lugje);
282 }
[c164f8f]283 }
284 }
285 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
286 }
287
288 } catch (IOException e) {
289 e.printStackTrace();
290 }
291 } else if ("https://booking.escapetravel.mk/".equals(url)) {
292 ObjectMapper mapper = new ObjectMapper();
293 try {
294 ClassLoader classLoader = getClass().getClassLoader();
295 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
[53bad7e]296 JsonNode countries = root.get("countries");
[c164f8f]297 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
298 Calendar calendar = Calendar.getInstance();
299 calendar.add(Calendar.DAY_OF_YEAR, 1);
300
301 for (int i = 0; i < 90; i++) { // next three months
302 String date = dateFormat.format(calendar.getTime());
303 for (JsonNode countryNode : countries) {
304 String country = countryNode.asText();
305 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
[53bad7e]306 for(int lugje = 1; lugje <= 4; lugje++) {
307 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
308 connectToWeb(queryUrl,lugje);
309 }
[c164f8f]310 }
[d4d8f61]311 }
[c164f8f]312 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
313 }
314 } catch (IOException e) {
315 e.printStackTrace();
[d4d8f61]316 }
[c164f8f]317 } else {
318 // Handle other URLs
[d4d8f61]319 }
[c164f8f]320 closeWebDriver();
321 latch.countDown();
[d4d8f61]322 }
[c164f8f]323
324}
Note: See TracBrowser for help on using the repository browser.