source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java@ 1c51912

Last change on this file since 1c51912 was 1c51912, checked in by Kristijan <kristijanzafirovski26@…>, 5 days ago

Added details for magelan

  • Property mode set to 100644
File size: 14.6 KB
Line 
1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
4import org.openqa.selenium.WebDriver;
5import org.openqa.selenium.WebElement;
6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
15
16import java.io.File;
17import java.io.IOException;
18import java.sql.Connection;
19import java.sql.DriverManager;
20import java.sql.PreparedStatement;
21import java.sql.SQLException;
22import java.text.ParseException;
23import java.text.SimpleDateFormat;
24import java.util.*;
25import java.util.concurrent.ConcurrentLinkedQueue;
26import java.util.concurrent.CountDownLatch;
27
28public class ScraperThread extends Thread {
29 private String url;
30 private ConcurrentLinkedQueue<Option> uniqueOptions;
31 private CountDownLatch latch;
32 private Set<Option> optionSet;
33
34 public ScraperThread(String url, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
35 this.url = url;
36 this.uniqueOptions = optionsQueue;
37 this.latch = latch;
38 this.optionSet = new HashSet<>();
39 }
40
41 public WebDriver driver;
42
43 private void initializeWebDriver() {
44 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
45 ChromeOptions options = new ChromeOptions();
46 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
47 options.addArguments("--headless");
48 options.addArguments("--disable-gpu");
49 options.addArguments("--remote-allow-origins=*");
50 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
51 driver = new ChromeDriver(options);
52 }
53
54 private void closeWebDriver() {
55 if (driver != null) {
56 driver.quit();
57 }
58 }
59
60 private void connectToWeb(String queryUrl, int numPeople) {
61 driver.get(queryUrl);
62
63 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
64 switch (url) {
65 case "https://booking.escapetravel.mk/":
66 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
67 try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
68 break;
69 case "https://magelantravel.mk/":
70 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
71 break;
72 }
73
74 String pageSource = driver.getPageSource();
75 System.out.println("Connected to " + queryUrl);
76 Document doc = Jsoup.parse(pageSource);
77 Element parentDiv;
78 Elements childDivs;
79
80 switch (url) {
81 case "https://booking.escapetravel.mk/":
82 parentDiv = doc.selectFirst("#hotels-container");
83 if (parentDiv != null) {
84 childDivs = parentDiv.select("a.hotel-item");
85 for (Element div : childDivs) {
86 String data = div.outerHtml();
87 Option option = optionParser(data,numPeople);
88 if (option != null) {
89 Option existingOption = DatabaseUtil.findOption(option);
90 if (existingOption != null) {
91 if (existingOption.equals(option)) {
92 option.setPriceChanged(true);
93 option.setNewPrice(option.getPrice());
94 }
95 DatabaseUtil.updateOptionInDatabase(option);
96 } else if (optionSet.add(option)) {
97 uniqueOptions.add(option);
98 DatabaseUtil.saveOptionToDatabase(option);
99 System.out.println("Parsed " + option);
100 }
101 }
102 }
103 } else {
104 System.out.println("Parent div not found");
105 }
106 break;
107 case "https://magelantravel.mk/":
108 parentDiv = doc.selectFirst("div.sodrzina");
109 if (parentDiv != null) {
110 childDivs = parentDiv.select("div.destinacija");
111 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
112 System.out.println("Filtered childDivs size: " + childDivs.size());
113 for (Element div : childDivs) {
114 String data = div.outerHtml();
115 Option newOption = optionParser(data,numPeople);
116 if (newOption != null) {
117 if (optionSet.add(newOption)) {
118 uniqueOptions.add(newOption);
119
120 newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
121 scrapeOptionInfo(newOption);
122 System.out.println("Parsed " + newOption);
123 }
124 }
125 }
126
127 } else {
128 System.out.println("Parent div not found");
129 }
130 break;
131 default:
132 System.out.println("URL not recognized for parsing.");
133 }
134 }
135 private void scrapeOptionInfo(Option option) {
136 String url = option.getLink();
137 if(url.contains("magelantravel.mk")) {
138 System.out.println("Scraping info for " + option.getHotelName());
139 String[] dates = option.getDateRange().split(" - ");
140 url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
141
142 driver.get(url);
143 try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
144 String pageSource = driver.getPageSource();
145 Document doc = Jsoup.parse(pageSource);
146 Elements roomOptions = doc.select(".tblroom > tbody > tr");
147 for (Element roomOption : roomOptions) {
148 String type = roomOption.select("a.tblroom-type").text();
149
150 String board = roomOption.select(".rezervacija-objekt").text();
151 if(board.length() > 2){
152 board = board.substring(0,2);
153 }
154 if(board.isEmpty() || type.isEmpty()){
155 continue;
156 }
157 Elements amenityElement = roomOption.select(".objekt-opis");
158 String amenity = (amenityElement != null ? amenityElement.text() : "");
159 System.out.println(amenity + " " + board + " " + type );
160 String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
161 float price;
162 if (!priceText.isEmpty()) {
163 price = Float.parseFloat(priceText);
164 }else continue;
165
166 DatabaseUtil.saveOptionDetails(option.getId(), type,board,amenity, price);
167 }
168 }
169 }
170 private Option optionParser(String data, int numPeople){
171 Document doc = Jsoup.parse(data);
172 Option created = new Option();
173 switch (url) {
174 case "https://magelantravel.mk/":
175 created = parseMagelan(doc);
176 created.setNumPeople(numPeople);
177 break;
178 case "https://booking.escapetravel.mk/":
179 created = parseEscapeTravel(doc);
180 created.setNumPeople(numPeople);
181 break;
182 default:
183 System.out.println("URL not recognized for parsing.");
184 break;
185 }
186 if (created.isEmpty()) {
187 return null;
188 }
189 //scrapeOptionInfo(created);
190 return created;
191 }
192
193 private Option parseMagelan(Document doc) {
194 Option created = new Option();
195 Element linkElement = doc.selectFirst("div.ponuda-sredina");
196 int id = Integer.parseInt(linkElement.attr("data-id"));
197 int turop = Integer.parseInt(linkElement.attr("data-turop"));
198 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
199 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
200 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
201 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
202 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
203 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
204 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
205 created.setCountry(countryElement != null ? countryElement.text() : null);
206 //Element priceElement = doc.selectFirst("div.ponuda-cena");
207 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
208 created.setDateRange(dateElement != null ? dateElement.text() : null);
209 /*float price = Float.parseFloat(priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0");
210 created.setPrice(price);*/
211 return created;
212 }
213 private Option parseEscapeTravel(Document doc) {
214 Option created = new Option();
215 Element card = doc.selectFirst("a.hotel-item");
216 String link = card.attr("href");
217 created.setLink(link);
218 created.setImgSrc(card.attr("data-picture"));
219 created.setHotelName(card.attr("data-title"));
220 Element countryP = doc.selectFirst("p.text-info");
221 String country = countryP.text().replaceAll("leto hoteli", "");
222 created.setCountry(country);
223 /*Element priceElem = doc.selectFirst("span.hotel-price");
224 String priceText = priceElem.text();
225 float price = 0;
226 if(!priceText.isEmpty()) {
227 price = Float.parseFloat(priceText.replace("€", ""));
228 }
229 created.setPrice(price);*/
230 String[] queryParams = link.split("[?&]");
231 String startDateStr = null;
232 int nights = 0;
233 for (String param : queryParams) {
234 if (param.startsWith("Date=")) {
235 startDateStr = param.split("=")[1];
236 }
237 if (param.startsWith("Nights=")) {
238 nights = Integer.parseInt(param.split("=")[1]);
239 }
240 }
241 if (startDateStr != null && nights > 0)
242 {
243 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
244 try {
245 Date startDate = dateFormat.parse(startDateStr);
246
247 Calendar calendar = Calendar.getInstance();
248 calendar.setTime(startDate);
249 calendar.add(Calendar.DAY_OF_YEAR, nights);
250 Date endDate = calendar.getTime();
251 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
252 created.setDateRange(dateRange);
253 }catch (ParseException e){
254 e.printStackTrace();
255 }
256 }
257 return created;
258 }
259
260 @Override
261 public void run() {
262 System.out.println("Thread started for url: " + url);
263 initializeWebDriver();
264 if ("https://magelantravel.mk/".equals(url)) {
265 ObjectMapper mapper = new ObjectMapper();
266 try {
267 ClassLoader classLoader = getClass().getClassLoader();
268 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
269 JsonNode countries = root.get("countries");
270 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
271 Calendar calendar = Calendar.getInstance();
272 calendar.add(Calendar.DAY_OF_YEAR, 1);
273
274 for (int i = 0; i < 90; i++) { // next three months
275 String date = dateFormat.format(calendar.getTime());
276 for (JsonNode countryNode : countries) {
277 String country = countryNode.asText();
278 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
279 for(int lugje = 1; lugje <= 4; lugje++) {
280 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
281 connectToWeb(queryUrl,lugje);
282 }
283 }
284 }
285 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
286 }
287
288 } catch (IOException e) {
289 e.printStackTrace();
290 }
291 } else if ("https://booking.escapetravel.mk/".equals(url)) {
292 ObjectMapper mapper = new ObjectMapper();
293 try {
294 ClassLoader classLoader = getClass().getClassLoader();
295 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
296 JsonNode countries = root.get("countries");
297 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
298 Calendar calendar = Calendar.getInstance();
299 calendar.add(Calendar.DAY_OF_YEAR, 1);
300
301 for (int i = 0; i < 90; i++) { // next three months
302 String date = dateFormat.format(calendar.getTime());
303 for (JsonNode countryNode : countries) {
304 String country = countryNode.asText();
305 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
306 for(int lugje = 1; lugje <= 4; lugje++) {
307 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
308 connectToWeb(queryUrl,lugje);
309 }
310 }
311 }
312 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
313 }
314 } catch (IOException e) {
315 e.printStackTrace();
316 }
317 } else {
318 // Handle other URLs
319 }
320 closeWebDriver();
321 latch.countDown();
322 }
323
324}
Note: See TracBrowser for help on using the repository browser.