1 | import com.fasterxml.jackson.databind.JsonNode;
|
---|
2 | import com.fasterxml.jackson.databind.ObjectMapper;
|
---|
3 | import org.openqa.selenium.By;
|
---|
4 | import org.openqa.selenium.WebDriver;
|
---|
5 | import org.openqa.selenium.WebElement;
|
---|
6 | import org.openqa.selenium.chrome.ChromeDriver;
|
---|
7 | import org.openqa.selenium.chrome.ChromeOptions;
|
---|
8 | import org.jsoup.Jsoup;
|
---|
9 | import org.jsoup.nodes.Document;
|
---|
10 | import org.jsoup.nodes.Element;
|
---|
11 | import org.jsoup.select.Elements;
|
---|
12 | import org.openqa.selenium.support.ui.ExpectedCondition;
|
---|
13 | import org.openqa.selenium.support.ui.ExpectedConditions;
|
---|
14 | import org.openqa.selenium.support.ui.WebDriverWait;
|
---|
15 |
|
---|
16 | import javax.xml.crypto.Data;
|
---|
17 | import java.io.File;
|
---|
18 | import java.io.IOException;
|
---|
19 | import java.sql.Connection;
|
---|
20 | import java.sql.DriverManager;
|
---|
21 | import java.sql.PreparedStatement;
|
---|
22 | import java.sql.SQLException;
|
---|
23 | import java.text.ParseException;
|
---|
24 | import java.text.SimpleDateFormat;
|
---|
25 | import java.util.*;
|
---|
26 | import java.util.concurrent.ConcurrentLinkedQueue;
|
---|
27 | import java.util.concurrent.CountDownLatch;
|
---|
28 |
|
---|
29 | public class ScraperThread extends Thread {
|
---|
30 | private String url;
|
---|
31 | private CountDownLatch latch;
|
---|
32 |
|
---|
/**
 * Creates a scraper bound to a single travel site.
 *
 * @param url   base URL of the site to scrape; the rest of this class compares it against
 *              literal site URLs to pick selectors and query formats
 * @param latch latch that {@code run()} counts down when the scrape is over
 */
public ScraperThread(String url, CountDownLatch latch) {
    this.latch = latch;
    this.url = url;
}
|
---|
37 |
|
---|
38 | public WebDriver driver;
|
---|
39 |
|
---|
40 | private void initializeWebDriver() {
|
---|
41 | System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
|
---|
42 | ChromeOptions options = new ChromeOptions();
|
---|
43 | options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
|
---|
44 | options.addArguments("--headless");
|
---|
45 | options.addArguments("--disable-gpu");
|
---|
46 | options.addArguments("--remote-allow-origins=*");
|
---|
47 | options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
|
---|
48 | driver = new ChromeDriver(options);
|
---|
49 | }
|
---|
50 |
|
---|
51 | private void closeWebDriver() {
|
---|
52 | if (driver != null) {
|
---|
53 | driver.quit();
|
---|
54 | }
|
---|
55 | }
|
---|
56 |
|
---|
57 | private void connectToWeb(String queryUrl, int numPeople) {
|
---|
58 | driver.get(queryUrl);
|
---|
59 |
|
---|
60 | WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
|
---|
61 | switch (url) {
|
---|
62 | case "https://booking.escapetravel.mk/":
|
---|
63 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
|
---|
64 | try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
|
---|
65 | break;
|
---|
66 | case "https://magelantravel.mk/":
|
---|
67 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
|
---|
68 | break;
|
---|
69 | }
|
---|
70 |
|
---|
71 | String pageSource = driver.getPageSource();
|
---|
72 | System.out.println("Connected to " + queryUrl);
|
---|
73 | Document doc = Jsoup.parse(pageSource);
|
---|
74 | Element parentDiv;
|
---|
75 | Elements childDivs;
|
---|
76 |
|
---|
77 | switch (url) {
|
---|
78 | case "https://booking.escapetravel.mk/":
|
---|
79 | parentDiv = doc.selectFirst("#hotels-container");
|
---|
80 | if (parentDiv != null) {
|
---|
81 | childDivs = parentDiv.select("a.hotel-item");
|
---|
82 | for (Element div : childDivs) {
|
---|
83 | String data = div.outerHtml();
|
---|
84 | Option option = optionParser(data, numPeople);
|
---|
85 | if (option != null) {
|
---|
86 | option.setId(DatabaseUtil.saveOptionToDatabase(option));
|
---|
87 | scrapeOptionInfo(option);
|
---|
88 | System.out.println("Parsed " + option);
|
---|
89 | }
|
---|
90 | }
|
---|
91 | } else {
|
---|
92 | System.out.println("Parent div not found");
|
---|
93 | }
|
---|
94 | break;
|
---|
95 | case "https://magelantravel.mk/":
|
---|
96 | parentDiv = doc.selectFirst("div.sodrzina");
|
---|
97 | if (parentDiv != null) {
|
---|
98 | childDivs = parentDiv.select("div.destinacija");
|
---|
99 | childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
|
---|
100 | System.out.println("Filtered childDivs size: " + childDivs.size());
|
---|
101 | for (Element div : childDivs) {
|
---|
102 | String data = div.outerHtml();
|
---|
103 | Option newOption = optionParser(data, numPeople);
|
---|
104 | if (newOption != null) {
|
---|
105 | newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
|
---|
106 | scrapeOptionInfo(newOption);
|
---|
107 | System.out.println("Parsed " + newOption);
|
---|
108 | }
|
---|
109 | }
|
---|
110 | } else {
|
---|
111 | System.out.println("Parent div not found");
|
---|
112 | }
|
---|
113 | break;
|
---|
114 | default:
|
---|
115 | System.out.println("URL not recognized for parsing.");
|
---|
116 | }
|
---|
117 |
|
---|
118 | }
|
---|
119 | private void scrapeOptionInfo(Option option) {
|
---|
120 | String url = option.getLink();
|
---|
121 | if(url.contains("magelantravel.mk")) {
|
---|
122 | System.out.println("Scraping info for " + option.getHotelName());
|
---|
123 | String[] dates = option.getDateRange().split(" - ");
|
---|
124 | url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
|
---|
125 |
|
---|
126 | driver.get(url);
|
---|
127 | try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
|
---|
128 | String pageSource = driver.getPageSource();
|
---|
129 | Document doc = Jsoup.parse(pageSource);
|
---|
130 | Elements roomOptions = doc.select(".tblroom > tbody > tr");
|
---|
131 | for (Element roomOption : roomOptions) {
|
---|
132 | String type = roomOption.select("a.tblroom-type").text();
|
---|
133 |
|
---|
134 | String board = roomOption.select(".rezervacija-objekt").text();
|
---|
135 | if(board.length() > 2) {
|
---|
136 | board = board.substring(0, 2);
|
---|
137 | }
|
---|
138 | if(board.isEmpty() || type.isEmpty()) continue;
|
---|
139 |
|
---|
140 | Elements amenityElement = roomOption.select(".objekt-opis");
|
---|
141 | String amenity = (amenityElement != null ? amenityElement.text() : "");
|
---|
142 | System.out.println(amenity + " " + board + " " + type );
|
---|
143 | String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
|
---|
144 | float price;
|
---|
145 | if (!priceText.isEmpty()) {
|
---|
146 | price = Float.parseFloat(priceText);
|
---|
147 | }else continue;
|
---|
148 |
|
---|
149 | //Check for changes
|
---|
150 | int odId = checkForChanges(option.getId(), type, board,amenity,price);
|
---|
151 | if(odId != 0) { //true = changes found - update details
|
---|
152 | DatabaseUtil.updateOptionDetails(odId,type,board,amenity,price);
|
---|
153 | }else{ //false = not found / no changes - save regular
|
---|
154 | DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenity, price);
|
---|
155 | }
|
---|
156 | }
|
---|
157 | }
|
---|
158 | else if(url.contains("booking.escapetravel.mk")){
|
---|
159 | System.out.println("Scraping info for " + url);
|
---|
160 |
|
---|
161 | driver.get(url);
|
---|
162 | try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
|
---|
163 | String pageSource = driver.getPageSource();
|
---|
164 | Document doc = Jsoup.parse(pageSource);
|
---|
165 | Elements roomOptions = doc.select("#hotel-rooms-container .hotel-room-row");
|
---|
166 | for(Element roomOption : roomOptions){
|
---|
167 | String type = roomOption.select("td.align-middle").first().text();
|
---|
168 | String board = roomOption.select("td.align-middle.text-primary.lead").text();
|
---|
169 | if (board.isEmpty() || type.isEmpty()) continue;
|
---|
170 | String priceText = roomOption.select("td.align-middle.text-end .text-success.d-block.lead").text().replace("€", "").trim();
|
---|
171 | float price;
|
---|
172 | if (!priceText.isEmpty()) {
|
---|
173 | price = Float.parseFloat(priceText.replace(",", ""));
|
---|
174 | } else continue;
|
---|
175 |
|
---|
176 | Elements amenityElements = doc.select("div.row > div.col-6.col-md-3.col-xl-2");
|
---|
177 | StringBuilder amenities = new StringBuilder();
|
---|
178 | for (Element amenityElement : amenityElements) {
|
---|
179 | amenities.append(amenityElement.text()).append(", ");
|
---|
180 | }
|
---|
181 | if (!amenities.isEmpty()) {
|
---|
182 | amenities.setLength(amenities.length() - 2);
|
---|
183 | }
|
---|
184 | System.out.println(type + board + price + amenities);
|
---|
185 | int odId = checkForChanges(option.getId(), type, board,amenities.toString(),price);
|
---|
186 | if(odId != 0) { //true = changes found - update details
|
---|
187 | DatabaseUtil.updateOptionDetails(odId,type,board,amenities.toString(),price);
|
---|
188 | }else{ //false = not found / no changes - save regular
|
---|
189 | DatabaseUtil.saveOptionDetails(option.getId(), type, board, amenities.toString(), price);
|
---|
190 | }
|
---|
191 | }
|
---|
192 |
|
---|
193 | }
|
---|
194 | }
|
---|
195 | private int checkForChanges(int id, String type, String board, String amenities, float price){ //return true for changes, false for no changes
|
---|
196 | try {
|
---|
197 | List<Option> pooled = DatabaseUtil.poolOptionDetails(id);
|
---|
198 | if (pooled.isEmpty()) { //not saved = no changes - save regular
|
---|
199 | return 0;
|
---|
200 | }else{ //got the options saved details
|
---|
201 | for(Option o : pooled){
|
---|
202 | if(o.getType().equals(type) && o.getBoard().equals(board)){//for the room and board check amenity and price changes (Assumption type of room and board do not change)
|
---|
203 | if((!o.getAmenities().equals(amenities)) || o.getPrice() != price){
|
---|
204 | return o.getDetail_id(); //Change
|
---|
205 | }
|
---|
206 | }
|
---|
207 | }
|
---|
208 | }
|
---|
209 | }catch(SQLException e){
|
---|
210 | e.printStackTrace();
|
---|
211 | }
|
---|
212 | return 0; //no changes detected
|
---|
213 | }
|
---|
214 | private Option optionParser(String data, int numPeople){
|
---|
215 | Document doc = Jsoup.parse(data);
|
---|
216 | Option created = new Option();
|
---|
217 | switch (url) {
|
---|
218 | case "https://magelantravel.mk/":
|
---|
219 | created = parseMagelan(doc);
|
---|
220 | created.setNumPeople(numPeople);
|
---|
221 | break;
|
---|
222 | case "https://booking.escapetravel.mk/":
|
---|
223 | created = parseEscapeTravel(doc);
|
---|
224 | created.setNumPeople(numPeople);
|
---|
225 | break;
|
---|
226 | default:
|
---|
227 | System.out.println("URL not recognized for parsing.");
|
---|
228 | break;
|
---|
229 | }
|
---|
230 | if (created.isEmpty()) {
|
---|
231 | return null;
|
---|
232 | }
|
---|
233 | return created;
|
---|
234 | }
|
---|
235 |
|
---|
236 | private Option parseMagelan(Document doc) {
|
---|
237 | Option created = new Option();
|
---|
238 | Element linkElement = doc.selectFirst("div.ponuda-sredina");
|
---|
239 | int id = Integer.parseInt(linkElement.attr("data-id"));
|
---|
240 | int turop = Integer.parseInt(linkElement.attr("data-turop"));
|
---|
241 | created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
|
---|
242 | Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
|
---|
243 | created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
|
---|
244 | .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
|
---|
245 | Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
|
---|
246 | created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
|
---|
247 | Element countryElement = doc.selectFirst("l.ponuda-lokacija");
|
---|
248 | created.setCountry(countryElement != null ? countryElement.text() : null);
|
---|
249 | Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
|
---|
250 | created.setDateRange(dateElement != null ? dateElement.text() : null);
|
---|
251 | return created;
|
---|
252 | }
|
---|
253 | private Option parseEscapeTravel(Document doc) {
|
---|
254 | Option created = new Option();
|
---|
255 | Element card = doc.selectFirst("a.hotel-item");
|
---|
256 | String link = card.attr("href");
|
---|
257 | created.setLink(link);
|
---|
258 | created.setImgSrc(card.attr("data-picture"));
|
---|
259 | created.setHotelName(card.attr("data-title"));
|
---|
260 | Element countryP = doc.selectFirst("p.text-info");
|
---|
261 | String country = countryP.text().replaceAll("leto hoteli", "");
|
---|
262 | created.setCountry(country);
|
---|
263 | String[] queryParams = link.split("[?&]");
|
---|
264 | String startDateStr = null;
|
---|
265 | int nights = 0;
|
---|
266 | for (String param : queryParams) {
|
---|
267 | if (param.startsWith("Date=")) {
|
---|
268 | startDateStr = param.split("=")[1];
|
---|
269 | }
|
---|
270 | if (param.startsWith("Nights=")) {
|
---|
271 | nights = Integer.parseInt(param.split("=")[1]);
|
---|
272 | }
|
---|
273 | }
|
---|
274 | if (startDateStr != null && nights > 0)
|
---|
275 | {
|
---|
276 | SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
|
---|
277 | try {
|
---|
278 | Date startDate = dateFormat.parse(startDateStr);
|
---|
279 |
|
---|
280 | Calendar calendar = Calendar.getInstance();
|
---|
281 | calendar.setTime(startDate);
|
---|
282 | calendar.add(Calendar.DAY_OF_YEAR, nights);
|
---|
283 | Date endDate = calendar.getTime();
|
---|
284 | String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
|
---|
285 | created.setDateRange(dateRange);
|
---|
286 | }catch (ParseException e){
|
---|
287 | e.printStackTrace();
|
---|
288 | }
|
---|
289 | }
|
---|
290 | return created;
|
---|
291 | }
|
---|
292 |
|
---|
293 | @Override
|
---|
294 | public void run() {
|
---|
295 | System.out.println("Thread started for url: " + url);
|
---|
296 | initializeWebDriver();
|
---|
297 | if ("https://magelantravel.mk/".equals(url)) {
|
---|
298 | ObjectMapper mapper = new ObjectMapper();
|
---|
299 | try {
|
---|
300 | ClassLoader classLoader = getClass().getClassLoader();
|
---|
301 | JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
|
---|
302 | JsonNode countries = root.get("countries");
|
---|
303 | SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
|
---|
304 | Calendar calendar = Calendar.getInstance();
|
---|
305 | calendar.add(Calendar.DAY_OF_YEAR, 1);
|
---|
306 |
|
---|
307 | for (int i = 0; i < 90; i++) { // next three months
|
---|
308 | String date = dateFormat.format(calendar.getTime());
|
---|
309 | for (JsonNode countryNode : countries) {
|
---|
310 | String country = countryNode.asText();
|
---|
311 | for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
|
---|
312 | for(int lugje = 1; lugje <= 4; lugje++) {
|
---|
313 | String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
|
---|
314 | connectToWeb(queryUrl,lugje);
|
---|
315 | }
|
---|
316 | }
|
---|
317 | }
|
---|
318 | calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
|
---|
319 | }
|
---|
320 |
|
---|
321 | } catch (IOException e) {
|
---|
322 | e.printStackTrace();
|
---|
323 | }
|
---|
324 | } else if ("https://booking.escapetravel.mk/".equals(url)) {
|
---|
325 | ObjectMapper mapper = new ObjectMapper();
|
---|
326 | try {
|
---|
327 | ClassLoader classLoader = getClass().getClassLoader();
|
---|
328 | JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
|
---|
329 | JsonNode countries = root.get("countries");
|
---|
330 | SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
|
---|
331 | Calendar calendar = Calendar.getInstance();
|
---|
332 | calendar.add(Calendar.DAY_OF_YEAR, 1);
|
---|
333 |
|
---|
334 | for (int i = 0; i < 90; i++) { // next three months
|
---|
335 | String date = dateFormat.format(calendar.getTime());
|
---|
336 | for (JsonNode countryNode : countries) {
|
---|
337 | String country = countryNode.asText();
|
---|
338 | for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
|
---|
339 | for(int lugje = 1; lugje <= 4; lugje++) {
|
---|
340 | String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
|
---|
341 | connectToWeb(queryUrl,lugje);
|
---|
342 | }
|
---|
343 | }
|
---|
344 | }
|
---|
345 | calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
|
---|
346 | }
|
---|
347 | } catch (IOException e) {
|
---|
348 | e.printStackTrace();
|
---|
349 | }
|
---|
350 | } else {
|
---|
351 | // Handle other URLs
|
---|
352 | }
|
---|
353 | closeWebDriver();
|
---|
354 | latch.countDown();
|
---|
355 | }
|
---|
356 |
|
---|
357 | }
|
---|