[c164f8f] | 1 | import com.fasterxml.jackson.databind.JsonNode;
|
---|
| 2 | import com.fasterxml.jackson.databind.ObjectMapper;
|
---|
| 3 | import org.openqa.selenium.By;
|
---|
[d4d8f61] | 4 | import org.openqa.selenium.WebDriver;
|
---|
[c164f8f] | 5 | import org.openqa.selenium.WebElement;
|
---|
[d4d8f61] | 6 | import org.openqa.selenium.chrome.ChromeDriver;
|
---|
| 7 | import org.openqa.selenium.chrome.ChromeOptions;
|
---|
| 8 | import org.jsoup.Jsoup;
|
---|
| 9 | import org.jsoup.nodes.Document;
|
---|
| 10 | import org.jsoup.nodes.Element;
|
---|
| 11 | import org.jsoup.select.Elements;
|
---|
[c164f8f] | 12 | import org.openqa.selenium.support.ui.ExpectedCondition;
|
---|
| 13 | import org.openqa.selenium.support.ui.ExpectedConditions;
|
---|
| 14 | import org.openqa.selenium.support.ui.WebDriverWait;
|
---|
[d4d8f61] | 15 |
|
---|
[c164f8f] | 16 | import java.io.File;
|
---|
| 17 | import java.io.IOException;
|
---|
| 18 | import java.sql.Connection;
|
---|
| 19 | import java.sql.DriverManager;
|
---|
| 20 | import java.sql.PreparedStatement;
|
---|
| 21 | import java.sql.SQLException;
|
---|
| 22 | import java.text.ParseException;
|
---|
| 23 | import java.text.SimpleDateFormat;
|
---|
| 24 | import java.util.*;
|
---|
[d4d8f61] | 25 | import java.util.concurrent.ConcurrentLinkedQueue;
|
---|
| 26 | import java.util.concurrent.CountDownLatch;
|
---|
| 27 |
|
---|
| 28 | public class ScraperThread extends Thread {
|
---|
| 29 | private String url;
|
---|
| 30 | private ConcurrentLinkedQueue<Option> uniqueOptions;
|
---|
| 31 | private CountDownLatch latch;
|
---|
[c164f8f] | 32 | private Set<Option> optionSet;
|
---|
[d4d8f61] | 33 |
|
---|
/**
 * Creates a scraper bound to a single travel site.
 *
 * @param url          base URL of the site to scrape; the parsing switches in this class
 *                     dispatch on this exact string
 * @param optionsQueue queue supplied by the caller that receives every newly seen option
 * @param latch        latch that {@link #run()} counts down when it finishes
 */
public ScraperThread(String url, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
    // Per-instance de-duplication set; always starts empty.
    this.optionSet = new HashSet<>();
    this.latch = latch;
    this.uniqueOptions = optionsQueue;
    this.url = url;
}
| 40 |
|
---|
[1c51912] | 41 | public WebDriver driver;
|
---|
[c164f8f] | 42 |
|
---|
| 43 | private void initializeWebDriver() {
|
---|
| 44 | System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
|
---|
[d4d8f61] | 45 | ChromeOptions options = new ChromeOptions();
|
---|
[c164f8f] | 46 | options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
|
---|
| 47 | options.addArguments("--headless");
|
---|
[d4d8f61] | 48 | options.addArguments("--disable-gpu");
|
---|
[c164f8f] | 49 | options.addArguments("--remote-allow-origins=*");
|
---|
| 50 | options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
|
---|
| 51 | driver = new ChromeDriver(options);
|
---|
| 52 | }
|
---|
| 53 |
|
---|
| 54 | private void closeWebDriver() {
|
---|
| 55 | if (driver != null) {
|
---|
| 56 | driver.quit();
|
---|
| 57 | }
|
---|
| 58 | }
|
---|
| 59 |
|
---|
[53bad7e] | 60 | private void connectToWeb(String queryUrl, int numPeople) {
|
---|
[c164f8f] | 61 | driver.get(queryUrl);
|
---|
| 62 |
|
---|
| 63 | WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
|
---|
| 64 | switch (url) {
|
---|
| 65 | case "https://booking.escapetravel.mk/":
|
---|
| 66 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
|
---|
[53bad7e] | 67 | try { Thread.sleep(10000);} catch (InterruptedException e) { e.printStackTrace(); }//price fetch
|
---|
[c164f8f] | 68 | break;
|
---|
| 69 | case "https://magelantravel.mk/":
|
---|
| 70 | wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
|
---|
| 71 | break;
|
---|
| 72 | }
|
---|
| 73 |
|
---|
| 74 | String pageSource = driver.getPageSource();
|
---|
| 75 | System.out.println("Connected to " + queryUrl);
|
---|
| 76 | Document doc = Jsoup.parse(pageSource);
|
---|
| 77 | Element parentDiv;
|
---|
| 78 | Elements childDivs;
|
---|
| 79 |
|
---|
| 80 | switch (url) {
|
---|
| 81 | case "https://booking.escapetravel.mk/":
|
---|
| 82 | parentDiv = doc.selectFirst("#hotels-container");
|
---|
| 83 | if (parentDiv != null) {
|
---|
| 84 | childDivs = parentDiv.select("a.hotel-item");
|
---|
| 85 | for (Element div : childDivs) {
|
---|
| 86 | String data = div.outerHtml();
|
---|
[53bad7e] | 87 | Option option = optionParser(data,numPeople);
|
---|
[c164f8f] | 88 | if (option != null) {
|
---|
| 89 | Option existingOption = DatabaseUtil.findOption(option);
|
---|
| 90 | if (existingOption != null) {
|
---|
[1c51912] | 91 | if (existingOption.equals(option)) {
|
---|
[c164f8f] | 92 | option.setPriceChanged(true);
|
---|
| 93 | option.setNewPrice(option.getPrice());
|
---|
[d4d8f61] | 94 | }
|
---|
[c164f8f] | 95 | DatabaseUtil.updateOptionInDatabase(option);
|
---|
| 96 | } else if (optionSet.add(option)) {
|
---|
| 97 | uniqueOptions.add(option);
|
---|
| 98 | DatabaseUtil.saveOptionToDatabase(option);
|
---|
| 99 | System.out.println("Parsed " + option);
|
---|
[d4d8f61] | 100 | }
|
---|
| 101 | }
|
---|
| 102 | }
|
---|
[c164f8f] | 103 | } else {
|
---|
| 104 | System.out.println("Parent div not found");
|
---|
| 105 | }
|
---|
| 106 | break;
|
---|
| 107 | case "https://magelantravel.mk/":
|
---|
| 108 | parentDiv = doc.selectFirst("div.sodrzina");
|
---|
| 109 | if (parentDiv != null) {
|
---|
| 110 | childDivs = parentDiv.select("div.destinacija");
|
---|
| 111 | childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
|
---|
| 112 | System.out.println("Filtered childDivs size: " + childDivs.size());
|
---|
| 113 | for (Element div : childDivs) {
|
---|
| 114 | String data = div.outerHtml();
|
---|
[53bad7e] | 115 | Option newOption = optionParser(data,numPeople);
|
---|
[c164f8f] | 116 | if (newOption != null) {
|
---|
[1c51912] | 117 | if (optionSet.add(newOption)) {
|
---|
[c164f8f] | 118 | uniqueOptions.add(newOption);
|
---|
[1c51912] | 119 |
|
---|
| 120 | newOption.setId(DatabaseUtil.saveOptionToDatabase(newOption));
|
---|
| 121 | scrapeOptionInfo(newOption);
|
---|
[c164f8f] | 122 | System.out.println("Parsed " + newOption);
|
---|
[d4d8f61] | 123 | }
|
---|
| 124 | }
|
---|
| 125 | }
|
---|
[c164f8f] | 126 |
|
---|
| 127 | } else {
|
---|
| 128 | System.out.println("Parent div not found");
|
---|
| 129 | }
|
---|
| 130 | break;
|
---|
| 131 | default:
|
---|
| 132 | System.out.println("URL not recognized for parsing.");
|
---|
[d4d8f61] | 133 | }
|
---|
| 134 | }
|
---|
[1c51912] | 135 | private void scrapeOptionInfo(Option option) {
|
---|
| 136 | String url = option.getLink();
|
---|
| 137 | if(url.contains("magelantravel.mk")) {
|
---|
| 138 | System.out.println("Scraping info for " + option.getHotelName());
|
---|
| 139 | String[] dates = option.getDateRange().split(" - ");
|
---|
| 140 | url += "&checkin=" + dates[0] + "&checkout=" + dates[1] + "&adult=" + option.getNumPeople();
|
---|
[d4d8f61] | 141 |
|
---|
[1c51912] | 142 | driver.get(url);
|
---|
| 143 | try { Thread.sleep(5000); } catch (InterruptedException e) { e.printStackTrace(); } //data fetch
|
---|
| 144 | String pageSource = driver.getPageSource();
|
---|
| 145 | Document doc = Jsoup.parse(pageSource);
|
---|
| 146 | Elements roomOptions = doc.select(".tblroom > tbody > tr");
|
---|
| 147 | for (Element roomOption : roomOptions) {
|
---|
| 148 | String type = roomOption.select("a.tblroom-type").text();
|
---|
[c164f8f] | 149 |
|
---|
[1c51912] | 150 | String board = roomOption.select(".rezervacija-objekt").text();
|
---|
| 151 | if(board.length() > 2){
|
---|
| 152 | board = board.substring(0,2);
|
---|
| 153 | }
|
---|
| 154 | if(board.isEmpty() || type.isEmpty()){
|
---|
| 155 | continue;
|
---|
| 156 | }
|
---|
| 157 | Elements amenityElement = roomOption.select(".objekt-opis");
|
---|
| 158 | String amenity = (amenityElement != null ? amenityElement.text() : "");
|
---|
| 159 | System.out.println(amenity + " " + board + " " + type );
|
---|
| 160 | String priceText = roomOption.select(".tbl-cena").text().replace("€", "").trim();
|
---|
| 161 | float price;
|
---|
| 162 | if (!priceText.isEmpty()) {
|
---|
| 163 | price = Float.parseFloat(priceText);
|
---|
| 164 | }else continue;
|
---|
[c164f8f] | 165 |
|
---|
[1c51912] | 166 | DatabaseUtil.saveOptionDetails(option.getId(), type,board,amenity, price);
|
---|
| 167 | }
|
---|
| 168 | }
|
---|
| 169 | }
|
---|
| 170 | private Option optionParser(String data, int numPeople){
|
---|
[d4d8f61] | 171 | Document doc = Jsoup.parse(data);
|
---|
| 172 | Option created = new Option();
|
---|
| 173 | switch (url) {
|
---|
[c164f8f] | 174 | case "https://magelantravel.mk/":
|
---|
| 175 | created = parseMagelan(doc);
|
---|
[53bad7e] | 176 | created.setNumPeople(numPeople);
|
---|
[d4d8f61] | 177 | break;
|
---|
| 178 | case "https://booking.escapetravel.mk/":
|
---|
| 179 | created = parseEscapeTravel(doc);
|
---|
[53bad7e] | 180 | created.setNumPeople(numPeople);
|
---|
[d4d8f61] | 181 | break;
|
---|
| 182 | default:
|
---|
| 183 | System.out.println("URL not recognized for parsing.");
|
---|
| 184 | break;
|
---|
| 185 | }
|
---|
| 186 | if (created.isEmpty()) {
|
---|
| 187 | return null;
|
---|
| 188 | }
|
---|
[1c51912] | 189 | //scrapeOptionInfo(created);
|
---|
[d4d8f61] | 190 | return created;
|
---|
| 191 | }
|
---|
| 192 |
|
---|
[c164f8f] | 193 | private Option parseMagelan(Document doc) {
|
---|
[d4d8f61] | 194 | Option created = new Option();
|
---|
[c164f8f] | 195 | Element linkElement = doc.selectFirst("div.ponuda-sredina");
|
---|
| 196 | int id = Integer.parseInt(linkElement.attr("data-id"));
|
---|
| 197 | int turop = Integer.parseInt(linkElement.attr("data-turop"));
|
---|
| 198 | created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
|
---|
| 199 | Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
|
---|
| 200 | created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
|
---|
| 201 | .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
|
---|
| 202 | Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
|
---|
[d4d8f61] | 203 | created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
|
---|
[c164f8f] | 204 | Element countryElement = doc.selectFirst("l.ponuda-lokacija");
|
---|
[d4d8f61] | 205 | created.setCountry(countryElement != null ? countryElement.text() : null);
|
---|
[1c51912] | 206 | //Element priceElement = doc.selectFirst("div.ponuda-cena");
|
---|
[c164f8f] | 207 | Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
|
---|
| 208 | created.setDateRange(dateElement != null ? dateElement.text() : null);
|
---|
[1c51912] | 209 | /*float price = Float.parseFloat(priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0");
|
---|
| 210 | created.setPrice(price);*/
|
---|
[d4d8f61] | 211 | return created;
|
---|
| 212 | }
|
---|
| 213 | private Option parseEscapeTravel(Document doc) {
|
---|
| 214 | Option created = new Option();
|
---|
[c164f8f] | 215 | Element card = doc.selectFirst("a.hotel-item");
|
---|
| 216 | String link = card.attr("href");
|
---|
| 217 | created.setLink(link);
|
---|
| 218 | created.setImgSrc(card.attr("data-picture"));
|
---|
| 219 | created.setHotelName(card.attr("data-title"));
|
---|
| 220 | Element countryP = doc.selectFirst("p.text-info");
|
---|
[53bad7e] | 221 | String country = countryP.text().replaceAll("leto hoteli", "");
|
---|
| 222 | created.setCountry(country);
|
---|
[1c51912] | 223 | /*Element priceElem = doc.selectFirst("span.hotel-price");
|
---|
[c164f8f] | 224 | String priceText = priceElem.text();
|
---|
| 225 | float price = 0;
|
---|
| 226 | if(!priceText.isEmpty()) {
|
---|
| 227 | price = Float.parseFloat(priceText.replace("€", ""));
|
---|
| 228 | }
|
---|
[1c51912] | 229 | created.setPrice(price);*/
|
---|
[c164f8f] | 230 | String[] queryParams = link.split("[?&]");
|
---|
| 231 | String startDateStr = null;
|
---|
| 232 | int nights = 0;
|
---|
| 233 | for (String param : queryParams) {
|
---|
| 234 | if (param.startsWith("Date=")) {
|
---|
| 235 | startDateStr = param.split("=")[1];
|
---|
| 236 | }
|
---|
| 237 | if (param.startsWith("Nights=")) {
|
---|
| 238 | nights = Integer.parseInt(param.split("=")[1]);
|
---|
| 239 | }
|
---|
| 240 | }
|
---|
| 241 | if (startDateStr != null && nights > 0)
|
---|
| 242 | {
|
---|
| 243 | SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
|
---|
| 244 | try {
|
---|
| 245 | Date startDate = dateFormat.parse(startDateStr);
|
---|
[d4d8f61] | 246 |
|
---|
[c164f8f] | 247 | Calendar calendar = Calendar.getInstance();
|
---|
| 248 | calendar.setTime(startDate);
|
---|
| 249 | calendar.add(Calendar.DAY_OF_YEAR, nights);
|
---|
| 250 | Date endDate = calendar.getTime();
|
---|
| 251 | String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
|
---|
| 252 | created.setDateRange(dateRange);
|
---|
| 253 | }catch (ParseException e){
|
---|
| 254 | e.printStackTrace();
|
---|
| 255 | }
|
---|
| 256 | }
|
---|
[d4d8f61] | 257 | return created;
|
---|
| 258 | }
|
---|
| 259 |
|
---|
| 260 | @Override
|
---|
[c164f8f] | 261 | public void run() {
|
---|
| 262 | System.out.println("Thread started for url: " + url);
|
---|
| 263 | initializeWebDriver();
|
---|
| 264 | if ("https://magelantravel.mk/".equals(url)) {
|
---|
| 265 | ObjectMapper mapper = new ObjectMapper();
|
---|
| 266 | try {
|
---|
| 267 | ClassLoader classLoader = getClass().getClassLoader();
|
---|
| 268 | JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
|
---|
| 269 | JsonNode countries = root.get("countries");
|
---|
| 270 | SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
|
---|
| 271 | Calendar calendar = Calendar.getInstance();
|
---|
| 272 | calendar.add(Calendar.DAY_OF_YEAR, 1);
|
---|
| 273 |
|
---|
| 274 | for (int i = 0; i < 90; i++) { // next three months
|
---|
| 275 | String date = dateFormat.format(calendar.getTime());
|
---|
| 276 | for (JsonNode countryNode : countries) {
|
---|
| 277 | String country = countryNode.asText();
|
---|
| 278 | for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
|
---|
[53bad7e] | 279 | for(int lugje = 1; lugje <= 4; lugje++) {
|
---|
| 280 | String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=" + lugje + "&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
|
---|
| 281 | connectToWeb(queryUrl,lugje);
|
---|
| 282 | }
|
---|
[c164f8f] | 283 | }
|
---|
| 284 | }
|
---|
| 285 | calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
|
---|
| 286 | }
|
---|
| 287 |
|
---|
| 288 | } catch (IOException e) {
|
---|
| 289 | e.printStackTrace();
|
---|
| 290 | }
|
---|
| 291 | } else if ("https://booking.escapetravel.mk/".equals(url)) {
|
---|
| 292 | ObjectMapper mapper = new ObjectMapper();
|
---|
| 293 | try {
|
---|
| 294 | ClassLoader classLoader = getClass().getClassLoader();
|
---|
| 295 | JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
|
---|
[53bad7e] | 296 | JsonNode countries = root.get("countries");
|
---|
[c164f8f] | 297 | SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
|
---|
| 298 | Calendar calendar = Calendar.getInstance();
|
---|
| 299 | calendar.add(Calendar.DAY_OF_YEAR, 1);
|
---|
| 300 |
|
---|
| 301 | for (int i = 0; i < 90; i++) { // next three months
|
---|
| 302 | String date = dateFormat.format(calendar.getTime());
|
---|
| 303 | for (JsonNode countryNode : countries) {
|
---|
| 304 | String country = countryNode.asText();
|
---|
| 305 | for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
|
---|
[53bad7e] | 306 | for(int lugje = 1; lugje <= 4; lugje++) {
|
---|
| 307 | String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=" + lugje;
|
---|
| 308 | connectToWeb(queryUrl,lugje);
|
---|
| 309 | }
|
---|
[c164f8f] | 310 | }
|
---|
[d4d8f61] | 311 | }
|
---|
[c164f8f] | 312 | calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
|
---|
| 313 | }
|
---|
| 314 | } catch (IOException e) {
|
---|
| 315 | e.printStackTrace();
|
---|
[d4d8f61] | 316 | }
|
---|
[c164f8f] | 317 | } else {
|
---|
| 318 | // Handle other URLs
|
---|
[d4d8f61] | 319 | }
|
---|
[c164f8f] | 320 | closeWebDriver();
|
---|
| 321 | latch.countDown();
|
---|
[d4d8f61] | 322 | }
|
---|
[c164f8f] | 323 |
|
---|
| 324 | }
|
---|