[d4d8f61] | 1 | import org.openqa.selenium.WebDriver;
|
---|
| 2 | import org.openqa.selenium.chrome.ChromeDriver;
|
---|
| 3 | import org.openqa.selenium.chrome.ChromeOptions;
|
---|
| 4 | import org.jsoup.Jsoup;
|
---|
| 5 | import org.jsoup.nodes.Document;
|
---|
| 6 | import org.jsoup.nodes.Element;
|
---|
| 7 | import org.jsoup.select.Elements;
|
---|
| 8 |
|
---|
| 9 | import java.util.concurrent.ConcurrentLinkedQueue;
|
---|
| 10 | import java.util.concurrent.CountDownLatch;
|
---|
| 11 |
|
---|
| 12 | public class ScraperThread extends Thread {
|
---|
| 13 | private String url;
|
---|
| 14 | private String destination;
|
---|
| 15 | private String departureDate;
|
---|
| 16 | private int numberOfPeople;
|
---|
| 17 | private ConcurrentLinkedQueue<Option> uniqueOptions;
|
---|
| 18 | private CountDownLatch latch;
|
---|
| 19 |
|
---|
| 20 | public ScraperThread(String url, String destination, String departureDate, int numberOfPeople, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
|
---|
| 21 | this.url = url;
|
---|
| 22 | this.destination = destination;
|
---|
| 23 | this.departureDate = departureDate;
|
---|
| 24 | this.numberOfPeople = numberOfPeople;
|
---|
| 25 | this.uniqueOptions = optionsQueue;
|
---|
| 26 | this.latch = latch;
|
---|
| 27 | }
|
---|
| 28 |
|
---|
| 29 | private void connectToWeb(String queryUrl) {
|
---|
| 30 | // Selenium
|
---|
| 31 | ChromeOptions options = new ChromeOptions();
|
---|
| 32 | options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"); // Path to Brave, remove for Chrome compatibility
|
---|
| 33 | options.addArguments("--headless"); // Run in headless mode
|
---|
| 34 | options.addArguments("--disable-gpu");
|
---|
| 35 | options.addArguments("--window-size=1920,1080");
|
---|
| 36 | options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); // User-Agent
|
---|
| 37 |
|
---|
| 38 | // chromeDriver
|
---|
| 39 | System.setProperty("webdriver.chrome.driver", "C:\\drivers\\chromedriver.exe");
|
---|
| 40 | System.setProperty("webdriver.http.factory", "jdk-http-client");
|
---|
| 41 | WebDriver driver = new ChromeDriver(options);
|
---|
| 42 | try {
|
---|
| 43 | // Navigate to URL
|
---|
| 44 | driver.get(queryUrl);
|
---|
| 45 | Thread.sleep(10000); // Sleep to fetch all data
|
---|
| 46 |
|
---|
| 47 | // Get page source
|
---|
| 48 | String pageSource = driver.getPageSource();
|
---|
| 49 | System.out.println("Thread " + Thread.currentThread().getId() + " connected to " + queryUrl);
|
---|
| 50 |
|
---|
| 51 | // Get only options
|
---|
| 52 | Document doc = Jsoup.parse(pageSource);
|
---|
| 53 | Element parentDiv;
|
---|
| 54 | Elements childDivs;
|
---|
| 55 | switch (url) {
|
---|
| 56 | case "https://www.fibula.com.mk/":
|
---|
| 57 | parentDiv = doc.selectFirst("div.flex.flex-col.gap-5");
|
---|
| 58 | if (parentDiv != null) {
|
---|
| 59 | childDivs = parentDiv.select("div");
|
---|
| 60 | for (Element div : childDivs) {
|
---|
| 61 | String data = div.html();
|
---|
| 62 | Option option = optionParser(data);
|
---|
| 63 | if (option != null) {
|
---|
| 64 | if (uniqueOptions.add(option)) {
|
---|
| 65 | System.out.println("Parsed Option: " + option);
|
---|
| 66 | }
|
---|
| 67 | }
|
---|
| 68 | }
|
---|
| 69 | } else {
|
---|
| 70 | System.out.println("Parent div not found");
|
---|
| 71 | }
|
---|
| 72 | break;
|
---|
| 73 | case "https://booking.escapetravel.mk/":
|
---|
| 74 | parentDiv = doc.selectFirst("div.container.pt-4.pt-md-6.scroll-into-view");
|
---|
| 75 | Element subParent;
|
---|
| 76 | System.out.println(parentDiv);
|
---|
| 77 | if(parentDiv != null) {
|
---|
| 78 | subParent = parentDiv.selectFirst("div.row");
|
---|
| 79 | }else{
|
---|
| 80 | System.out.println("Parent div not found");
|
---|
| 81 | break;
|
---|
| 82 | }
|
---|
| 83 |
|
---|
| 84 | if (subParent != null) {
|
---|
| 85 | childDivs = subParent.select("div.col-md-3");
|
---|
| 86 |
|
---|
| 87 | for (Element div : childDivs) {
|
---|
| 88 | String data = div.html();
|
---|
| 89 | Option option = optionParser(data);
|
---|
| 90 | if (option != null) {
|
---|
| 91 | if (uniqueOptions.add(option)) {
|
---|
| 92 | System.out.println("Parsed option: " + option);
|
---|
| 93 | }
|
---|
| 94 | }
|
---|
| 95 | }
|
---|
| 96 | }else {
|
---|
| 97 | System.out.println("subparent div not found");
|
---|
| 98 | }
|
---|
| 99 | break;
|
---|
| 100 | }
|
---|
| 101 | } catch (InterruptedException e) {
|
---|
| 102 | e.printStackTrace();
|
---|
| 103 | } finally {
|
---|
| 104 | driver.quit();
|
---|
| 105 | latch.countDown();
|
---|
| 106 | }
|
---|
| 107 | }
|
---|
| 108 |
|
---|
| 109 | private Option optionParser(String data) {
|
---|
| 110 | Document doc = Jsoup.parse(data);
|
---|
| 111 | Option created = new Option();
|
---|
| 112 |
|
---|
| 113 | switch (url) {
|
---|
| 114 | case "https://www.fibula.com.mk/":
|
---|
| 115 | created = parseFibula(doc);
|
---|
| 116 | break;
|
---|
| 117 | case "https://booking.escapetravel.mk/":
|
---|
| 118 | created = parseEscapeTravel(doc);
|
---|
| 119 | break;
|
---|
| 120 | default:
|
---|
| 121 | System.out.println("URL not recognized for parsing.");
|
---|
| 122 | break;
|
---|
| 123 | }
|
---|
| 124 |
|
---|
| 125 | if (created.isEmpty()) {
|
---|
| 126 | return null;
|
---|
| 127 | }
|
---|
| 128 |
|
---|
| 129 | return created;
|
---|
| 130 | }
|
---|
| 131 |
|
---|
| 132 | private Option parseFibula(Document doc) {
|
---|
| 133 | Option created = new Option();
|
---|
| 134 |
|
---|
| 135 | Element linkElement = doc.selectFirst("a[target='_blank']");
|
---|
| 136 | created.setLink(linkElement != null ? url + linkElement.attr("href") : null);
|
---|
| 137 |
|
---|
| 138 | Element imgElement = doc.selectFirst("div.md\\:aspect-none img");
|
---|
| 139 | created.setImgSrc(imgElement != null ? imgElement.attr("src") : null);
|
---|
| 140 |
|
---|
| 141 | Element hotelNameElement = doc.selectFirst("h5.text-md");
|
---|
| 142 | created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
|
---|
| 143 |
|
---|
| 144 | Element countryElement = doc.selectFirst("small.text-navy");
|
---|
| 145 | created.setCountry(countryElement != null ? countryElement.text() : null);
|
---|
| 146 |
|
---|
| 147 | Element priceElement = doc.selectFirst("small.line-through");
|
---|
| 148 | String price = priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0";
|
---|
| 149 | created.setPrice(price);
|
---|
| 150 |
|
---|
| 151 | return created;
|
---|
| 152 | }
|
---|
| 153 |
|
---|
| 154 | private Option parseEscapeTravel(Document doc) {
|
---|
| 155 | Option created = new Option();
|
---|
| 156 |
|
---|
| 157 | // Extract link
|
---|
| 158 | Element linkElement = doc.selectFirst("a[target='_blank']");
|
---|
| 159 | created.setLink(linkElement != null ? linkElement.attr("href") : null);
|
---|
| 160 |
|
---|
| 161 | // Extract image source
|
---|
| 162 | Element imgElement = doc.selectFirst("img.card-img-top");
|
---|
| 163 | created.setImgSrc(imgElement != null ? imgElement.attr("src") : null);
|
---|
| 164 |
|
---|
| 165 | // Extract hotel name
|
---|
| 166 | Element hotelNameElement = doc.selectFirst("h3.fw-bold.text-body.mb-2");
|
---|
| 167 | created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
|
---|
| 168 |
|
---|
| 169 | // Extract country/location
|
---|
| 170 | Element countryElement = doc.selectFirst("h5.fw-light.text-primary.mb-1");
|
---|
| 171 | created.setCountry(countryElement != null ? countryElement.text() : null);
|
---|
| 172 |
|
---|
| 173 | // Extract price
|
---|
| 174 | Element priceElement = doc.selectFirst("h4.fw-light.text-success.mb-0");
|
---|
| 175 | String price = priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0";
|
---|
| 176 | created.setPrice(price);
|
---|
| 177 |
|
---|
| 178 | return created;
|
---|
| 179 | }
|
---|
| 180 |
|
---|
| 181 |
|
---|
| 182 | @Override
|
---|
| 183 | public void run() {
|
---|
| 184 | System.out.println("Thread started for url: " + url);
|
---|
| 185 | StringBuilder builder = new StringBuilder();
|
---|
| 186 | builder.append(url);
|
---|
| 187 | String queryUrl;
|
---|
| 188 | switch (url) {
|
---|
| 189 | case "https://www.fibula.com.mk/":
|
---|
| 190 | builder.append("search?productType=2&"); // search for hotels
|
---|
| 191 | for (int i = 0; i < numberOfPeople; i++) { // add all passengers (default adults)
|
---|
| 192 | builder.append("passengers=1993-01-01&");
|
---|
| 193 | }
|
---|
| 194 | queryUrl = builder.toString();
|
---|
| 195 | System.out.println(queryUrl);
|
---|
| 196 | connectToWeb(queryUrl);
|
---|
| 197 | break;
|
---|
| 198 | case "https://booking.escapetravel.mk/":
|
---|
| 199 | builder.append("destinations?Category=&Search=&DateFrom=");
|
---|
| 200 | builder.append(departureDate);
|
---|
| 201 | builder.append("&Rooms=1&Adults=");
|
---|
| 202 | builder.append(numberOfPeople);
|
---|
| 203 | queryUrl = builder.toString();
|
---|
| 204 | System.out.println(queryUrl);
|
---|
| 205 | connectToWeb(queryUrl);
|
---|
| 206 | break;
|
---|
| 207 | default:
|
---|
| 208 | System.out.println("Not available for current url");
|
---|
| 209 | latch.countDown();
|
---|
| 210 | break;
|
---|
| 211 | }
|
---|
| 212 | }
|
---|
| 213 | }
|
---|