1 | import org.openqa.selenium.WebDriver;
|
---|
2 | import org.openqa.selenium.chrome.ChromeDriver;
|
---|
3 | import org.openqa.selenium.chrome.ChromeOptions;
|
---|
4 | import org.jsoup.Jsoup;
|
---|
5 | import org.jsoup.nodes.Document;
|
---|
6 | import org.jsoup.nodes.Element;
|
---|
7 | import org.jsoup.select.Elements;
|
---|
8 |
|
---|
9 | import java.util.concurrent.ConcurrentLinkedQueue;
|
---|
10 | import java.util.concurrent.CountDownLatch;
|
---|
11 |
|
---|
12 | public class ScraperThread extends Thread {
|
---|
13 | private String url;
|
---|
14 | private String destination;
|
---|
15 | private String departureDate;
|
---|
16 | private int numberOfPeople;
|
---|
17 | private ConcurrentLinkedQueue<Option> uniqueOptions;
|
---|
18 | private CountDownLatch latch;
|
---|
19 |
|
---|
20 | public ScraperThread(String url, String destination, String departureDate, int numberOfPeople, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
|
---|
21 | this.url = url;
|
---|
22 | this.destination = destination;
|
---|
23 | this.departureDate = departureDate;
|
---|
24 | this.numberOfPeople = numberOfPeople;
|
---|
25 | this.uniqueOptions = optionsQueue;
|
---|
26 | this.latch = latch;
|
---|
27 | }
|
---|
28 |
|
---|
29 | private void connectToWeb(String queryUrl) {
|
---|
30 | // Selenium
|
---|
31 | ChromeOptions options = new ChromeOptions();
|
---|
32 | options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"); // Path to Brave, remove for Chrome compatibility
|
---|
33 | options.addArguments("--headless"); // Run in headless mode
|
---|
34 | options.addArguments("--disable-gpu");
|
---|
35 | options.addArguments("--window-size=1920,1080");
|
---|
36 | options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); // User-Agent
|
---|
37 |
|
---|
38 | // chromeDriver
|
---|
39 | System.setProperty("webdriver.chrome.driver", "C:\\drivers\\chromedriver.exe");
|
---|
40 | System.setProperty("webdriver.http.factory", "jdk-http-client");
|
---|
41 | WebDriver driver = new ChromeDriver(options);
|
---|
42 | try {
|
---|
43 | // Navigate to URL
|
---|
44 | driver.get(queryUrl);
|
---|
45 | Thread.sleep(10000); // Sleep to fetch all data
|
---|
46 |
|
---|
47 | // Get page source
|
---|
48 | String pageSource = driver.getPageSource();
|
---|
49 | System.out.println("Thread " + Thread.currentThread().getId() + " connected to " + queryUrl);
|
---|
50 |
|
---|
51 | // Get only options
|
---|
52 | Document doc = Jsoup.parse(pageSource);
|
---|
53 | Element parentDiv;
|
---|
54 | Elements childDivs;
|
---|
55 | switch (url) {
|
---|
56 | case "https://www.fibula.com.mk/":
|
---|
57 | parentDiv = doc.selectFirst("div.flex.flex-col.gap-5");
|
---|
58 | if (parentDiv != null) {
|
---|
59 | childDivs = parentDiv.select("div");
|
---|
60 | for (Element div : childDivs) {
|
---|
61 | String data = div.html();
|
---|
62 | Option option = optionParser(data);
|
---|
63 | if (option != null) {
|
---|
64 | if (uniqueOptions.add(option)) {
|
---|
65 | System.out.println("Parsed Option: " + option);
|
---|
66 | }
|
---|
67 | }
|
---|
68 | }
|
---|
69 | } else {
|
---|
70 | System.out.println("Parent div not found");
|
---|
71 | }
|
---|
72 | break;
|
---|
73 | case "https://booking.escapetravel.mk/":
|
---|
74 | parentDiv = doc.selectFirst("div.container.pt-4.pt-md-6.scroll-into-view");
|
---|
75 | Element subParent;
|
---|
76 | System.out.println(parentDiv);
|
---|
77 | if(parentDiv != null) {
|
---|
78 | subParent = parentDiv.selectFirst("div.row");
|
---|
79 | }else{
|
---|
80 | System.out.println("Parent div not found");
|
---|
81 | break;
|
---|
82 | }
|
---|
83 |
|
---|
84 | if (subParent != null) {
|
---|
85 | childDivs = subParent.select("div.col-md-3");
|
---|
86 |
|
---|
87 | for (Element div : childDivs) {
|
---|
88 | String data = div.html();
|
---|
89 | Option option = optionParser(data);
|
---|
90 | if (option != null) {
|
---|
91 | if (uniqueOptions.add(option)) {
|
---|
92 | System.out.println("Parsed option: " + option);
|
---|
93 | }
|
---|
94 | }
|
---|
95 | }
|
---|
96 | }else {
|
---|
97 | System.out.println("subparent div not found");
|
---|
98 | }
|
---|
99 | break;
|
---|
100 | }
|
---|
101 | } catch (InterruptedException e) {
|
---|
102 | e.printStackTrace();
|
---|
103 | } finally {
|
---|
104 | driver.quit();
|
---|
105 | latch.countDown();
|
---|
106 | }
|
---|
107 | }
|
---|
108 |
|
---|
109 | private Option optionParser(String data) {
|
---|
110 | Document doc = Jsoup.parse(data);
|
---|
111 | Option created = new Option();
|
---|
112 |
|
---|
113 | switch (url) {
|
---|
114 | case "https://www.fibula.com.mk/":
|
---|
115 | created = parseFibula(doc);
|
---|
116 | break;
|
---|
117 | case "https://booking.escapetravel.mk/":
|
---|
118 | created = parseEscapeTravel(doc);
|
---|
119 | break;
|
---|
120 | default:
|
---|
121 | System.out.println("URL not recognized for parsing.");
|
---|
122 | break;
|
---|
123 | }
|
---|
124 |
|
---|
125 | if (created.isEmpty()) {
|
---|
126 | return null;
|
---|
127 | }
|
---|
128 |
|
---|
129 | return created;
|
---|
130 | }
|
---|
131 |
|
---|
132 | private Option parseFibula(Document doc) {
|
---|
133 | Option created = new Option();
|
---|
134 |
|
---|
135 | Element linkElement = doc.selectFirst("a[target='_blank']");
|
---|
136 | created.setLink(linkElement != null ? url + linkElement.attr("href") : null);
|
---|
137 |
|
---|
138 | Element imgElement = doc.selectFirst("div.md\\:aspect-none img");
|
---|
139 | created.setImgSrc(imgElement != null ? imgElement.attr("src") : null);
|
---|
140 |
|
---|
141 | Element hotelNameElement = doc.selectFirst("h5.text-md");
|
---|
142 | created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
|
---|
143 |
|
---|
144 | Element countryElement = doc.selectFirst("small.text-navy");
|
---|
145 | created.setCountry(countryElement != null ? countryElement.text() : null);
|
---|
146 |
|
---|
147 | Element priceElement = doc.selectFirst("small.line-through");
|
---|
148 | String price = priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0";
|
---|
149 | created.setPrice(price);
|
---|
150 |
|
---|
151 | return created;
|
---|
152 | }
|
---|
153 |
|
---|
154 | private Option parseEscapeTravel(Document doc) {
|
---|
155 | Option created = new Option();
|
---|
156 |
|
---|
157 | // Extract link
|
---|
158 | Element linkElement = doc.selectFirst("a[target='_blank']");
|
---|
159 | created.setLink(linkElement != null ? linkElement.attr("href") : null);
|
---|
160 |
|
---|
161 | // Extract image source
|
---|
162 | Element imgElement = doc.selectFirst("img.card-img-top");
|
---|
163 | created.setImgSrc(imgElement != null ? imgElement.attr("src") : null);
|
---|
164 |
|
---|
165 | // Extract hotel name
|
---|
166 | Element hotelNameElement = doc.selectFirst("h3.fw-bold.text-body.mb-2");
|
---|
167 | created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
|
---|
168 |
|
---|
169 | // Extract country/location
|
---|
170 | Element countryElement = doc.selectFirst("h5.fw-light.text-primary.mb-1");
|
---|
171 | created.setCountry(countryElement != null ? countryElement.text() : null);
|
---|
172 |
|
---|
173 | // Extract price
|
---|
174 | Element priceElement = doc.selectFirst("h4.fw-light.text-success.mb-0");
|
---|
175 | String price = priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0";
|
---|
176 | created.setPrice(price);
|
---|
177 |
|
---|
178 | return created;
|
---|
179 | }
|
---|
180 |
|
---|
181 |
|
---|
182 | @Override
|
---|
183 | public void run() {
|
---|
184 | System.out.println("Thread started for url: " + url);
|
---|
185 | StringBuilder builder = new StringBuilder();
|
---|
186 | builder.append(url);
|
---|
187 | String queryUrl;
|
---|
188 | switch (url) {
|
---|
189 | case "https://www.fibula.com.mk/":
|
---|
190 | builder.append("search?productType=2&"); // search for hotels
|
---|
191 | for (int i = 0; i < numberOfPeople; i++) { // add all passengers (default adults)
|
---|
192 | builder.append("passengers=1993-01-01&");
|
---|
193 | }
|
---|
194 | queryUrl = builder.toString();
|
---|
195 | System.out.println(queryUrl);
|
---|
196 | connectToWeb(queryUrl);
|
---|
197 | break;
|
---|
198 | case "https://booking.escapetravel.mk/":
|
---|
199 | builder.append("destinations?Category=&Search=&DateFrom=");
|
---|
200 | builder.append(departureDate);
|
---|
201 | builder.append("&Rooms=1&Adults=");
|
---|
202 | builder.append(numberOfPeople);
|
---|
203 | queryUrl = builder.toString();
|
---|
204 | System.out.println(queryUrl);
|
---|
205 | connectToWeb(queryUrl);
|
---|
206 | break;
|
---|
207 | default:
|
---|
208 | System.out.println("Not available for current url");
|
---|
209 | latch.countDown();
|
---|
210 | break;
|
---|
211 | }
|
---|
212 | }
|
---|
213 | }
|
---|