source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java@ d4d8f61

Last change on this file since d4d8f61 was d4d8f61, checked in by Kikac324 <kristijanzafirovski26@…>, 7 months ago

init

  • Property mode set to 100644
File size: 8.5 KB
Line 
1import org.openqa.selenium.WebDriver;
2import org.openqa.selenium.chrome.ChromeDriver;
3import org.openqa.selenium.chrome.ChromeOptions;
4import org.jsoup.Jsoup;
5import org.jsoup.nodes.Document;
6import org.jsoup.nodes.Element;
7import org.jsoup.select.Elements;
8
9import java.util.concurrent.ConcurrentLinkedQueue;
10import java.util.concurrent.CountDownLatch;
11
12public class ScraperThread extends Thread {
13 private String url;
14 private String destination;
15 private String departureDate;
16 private int numberOfPeople;
17 private ConcurrentLinkedQueue<Option> uniqueOptions;
18 private CountDownLatch latch;
19
20 public ScraperThread(String url, String destination, String departureDate, int numberOfPeople, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
21 this.url = url;
22 this.destination = destination;
23 this.departureDate = departureDate;
24 this.numberOfPeople = numberOfPeople;
25 this.uniqueOptions = optionsQueue;
26 this.latch = latch;
27 }
28
29 private void connectToWeb(String queryUrl) {
30 // Selenium
31 ChromeOptions options = new ChromeOptions();
32 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe"); // Path to Brave, remove for Chrome compatibility
33 options.addArguments("--headless"); // Run in headless mode
34 options.addArguments("--disable-gpu");
35 options.addArguments("--window-size=1920,1080");
36 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"); // User-Agent
37
38 // chromeDriver
39 System.setProperty("webdriver.chrome.driver", "C:\\drivers\\chromedriver.exe");
40 System.setProperty("webdriver.http.factory", "jdk-http-client");
41 WebDriver driver = new ChromeDriver(options);
42 try {
43 // Navigate to URL
44 driver.get(queryUrl);
45 Thread.sleep(10000); // Sleep to fetch all data
46
47 // Get page source
48 String pageSource = driver.getPageSource();
49 System.out.println("Thread " + Thread.currentThread().getId() + " connected to " + queryUrl);
50
51 // Get only options
52 Document doc = Jsoup.parse(pageSource);
53 Element parentDiv;
54 Elements childDivs;
55 switch (url) {
56 case "https://www.fibula.com.mk/":
57 parentDiv = doc.selectFirst("div.flex.flex-col.gap-5");
58 if (parentDiv != null) {
59 childDivs = parentDiv.select("div");
60 for (Element div : childDivs) {
61 String data = div.html();
62 Option option = optionParser(data);
63 if (option != null) {
64 if (uniqueOptions.add(option)) {
65 System.out.println("Parsed Option: " + option);
66 }
67 }
68 }
69 } else {
70 System.out.println("Parent div not found");
71 }
72 break;
73 case "https://booking.escapetravel.mk/":
74 parentDiv = doc.selectFirst("div.container.pt-4.pt-md-6.scroll-into-view");
75 Element subParent;
76 System.out.println(parentDiv);
77 if(parentDiv != null) {
78 subParent = parentDiv.selectFirst("div.row");
79 }else{
80 System.out.println("Parent div not found");
81 break;
82 }
83
84 if (subParent != null) {
85 childDivs = subParent.select("div.col-md-3");
86
87 for (Element div : childDivs) {
88 String data = div.html();
89 Option option = optionParser(data);
90 if (option != null) {
91 if (uniqueOptions.add(option)) {
92 System.out.println("Parsed option: " + option);
93 }
94 }
95 }
96 }else {
97 System.out.println("subparent div not found");
98 }
99 break;
100 }
101 } catch (InterruptedException e) {
102 e.printStackTrace();
103 } finally {
104 driver.quit();
105 latch.countDown();
106 }
107 }
108
109 private Option optionParser(String data) {
110 Document doc = Jsoup.parse(data);
111 Option created = new Option();
112
113 switch (url) {
114 case "https://www.fibula.com.mk/":
115 created = parseFibula(doc);
116 break;
117 case "https://booking.escapetravel.mk/":
118 created = parseEscapeTravel(doc);
119 break;
120 default:
121 System.out.println("URL not recognized for parsing.");
122 break;
123 }
124
125 if (created.isEmpty()) {
126 return null;
127 }
128
129 return created;
130 }
131
132 private Option parseFibula(Document doc) {
133 Option created = new Option();
134
135 Element linkElement = doc.selectFirst("a[target='_blank']");
136 created.setLink(linkElement != null ? url + linkElement.attr("href") : null);
137
138 Element imgElement = doc.selectFirst("div.md\\:aspect-none img");
139 created.setImgSrc(imgElement != null ? imgElement.attr("src") : null);
140
141 Element hotelNameElement = doc.selectFirst("h5.text-md");
142 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
143
144 Element countryElement = doc.selectFirst("small.text-navy");
145 created.setCountry(countryElement != null ? countryElement.text() : null);
146
147 Element priceElement = doc.selectFirst("small.line-through");
148 String price = priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0";
149 created.setPrice(price);
150
151 return created;
152 }
153
154 private Option parseEscapeTravel(Document doc) {
155 Option created = new Option();
156
157 // Extract link
158 Element linkElement = doc.selectFirst("a[target='_blank']");
159 created.setLink(linkElement != null ? linkElement.attr("href") : null);
160
161 // Extract image source
162 Element imgElement = doc.selectFirst("img.card-img-top");
163 created.setImgSrc(imgElement != null ? imgElement.attr("src") : null);
164
165 // Extract hotel name
166 Element hotelNameElement = doc.selectFirst("h3.fw-bold.text-body.mb-2");
167 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
168
169 // Extract country/location
170 Element countryElement = doc.selectFirst("h5.fw-light.text-primary.mb-1");
171 created.setCountry(countryElement != null ? countryElement.text() : null);
172
173 // Extract price
174 Element priceElement = doc.selectFirst("h4.fw-light.text-success.mb-0");
175 String price = priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0";
176 created.setPrice(price);
177
178 return created;
179 }
180
181
182 @Override
183 public void run() {
184 System.out.println("Thread started for url: " + url);
185 StringBuilder builder = new StringBuilder();
186 builder.append(url);
187 String queryUrl;
188 switch (url) {
189 case "https://www.fibula.com.mk/":
190 builder.append("search?productType=2&"); // search for hotels
191 for (int i = 0; i < numberOfPeople; i++) { // add all passengers (default adults)
192 builder.append("passengers=1993-01-01&");
193 }
194 queryUrl = builder.toString();
195 System.out.println(queryUrl);
196 connectToWeb(queryUrl);
197 break;
198 case "https://booking.escapetravel.mk/":
199 builder.append("destinations?Category=&Search=&DateFrom=");
200 builder.append(departureDate);
201 builder.append("&Rooms=1&Adults=");
202 builder.append(numberOfPeople);
203 queryUrl = builder.toString();
204 System.out.println(queryUrl);
205 connectToWeb(queryUrl);
206 break;
207 default:
208 System.out.println("Not available for current url");
209 latch.countDown();
210 break;
211 }
212 }
213 }
Note: See TracBrowser for help on using the repository browser.