source: backend/GlobeGuru-backend/src/main/java/ScraperThread.java@ c164f8f

Last change on this file since c164f8f was c164f8f, checked in by Kristijan <kristijanzafirovski26@…>, 3 weeks ago

pred-finalna

  • Property mode set to 100644
File size: 14.0 KB
Line 
1import com.fasterxml.jackson.databind.JsonNode;
2import com.fasterxml.jackson.databind.ObjectMapper;
3import org.openqa.selenium.By;
4import org.openqa.selenium.WebDriver;
5import org.openqa.selenium.WebElement;
6import org.openqa.selenium.chrome.ChromeDriver;
7import org.openqa.selenium.chrome.ChromeOptions;
8import org.jsoup.Jsoup;
9import org.jsoup.nodes.Document;
10import org.jsoup.nodes.Element;
11import org.jsoup.select.Elements;
12import org.openqa.selenium.support.ui.ExpectedCondition;
13import org.openqa.selenium.support.ui.ExpectedConditions;
14import org.openqa.selenium.support.ui.WebDriverWait;
15
16import java.io.File;
17import java.io.IOException;
18import java.sql.Connection;
19import java.sql.DriverManager;
20import java.sql.PreparedStatement;
21import java.sql.SQLException;
22import java.text.ParseException;
23import java.text.SimpleDateFormat;
24import java.util.*;
25import java.util.concurrent.ConcurrentLinkedQueue;
26import java.util.concurrent.CountDownLatch;
27
28public class ScraperThread extends Thread {
29 private String url;
30 private ConcurrentLinkedQueue<Option> uniqueOptions;
31 private CountDownLatch latch;
32 private Set<Option> optionSet;
33
34 public ScraperThread(String url, ConcurrentLinkedQueue<Option> optionsQueue, CountDownLatch latch) {
35 this.url = url;
36 this.uniqueOptions = optionsQueue;
37 this.latch = latch;
38 this.optionSet = new HashSet<>();
39 }
40
41 private WebDriver driver;
42
43 private void initializeWebDriver() {
44 System.setProperty("webdriver.chrome.driver", "C:\\chromedriver-win64\\chromedriver.exe");
45 ChromeOptions options = new ChromeOptions();
46 options.setBinary("C:\\Program Files\\BraveSoftware\\Brave-Browser\\Application\\brave.exe");
47 options.addArguments("--headless");
48 options.addArguments("--disable-gpu");
49 options.addArguments("--remote-allow-origins=*");
50 options.addArguments("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36");
51 driver = new ChromeDriver(options);
52 }
53
54 private void closeWebDriver() {
55 if (driver != null) {
56 driver.quit();
57 }
58 }
59
60 private void connectToWeb(String queryUrl) {
61 driver.get(queryUrl);
62
63 WebDriverWait wait = new WebDriverWait(driver, 40); // 40s timeout buffer
64 switch (url) {
65 case "https://booking.escapetravel.mk/":
66 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("#hotels-container")));
67 try { Thread.sleep(5000);} catch (InterruptedException e) { e.printStackTrace(); }
68 break;
69 case "https://magelantravel.mk/":
70 wait.until(ExpectedConditions.presenceOfElementLocated(By.cssSelector("div.sodrzina")));
71 break;
72 default:
73 System.out.println("URL not recognized for waiting condition.");
74 // Handle other URLs if needed
75 }
76
77 String pageSource = driver.getPageSource();
78 System.out.println("Connected to " + queryUrl);
79 Document doc = Jsoup.parse(pageSource);
80 Element parentDiv;
81 Elements childDivs;
82
83 switch (url) {
84 case "https://www.fibula.com.mk/":
85 parentDiv = doc.selectFirst("div.flex.flex-col.gap-5");
86 if (parentDiv != null) {
87 childDivs = parentDiv.select("div");
88 for (Element div : childDivs) {
89 String data = div.html();
90 Option option = optionParser(data);
91 if (option != null && optionSet.add(option)) {
92 uniqueOptions.add(option);
93 System.out.println("Parsed " + option);
94 }
95 }
96 } else {
97 System.out.println("Parent div not found");
98 }
99 break;
100 case "https://booking.escapetravel.mk/":
101 parentDiv = doc.selectFirst("#hotels-container");
102 if (parentDiv != null) {
103 childDivs = parentDiv.select("a.hotel-item");
104 for (Element div : childDivs) {
105 String data = div.outerHtml();
106 Option option = optionParser(data);
107 if (option != null) {
108 Option existingOption = DatabaseUtil.findOption(option);
109 if (existingOption != null) {
110 if (existingOption.equals(option) || existingOption.getPrice() != option.getPrice()) {
111 option.setPriceChanged(true);
112 option.setNewPrice(option.getPrice());
113 }
114 DatabaseUtil.updateOptionInDatabase(option);
115 } else if (optionSet.add(option)) {
116 uniqueOptions.add(option);
117 DatabaseUtil.saveOptionToDatabase(option);
118 System.out.println("Parsed " + option);
119 }
120 }
121 }
122 } else {
123 System.out.println("Parent div not found");
124 }
125 break;
126 case "https://magelantravel.mk/":
127 parentDiv = doc.selectFirst("div.sodrzina");
128 if (parentDiv != null) {
129 childDivs = parentDiv.select("div.destinacija");
130 System.out.println(childDivs.size());
131 childDivs.removeIf(div -> div.attr("style").contains("display:none") || div.attr("style").contains("display: none"));
132 System.out.println("Filtered childDivs size: " + childDivs.size());
133 for (Element div : childDivs) {
134 String data = div.outerHtml();
135 Option newOption = optionParser(data);
136 if (newOption != null) {
137 Option existingOption = DatabaseUtil.findOption(newOption);
138 if (existingOption != null) {
139 if (existingOption.equals(newOption) || existingOption.getPrice() != newOption.getPrice()) {
140 newOption.setPriceChanged(true);
141 newOption.setNewPrice(newOption.getPrice());
142 }
143 DatabaseUtil.updateOptionInDatabase(newOption);
144 } else if (optionSet.add(newOption)) {
145 uniqueOptions.add(newOption);
146 DatabaseUtil.saveOptionToDatabase(newOption);
147 System.out.println("Parsed " + newOption);
148 }
149 }
150 }
151
152 } else {
153 System.out.println("Parent div not found");
154 }
155 break;
156 default:
157 System.out.println("URL not recognized for parsing.");
158 }
159 }
160
161
162
163 private Option optionParser(String data) {
164 Document doc = Jsoup.parse(data);
165 Option created = new Option();
166 switch (url) {
167 case "https://magelantravel.mk/":
168 created = parseMagelan(doc);
169 break;
170 case "https://booking.escapetravel.mk/":
171 created = parseEscapeTravel(doc);
172 break;
173 default:
174 System.out.println("URL not recognized for parsing.");
175 break;
176 }
177 if (created.isEmpty()) {
178 System.out.println(created);
179 return null;
180 }
181 return created;
182 }
183
184 private Option parseMagelan(Document doc) {
185 Option created = new Option();
186 Element linkElement = doc.selectFirst("div.ponuda-sredina");
187 int id = Integer.parseInt(linkElement.attr("data-id"));
188 int turop = Integer.parseInt(linkElement.attr("data-turop"));
189 created.setLink("https://magelantravel.mk/ponudi.php?type=1&objektid=" + id + "&turop=" + turop);
190 Element imgElement = doc.selectFirst("div.imgLiquidFill.imgLiquid.ponuda-img.zoom");
191 created.setImgSrc(imgElement != null ? url + imgElement.attr("style")
192 .split("url\\(")[1].split("\\)")[0].replace("'", "").replace("./", "/") : null);
193 Element hotelNameElement = doc.selectFirst("div.ponuda-objekt");
194 created.setHotelName(hotelNameElement != null ? hotelNameElement.text() : null);
195 Element countryElement = doc.selectFirst("l.ponuda-lokacija");
196 created.setCountry(countryElement != null ? countryElement.text() : null);
197 Element priceElement = doc.selectFirst("div.ponuda-cena");
198 Element dateElement = doc.selectFirst("l.ponuda-opis.termin");
199 created.setDateRange(dateElement != null ? dateElement.text() : null);
200 float price = Float.parseFloat(priceElement != null ? priceElement.text().replaceAll("[^\\d.]", "") : "0");
201 created.setPrice(price);
202 return created;
203 }
204 private Option parseEscapeTravel(Document doc) {
205 Option created = new Option();
206 Element card = doc.selectFirst("a.hotel-item");
207 String link = card.attr("href");
208 created.setLink(link);
209 created.setImgSrc(card.attr("data-picture"));
210 created.setHotelName(card.attr("data-title"));
211 Element countryP = doc.selectFirst("p.text-info");
212 created.setCountry(countryP != null ? countryP.text() : null);
213 Element priceElem = doc.selectFirst("span.hotel-price");
214 String priceText = priceElem.text();
215 float price = 0;
216 if(!priceText.isEmpty()) {
217 price = Float.parseFloat(priceText.replace("€", ""));
218 }
219 created.setPrice(price);
220 String[] queryParams = link.split("[?&]");
221 String startDateStr = null;
222 int nights = 0;
223 for (String param : queryParams) {
224 if (param.startsWith("Date=")) {
225 startDateStr = param.split("=")[1];
226 }
227 if (param.startsWith("Nights=")) {
228 nights = Integer.parseInt(param.split("=")[1]);
229 }
230 }
231 if (startDateStr != null && nights > 0)
232 {
233 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
234 try {
235 Date startDate = dateFormat.parse(startDateStr);
236
237 Calendar calendar = Calendar.getInstance();
238 calendar.setTime(startDate);
239 calendar.add(Calendar.DAY_OF_YEAR, nights);
240 Date endDate = calendar.getTime();
241 String dateRange = dateFormat.format(startDate) + " - " + dateFormat.format(endDate);
242 created.setDateRange(dateRange);
243 }catch (ParseException e){
244 e.printStackTrace();
245 }
246 }
247 return created;
248 }
249
250 @Override
251 public void run() {
252 System.out.println("Thread started for url: " + url);
253 initializeWebDriver();
254 if ("https://magelantravel.mk/".equals(url)) {
255 ObjectMapper mapper = new ObjectMapper();
256 try {
257 ClassLoader classLoader = getClass().getClassLoader();
258 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
259 JsonNode countries = root.get("countries");
260 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
261 Calendar calendar = Calendar.getInstance();
262 calendar.add(Calendar.DAY_OF_YEAR, 1);
263
264 for (int i = 0; i < 90; i++) { // next three months
265 String date = dateFormat.format(calendar.getTime());
266 for (JsonNode countryNode : countries) {
267 String country = countryNode.asText();
268 for (int nokevanja = 2; nokevanja <= 10; nokevanja++) {
269 String queryUrl = url + "/destinacii?ah_tip=1&iframe=&affiliate_code=&carter_id=0&carter_region=&carter_dataod=&carter_datado=&destinacija=" + country + "&oddatum=" + date + "&nokevanja=" + nokevanja + "&dodatum=&broj_vozrasni=2&broj_deca=0&spdete1=0&spdete2=0&spdete3=0&spdete4=0";
270 connectToWeb(queryUrl);
271 }
272 }
273 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
274 }
275
276 } catch (IOException e) {
277 e.printStackTrace();
278 }
279 } else if ("https://booking.escapetravel.mk/".equals(url)) {
280 ObjectMapper mapper = new ObjectMapper();
281 try {
282 ClassLoader classLoader = getClass().getClassLoader();
283 JsonNode root = mapper.readTree(new File(classLoader.getResource("CountriesList.json").getFile()));
284 JsonNode countries = root.get("countries"); // Assuming "destinations" key in JSON
285 SimpleDateFormat dateFormat = new SimpleDateFormat("dd.MM.yyyy");
286 Calendar calendar = Calendar.getInstance();
287 calendar.add(Calendar.DAY_OF_YEAR, 1);
288
289 for (int i = 0; i < 90; i++) { // next three months
290 String date = dateFormat.format(calendar.getTime());
291 for (JsonNode countryNode : countries) {
292 String country = countryNode.asText();
293 for(int nokevanja = 2; nokevanja <=10; nokevanja ++) {
294 String queryUrl = url + "/hotels?Search=" + country + "&Date=" + date + "&Nights=" + nokevanja + "&Rooms=1&Adults=2";
295 connectToWeb(queryUrl);
296 }
297 }
298 calendar.add(Calendar.DAY_OF_YEAR, 1); // next day
299 }
300 } catch (IOException e) {
301 e.printStackTrace();
302 }
303 } else {
304 // Handle other URLs
305 }
306 closeWebDriver();
307 latch.countDown();
308 }
309
310}
Note: See TracBrowser for help on using the repository browser.