Ignore:
Timestamp:
11/20/22 16:34:52 (2 years ago)
Author:
Marko <Marko@…>
Branches:
master
Parents:
ffd50db
Message:

Final features implemented

File:
1 edited

Legend:

Unmodified
Added
Removed
  • phonelux_scrappers/scrappers/ledikom_scrapper.py

    rffd50db r47f4eaf  
    11import json
     2import traceback
    23import unicodedata
    34from datetime import datetime
     
    1920is_validated = False
    2021
    21 # Ledikom phone offers that are already in database
    22 
    23 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/ledikom').text))
    24 
    25 database_offers = []
    26 
    27 for offer in offers:
    28     phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
    29                             offer['ram_memory'],
    30                             offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
    31                             offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
    32                             offer['image_url'],
    33                             offer['offer_url'], offer['last_updated'], offer['is_validated'],
    34                             offer['offer_description'],
    35                             offer['offer_shop_code'])
    36     database_offers.append(phoneOffer)
    37 
    38 new_offers = []
    39 
    40 ledikom_phone_urls = [
    41     'https://ledikom.mk/c/416/uredi/apple/iphone?limit=96',
    42     'https://ledikom.mk/c/421/uredi/samsung/telefoni?limit=96',
    43     'https://ledikom.mk/c/424/mobilni-telefoni/xiaomi/telefoni?limit=96',
    44     'https://ledikom.mk/c/430/uredi/huawei/telefoni?limit=96',
    45     'https://ledikom.mk/c/441/uredi/oneplus/telefoni?limit=96',
    46     'https://ledikom.mk/c/413/uredi/google/telefoni?limit=96',
    47     'https://ledikom.mk/c/411/uredi/honor/telefoni?limit=96',
    48     'https://ledikom.mk/c/460/uredi/nokia/telefoni?limit=96',
    49     'https://ledikom.mk/c/461/uredi/asus/telefoni?limit=96',
    50     'https://ledikom.mk/c/488/proizvodi/oppo/telefoni?limit=96'
    51 ]
    52 
    53 for ledikom_url in ledikom_phone_urls:
    54 
    55     # selenium is used because of the dynamic content of the page
    56     driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    57     driver1.get(ledikom_url)
    58     ledikom_html = driver1.page_source
    59 
    60     # closing the driver so the safari instance can pair with another webdriver session
    61     driver1.close()
    62 
    63     soup1 = BeautifulSoup(ledikom_html, 'html.parser')
    64 
    65     phones = soup1.find('div', {'id': 'content'}) \
    66         .find('div', {'class': 'container'}).find('div', {'class': 'row'}).find('div', {'class': 'item-display'}) \
    67         .find_all('div', {'class': 'item-in-grid'})
    68 
    69     if len(phones) == 0:
    70         continue
    71 
    72     for phone in phones:
    73         offer_url = 'https://ledikom.mk' + phone.find('a').get('href')
    74         image_url = phone.find('a').find('img').get('src')
    75         temp_offer_name = phone.find('div', {'class': 'item-name'}).find('a').get_text().strip()
    76         offer_name = ' '.join(temp_offer_name.split())
    77         brand = offer_name.split(' ')[0]
    78         price = int(phone.find('span', {'class': 'price'}).get_text().replace('ден.', '')
    79                     .replace('ден', '')
    80                     .replace('.', '').strip())
    81 
     22# Call to read the configuration file and connect to the database
     23cinfo = config_read.get_databaseconfig("../postgresdb.config")
     24db_connection = psycopg2.connect(
     25    database=cinfo[0],
     26    host=cinfo[1],
     27    user=cinfo[2],
     28    password=cinfo[3]
     29)
     30cur = db_connection.cursor()
     31
     32try:
     33    # Ledikom phone offers that are already in the database
     34    offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/ledikom').text))
     35
     36    database_offers = []
     37
     38    for offer in offers:
     39        phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
     40                                offer['ram_memory'],
     41                                offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
     42                                offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
     43                                offer['image_url'],
     44                                offer['offer_url'], offer['last_updated'], offer['is_validated'],
     45                                offer['offer_description'],
     46                                offer['offer_shop_code'])
     47        database_offers.append(phoneOffer)
     48
     49    new_offers = []
     50
     51    ledikom_phone_urls = [
     52        'https://ledikom.mk/c/416/uredi/apple/iphone?limit=96',
     53        'https://ledikom.mk/c/421/uredi/samsung/telefoni?limit=96',
     54        'https://ledikom.mk/c/424/mobilni-telefoni/xiaomi/telefoni?limit=96',
     55        'https://ledikom.mk/c/430/uredi/huawei/telefoni?limit=96',
     56        'https://ledikom.mk/c/441/uredi/oneplus/telefoni?limit=96',
     57        'https://ledikom.mk/c/413/uredi/google/telefoni?limit=96',
     58        'https://ledikom.mk/c/411/uredi/honor/telefoni?limit=96',
     59        'https://ledikom.mk/c/460/uredi/nokia/telefoni?limit=96',
     60        'https://ledikom.mk/c/461/uredi/asus/telefoni?limit=96',
     61        'https://ledikom.mk/c/488/proizvodi/oppo/telefoni?limit=96'
     62    ]
     63
     64    for ledikom_url in ledikom_phone_urls:
     65
     66        # selenium is used because of the dynamic content of the page
    8267        driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
    83         driver1.get(offer_url)
    84         # getting offer page html
    85         offer_html = driver1.page_source
     68        driver1.get(ledikom_url)
     69        ledikom_html = driver1.page_source
     70
     71        # closing the driver so the safari instance can pair with another webdriver session
    8672        driver1.close()
    8773
    88         soup2 = BeautifulSoup(offer_html, 'html.parser')
    89 
    90         specifications = soup2.find('div', {'id': 'content'}).find('section', {'class': 'padding-section'}) \
    91             .find_all('div', {'class': 'container'})[1].find('div', {'class': 'col-md-7'}) \
    92             .find_all('div', {'class': 'row'})
    93 
    94         color = None
    95         rom_memory = None
    96         ram_memory = None
    97         back_camera = None
    98         operating_system = None
    99         chipset = None
    100         battery = None
    101         cpu = None
    102         front_camera = None
    103         offer_shop_code = None
    104         offer_description = None
    105 
    106         if len(specifications) != 0:
    107             colors_tags = specifications[0].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
    108             temp_colors = []
    109             for color_tag in colors_tags:
    110                 temp_colors.append(color_tag.get_text().strip())
    111             color = ','.join(temp_colors)
    112 
    113         if len(specifications) >= 2:
    114             temp_rom = specifications[1].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
    115             rom_list = []
    116             for rom in temp_rom:
    117                 rom_list.append(rom.get('title'))
    118             rom_memory = ','.join(rom_list)
    119 
    120         if len(specifications) >= 3:
    121             temp_ram = specifications[2].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
    122             ram_list = []
    123             for ram in temp_ram:
    124                 ram_list.append(ram.get('title'))
    125 
    126             ram_memory = ','.join(ram_list)
    127 
    128         if 'Xiaomi' in brand:
    129             temp = color
    130             color = rom_memory
    131             rom_memory = temp
    132 
    133             temp = ram_memory
    134             ram_memory = color
    135             color = temp
    136 
    137         new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
    138                                      color, front_camera, back_camera, chipset, battery, operating_system, cpu,
    139                                      image_url,
    140                                      offer_url, last_updated, is_validated, offer_description, offer_shop_code))
    141 
    142 for new_offer in new_offers:
    143     flag = False
    144     flag_price = False
    145     offer_id = None
     74        soup1 = BeautifulSoup(ledikom_html, 'html.parser')
     75
     76        phones = soup1.find('div', {'id': 'content'}) \
     77            .find('div', {'class': 'container'}).find('div', {'class': 'row'}).find('div', {'class': 'item-display'}) \
     78            .find_all('div', {'class': 'item-in-grid'})
     79
     80        if len(phones) == 0:
     81            continue
     82
     83        for phone in phones:
     84            offer_url = 'https://ledikom.mk' + phone.find('a').get('href')
     85            image_url = phone.find('a').find('img').get('src')
     86            temp_offer_name = phone.find('div', {'class': 'item-name'}).find('a').get_text().strip()
     87            offer_name = ' '.join(temp_offer_name.split())
     88            brand = offer_name.split(' ')[0]
     89            price = int(phone.find('span', {'class': 'price'}).get_text().replace('ден.', '')
     90                        .replace('ден', '')
     91                        .replace('.', '').strip())
     92
     93            driver1 = webdriver.Safari(executable_path='/usr/bin/safaridriver')
     94            driver1.get(offer_url)
     95            # getting offer page html
     96            offer_html = driver1.page_source
     97            driver1.close()
     98
     99            soup2 = BeautifulSoup(offer_html, 'html.parser')
     100
     101            specifications = soup2.find('div', {'id': 'content'}).find('section', {'class': 'padding-section'}) \
     102                .find_all('div', {'class': 'container'})[1].find('div', {'class': 'col-md-7'}) \
     103                .find_all('div', {'class': 'row'})
     104
     105            color = None
     106            rom_memory = None
     107            ram_memory = None
     108            back_camera = None
     109            operating_system = None
     110            chipset = None
     111            battery = None
     112            cpu = None
     113            front_camera = None
     114            offer_shop_code = None
     115            offer_description = None
     116
     117            if len(specifications) != 0:
     118                colors_tags = specifications[0].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
     119                temp_colors = []
     120                for color_tag in colors_tags:
     121                    temp_colors.append(color_tag.get_text().strip())
     122                color = ','.join(temp_colors)
     123
     124            if len(specifications) >= 2:
     125                temp_rom = specifications[1].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
     126                rom_list = []
     127                for rom in temp_rom:
     128                    rom_list.append(rom.get('title'))
     129                rom_memory = ','.join(rom_list)
     130
     131            if len(specifications) >= 3:
     132                temp_ram = specifications[2].find('div', {'class': 'col-md-12 col-xs-12'}).find_all('a')
     133                ram_list = []
     134                for ram in temp_ram:
     135                    ram_list.append(ram.get('title'))
     136
     137                ram_memory = ','.join(ram_list)
     138
     139            if 'Xiaomi' in brand:
     140                temp = color
     141                color = rom_memory
     142                rom_memory = temp
     143
     144                temp = ram_memory
     145                ram_memory = color
     146                color = temp
     147
     148            new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
     149                                         color, front_camera, back_camera, chipset, battery, operating_system, cpu,
     150                                         image_url,
     151                                         offer_url, last_updated, is_validated, offer_description, offer_shop_code))
     152
     153    for new_offer in new_offers:
     154        flag = False
     155        flag_price = False
     156        offer_id = None
     157
     158        for old_offer in database_offers:
     159
     160            if new_offer.offer_name == old_offer.offer_name:
     161                flag = True
     162                if new_offer.price != old_offer.price:
     163                    flag_price = True
     164                    offer_id = old_offer.offer_id
     165
     166        if flag:
     167            # print('ALREADY IN DATABASE')
     168            # print(new_offer)
     169            # if it's already in the database, check the price and update it if it has changed
     170            if flag_price:
     171                print('PRICE CHANGED!')  # CHANGE PRICE
     172                print('offer id: ' + str(offer_id))
     173                headers = {'Content-type': 'application/json'}
     174                requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
     175                             headers=headers)
     176        else:
     177            print('ADDED')  # ADD OFFER
     178            print(new_offer)
     179            headers = {'Content-type': 'application/json'}
     180            requests.post('http://localhost:8080/phoneoffer/addoffer',
     181                          headers=headers, data=json.dumps(new_offer.__dict__, default=str))
     182
     183    print('------------------------------------')
    146184
    147185    for old_offer in database_offers:
    148 
    149         if new_offer.offer_name == old_offer.offer_name:
    150             flag = True
    151             if new_offer.price != old_offer.price:
    152                 flag_price = True
    153                 offer_id = old_offer.offer_id
    154 
    155     if flag:
    156         # print('ALREADY IN DATABASE')
    157         # print(new_offer)
    158         # if it's already in database, check PRICE and if it's changed, change it !!!!!!
    159         if flag_price:
    160             print('PRICE CHANGED!')  # CHANGE PRICE
    161             print('offer id: ' + str(offer_id))
    162             headers = {'Content-type': 'application/json'}
    163             requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
    164                          headers=headers)
    165     else:
    166         print('ADDED')  # ADD OFFER
    167         print(new_offer)
    168         headers = {'Content-type': 'application/json'}
    169         requests.post('http://localhost:8080/phoneoffer/addoffer',
    170                       headers=headers, data=json.dumps(new_offer.__dict__, default=str))
    171 
    172 print('------------------------------------')
    173 
    174 for old_offer in database_offers:
    175     flag = False
    176     for new_offer in new_offers:
    177         if old_offer.offer_name == new_offer.offer_name:
    178             flag = True
    179 
    180     if not flag:
    181         print('OFFER DELETED')
    182         print(old_offer)
    183         # DELETE OFFER
    184         requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id))
     186        flag = False
     187        for new_offer in new_offers:
     188            if old_offer.offer_name == new_offer.offer_name:
     189                flag = True
     190
     191        if not flag:
     192            print('OFFER DELETED')
     193            print(old_offer)
     194            # DELETE OFFER
     195            requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id))
     196except Exception:
     197    traceback.print_exc()
     198    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
     199                    ' VALUES (%s, %s, %s);'
     200    insert_value = (offer_shop, last_updated, 'failed')
     201    cur.execute(insert_script, insert_value)
     202    db_connection.commit()
     203    cur.close()
     204    db_connection.close()
     205else:
     206    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
     207                    ' VALUES (%s, %s, %s);'
     208    insert_value = (offer_shop, last_updated, 'success')
     209    cur.execute(insert_script, insert_value)
     210    db_connection.commit()
     211    cur.close()
     212    db_connection.close()
     213
Note: See TracChangeset for help on using the changeset viewer.