Ignore:
Timestamp:
11/20/22 16:34:52 (20 months ago)
Author:
Marko <Marko@…>
Branches:
master
Parents:
ffd50db
Message:

Final features implemented

File:
1 edited

Legend:

Unmodified
Added
Removed
  • phonelux_scrappers/scrappers/mobelix_scrapper.py

    rffd50db r47f4eaf  
    33import unicodedata
    44from datetime import datetime
    5 
     5import traceback
    66import psycopg2
    77import config_read
     
    1919is_validated = False
    2020
    21 # Mobelix phone offers that are already in database
    22 
    23 offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/mobelix').text))
    24 
    25 database_offers = []
    26 
    27 for offer in offers:
    28     phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
    29                             offer['ram_memory'],
    30                             offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
    31                             offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
    32                             offer['image_url'],
    33                             offer['offer_url'], offer['last_updated'], offer['is_validated'],
    34                             offer['offer_description'],
    35                             offer['offer_shop_code'])
    36     database_offers.append(phoneOffer)
    37 
    38 new_offers = []
    39 
    40 for i in range(1, 17):
    41     mobelix_url = "https://mobelix.com.mk/mk/mobilni-telefoni?page=" + str(i)
    42 
    43     response1 = requests.get(mobelix_url)
    44     soup1 = BeautifulSoup(response1.content, 'html.parser')
    45 
    46     phones = soup1.find_all('div', {'class': 'p-2 rounded text-dark bg-white d-flex w-100'})
    47 
    48     for phone in phones:
    49         offer_url = phone.find('a').get('href')
    50         image_url = phone.find_all('div', {'class': 'col-12'})[0].find('img').get('src')
    51         brand = phone.find_all('div', {'class': 'col-12'})[1].find('h5', {'class': 'mb-0'}).get_text().strip()
    52         offer_name = phone.find_all('div', {'class': 'col-12'})[1] \
    53             .find('h3', {'class': 'h5 font-weight-normal'}).get_text().strip()
    54 
    55         if 'Watch' in offer_name or 'Pad' in offer_name or 'Tab' in offer_name or 'Pods' in offer_name or 'Buds' in offer_name or 'HomePod' in offer_name:
    56             continue
    57 
    58         if brand not in offer_name:
    59             offer_name = brand + " " + offer_name
    60 
    61         temp_prices = phone.find_all('div', {'class': 'col-12'})[1] \
    62             .find('p', {'class': 'h5 price'}).get_text(separator='/').strip()
    63 
    64         if len(temp_prices.split('/')) > 1:
    65             price = int(float(temp_prices.split('/')[1].replace(',', '').replace('ден', '').strip()))
     21# Call to read the configuration file and connect to database
     22cinfo = config_read.get_databaseconfig("../postgresdb.config")
     23db_connection = psycopg2.connect(
     24    database=cinfo[0],
     25    host=cinfo[1],
     26    user=cinfo[2],
     27    password=cinfo[3]
     28)
     29cur = db_connection.cursor()
     30
     31try:
     32    # Mobelix phone offers that are already in database
     33    offers = json.loads(unicodedata.normalize('NFKD', requests.get('http://localhost:8080/phoneoffer/shop/mobelix').text))
     34
     35    database_offers = []
     36
     37    for offer in offers:
     38        phoneOffer = PhoneOffer(offer['id'], offer['offer_shop'], offer['offer_name'], offer['price'],
     39                                    offer['ram_memory'],
     40                                    offer['rom_memory'], offer['color'], offer['front_camera'], offer['back_camera'],
     41                                    offer['chipset'], offer['battery'], offer['operating_system'], offer['cpu'],
     42                                    offer['image_url'],
     43                                    offer['offer_url'], offer['last_updated'], offer['is_validated'],
     44                                    offer['offer_description'],
     45                                    offer['offer_shop_code'])
     46        database_offers.append(phoneOffer)
     47
     48    new_offers = []
     49
     50    for i in range(1, 17):
     51        mobelix_url = "https://mobelix.com.mk/mk/mobilni-telefoni?page=" + str(i)
     52
     53        response1 = requests.get(mobelix_url)
     54        soup1 = BeautifulSoup(response1.content, 'html.parser')
     55
     56        phones = soup1.find_all('div', {'class': 'p-2 rounded text-dark bg-white d-flex w-100'})
     57
     58        for phone in phones:
     59            offer_url = phone.find('a').get('href')
     60            image_url = phone.find_all('div', {'class': 'col-12'})[0].find('img').get('src')
     61            brand = phone.find_all('div', {'class': 'col-12'})[1].find('h5', {'class': 'mb-0'}).get_text().strip()
     62            offer_name = phone.find_all('div', {'class': 'col-12'})[1] \
     63                .find('h3', {'class': 'h5 font-weight-normal'}).get_text().strip()
     64
     65            if 'Watch' in offer_name or 'Pad' in offer_name or 'Tab' in offer_name or 'Pods' in offer_name or 'Buds' in offer_name or 'HomePod' in offer_name:
     66                continue
     67
     68            if brand not in offer_name:
     69                offer_name = brand + " " + offer_name
     70
     71            temp_prices = phone.find_all('div', {'class': 'col-12'})[1] \
     72                .find('p', {'class': 'h5 price'}).get_text(separator='/').strip()
     73
     74            if len(temp_prices.split('/')) > 1:
     75                price = int(float(temp_prices.split('/')[1].replace(',', '').replace('ден', '').strip()))
     76            else:
     77                price = int(float(temp_prices.split('/')[0].replace(',', '').replace('ден', '').strip()))
     78
     79            response2 = requests.get(offer_url)
     80            soup2 = BeautifulSoup(response2.content, 'html.parser')
     81
     82            colors_divs = soup2.find('div', {'class': 'color-wrapper mt-2 mb-1'}) \
     83                .find_all('div', {'class': 'color-box d-inline-block'})  # color div tags
     84
     85            temp_colors = []
     86            for div in colors_divs:
     87                temp_colors.append(div.get('title'))
     88
     89            color = ",".join(temp_colors)  # available colors for offer
     90
     91            tables = soup2.find('div', {'class': 'mobelix-specs table-white bordered-table'}).find_all('table')
     92
     93            operating_system = None
     94            chipset = None
     95            battery = None
     96            ram_memory = None
     97            rom_memory = None
     98            front_camera = ''
     99            back_camera = ''
     100            cpu = None
     101            offer_shop_code = None
     102            offer_description = None
     103
     104            for table in tables:
     105                for cell in table.find_all('td'):
     106                    if cell.get('data-spec') is None:
     107                        continue
     108
     109                    if cell.get('data-spec') == 'os':
     110                        operating_system = unicodedata.normalize('NFKD', cell.get_text().strip())
     111
     112                    if cell.get('data-spec') == 'chipset':
     113                        chipset = unicodedata.normalize('NFKD', cell.get_text().strip())
     114
     115                    if cell.get('data-spec') == 'cpu':
     116                        cpu = unicodedata.normalize('NFKD', cell.get_text().strip())
     117
     118                    if cell.get('data-spec') == 'internalmemory':
     119                        temp_rom = []
     120                        temp_ram = []
     121                        temp_internalmemory = unicodedata.normalize('NFKD', cell.get_text().strip())
     122                        for internalmemory in temp_internalmemory.split(','):
     123                            temp_rom.append(internalmemory.strip().split(' ')[0])
     124                            if len(internalmemory.strip().split(' ')) > 1:
     125                                temp_ram.append(internalmemory.strip().split(' ')[1])
     126                        rom_memory = ','.join(temp_rom)
     127                        ram_memory = ','.join(temp_ram)
     128
     129                    if cell.get('data-spec') == 'cam1modules' or cell.get('data-spec') == 'cam1features' or cell.get(
     130                                'data-spec') == 'cam1video':
     131                        back_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n'
     132
     133                    if cell.get('data-spec') == 'cam2modules' or cell.get('data-spec') == 'cam2features' or cell.get(
     134                                'data-spec') == 'cam2video':
     135                        front_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n'
     136
     137                    if cell.get('data-spec') == 'batdescription1':
     138                        battery = unicodedata.normalize('NFKD', cell.get_text().strip())
     139
     140            if front_camera == 'No':
     141                front_camera = None
     142
     143            if back_camera == 'No':
     144                back_camera = None
     145
     146            new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
     147                                            color, front_camera, back_camera, chipset, battery, operating_system, cpu,
     148                                            image_url,
     149                                            offer_url, last_updated, is_validated, offer_description, offer_shop_code))
     150
     151
     152    for new_offer in new_offers:
     153        flag = False
     154        flag_price = False
     155        offer_id = None
     156
     157        for old_offer in database_offers:
     158
     159            if new_offer.offer_name == old_offer.offer_name:
     160                flag = True
     161                if new_offer.price != old_offer.price:
     162                    flag_price = True
     163                    offer_id = old_offer.offer_id
     164
     165        if flag:
     166            # print('ALREADY IN DATABASE')
     167            # print(new_offer)
     168            # if it's already in database, check PRICE and if it's changed, change it !!!!!!
     169            if flag_price:
     170                print('PRICE CHANGED!')  # CHANGE PRICE
     171                print('offer id: ' + str(offer_id))
     172                headers = {'Content-type': 'application/json'}
     173                requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
     174                        headers=headers)
    66175        else:
    67             price = int(float(temp_prices.split('/')[0].replace(',', '').replace('ден', '').strip()))
    68 
    69         response2 = requests.get(offer_url)
    70         soup2 = BeautifulSoup(response2.content, 'html.parser')
    71 
    72         colors_divs = soup2.find('div', {'class': 'color-wrapper mt-2 mb-1'}) \
    73             .find_all('div', {'class': 'color-box d-inline-block'})  # color div tags
    74 
    75         temp_colors = []
    76         for div in colors_divs:
    77             temp_colors.append(div.get('title'))
    78 
    79         color = ",".join(temp_colors)  # available colors for offer
    80 
    81         tables = soup2.find('div', {'class': 'mobelix-specs table-white bordered-table'}).find_all('table')
    82 
    83         operating_system = None
    84         chipset = None
    85         battery = None
    86         ram_memory = None
    87         rom_memory = None
    88         front_camera = ''
    89         back_camera = ''
    90         cpu = None
    91         offer_shop_code = None
    92         offer_description = None
    93 
    94         for table in tables:
    95             for cell in table.find_all('td'):
    96                 if cell.get('data-spec') is None:
    97                     continue
    98 
    99                 if cell.get('data-spec') == 'os':
    100                     operating_system = unicodedata.normalize('NFKD', cell.get_text().strip())
    101 
    102                 if cell.get('data-spec') == 'chipset':
    103                     chipset = unicodedata.normalize('NFKD', cell.get_text().strip())
    104 
    105                 if cell.get('data-spec') == 'cpu':
    106                     cpu = unicodedata.normalize('NFKD', cell.get_text().strip())
    107 
    108                 if cell.get('data-spec') == 'internalmemory':
    109                     temp_rom = []
    110                     temp_ram = []
    111                     temp_internalmemory = unicodedata.normalize('NFKD', cell.get_text().strip())
    112                     for internalmemory in temp_internalmemory.split(','):
    113                         temp_rom.append(internalmemory.strip().split(' ')[0])
    114                         if len(internalmemory.strip().split(' ')) > 1:
    115                             temp_ram.append(internalmemory.strip().split(' ')[1])
    116                     rom_memory = ','.join(temp_rom)
    117                     ram_memory = ','.join(temp_ram)
    118 
    119                 if cell.get('data-spec') == 'cam1modules' or cell.get('data-spec') == 'cam1features' or cell.get(
    120                         'data-spec') == 'cam1video':
    121                     back_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n'
    122 
    123                 if cell.get('data-spec') == 'cam2modules' or cell.get('data-spec') == 'cam2features' or cell.get(
    124                         'data-spec') == 'cam2video':
    125                     front_camera += unicodedata.normalize('NFKD', cell.get_text().strip()) + '\n'
    126 
    127                 if cell.get('data-spec') == 'batdescription1':
    128                     battery = unicodedata.normalize('NFKD', cell.get_text().strip())
    129 
    130         if front_camera == 'No':
    131             front_camera = None
    132 
    133         if back_camera == 'No':
    134             back_camera = None
    135 
    136         new_offers.append(PhoneOffer(offer_shop, offer_name, price, ram_memory, rom_memory,
    137                                      color, front_camera, back_camera, chipset, battery, operating_system, cpu,
    138                                      image_url,
    139                                      offer_url, last_updated, is_validated, offer_description, offer_shop_code))
    140 
    141 
    142 for new_offer in new_offers:
    143     flag = False
    144     flag_price = False
    145     offer_id = None
     176            print('ADDED')  # ADD OFFER
     177            print(new_offer)
     178            headers = {'Content-type': 'application/json'}
     179            requests.post('http://localhost:8080/phoneoffer/addoffer',
     180                        headers=headers, data=json.dumps(new_offer.__dict__, default=str))
     181
     182    print('------------------------------------')
    146183
    147184    for old_offer in database_offers:
    148 
    149         if new_offer.offer_name == old_offer.offer_name:
    150             flag = True
    151             if new_offer.price != old_offer.price:
    152                 flag_price = True
    153                 offer_id = old_offer.offer_id
    154 
    155     if flag:
    156         # print('ALREADY IN DATABASE')
    157         # print(new_offer)
    158         # if it's already in database, check PRICE and if it's changed, change it !!!!!!
    159         if flag_price:
    160             print('PRICE CHANGED!')  # CHANGE PRICE
    161             print('offer id: ' + str(offer_id))
    162             headers = {'Content-type': 'application/json'}
    163             requests.put('http://localhost:8080/phoneoffer/' + str(offer_id) + '/changeprice/' + str(new_offer.price),
    164                          headers=headers)
    165     else:
    166         print('ADDED')  # ADD OFFER
    167         print(new_offer)
    168         headers = {'Content-type': 'application/json'}
    169         requests.post('http://localhost:8080/phoneoffer/addoffer',
    170                       headers=headers, data=json.dumps(new_offer.__dict__, default=str))
    171 
    172 print('------------------------------------')
    173 
    174 for old_offer in database_offers:
    175     flag = False
    176     for new_offer in new_offers:
    177         if old_offer.offer_name == new_offer.offer_name:
    178             flag = True
    179 
    180     if not flag:
    181         print('OFFER DELETED')
    182         print(old_offer)
    183         # DELETE OFFER
    184         requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id))
     185        flag = False
     186        for new_offer in new_offers:
     187            if old_offer.offer_name == new_offer.offer_name:
     188                flag = True
     189
     190        if not flag:
     191            print('OFFER DELETED')
     192            print(old_offer)
     193            # DELETE OFFER
     194            requests.delete('http://localhost:8080/phoneoffer/deleteoffer/' + str(old_offer.offer_id))
     195except Exception:
     196    traceback.print_exc()
     197    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
     198                    ' VALUES (%s, %s, %s);'
     199    insert_value = (offer_shop, last_updated, 'failed')
     200    cur.execute(insert_script, insert_value)
     201    db_connection.commit()
     202    cur.close()
     203    db_connection.close()
     204else:
     205    insert_script = 'INSERT INTO scrapper_info (store, recieved_at, status)' \
     206                    ' VALUES (%s, %s, %s);'
     207    insert_value = (offer_shop, last_updated, 'success')
     208    cur.execute(insert_script, insert_value)
     209    db_connection.commit()
     210    cur.close()
     211    db_connection.close()
     212
Note: See TracChangeset for help on using the changeset viewer.