import csv
import random
from datetime import datetime, timedelta

from mk_gragjani_lists import maski_iminja, zenski_iminja, preziminja, ulici

# Number of data rows the script writes.
N = 2_000_000
# Path of the CSV file that is (re)created on every run.
OUTPUT_FILE = "gragjanin.csv"

# City name -> two-digit region code that is embedded in each generated EMBG.
# Several cities share one code; key order is significant because it fixes
# the order of `cities` below.
region_codes = {
    "Skopje": "45",
    "Bitola": "41",
    "Resen": "41",
    "Kumanovo": "42",
    "Kriva Palanka": "42",
    "Kratovo": "42",
    "Ohrid": "43",
    "Struga": "43",
    "Debar": "43",
    "Prilep": "44",
    "Krushevo": "44",
    "Makedonski Brod": "44",
    "Strumica": "46",
    "Radovish": "46",
    "Gevgelija": "46",
    "Valandovo": "46",
    "Tetovo": "47",
    "Gostivar": "47",
    "Veles": "48",
    "Kavadarci": "48",
    "Negotino": "48",
    "Shtip": "49",
    "Kochani": "49",
    "Berovo": "49",
    "Delchevo": "49",
    "Vinica": "49",
    "Probishtip": "49",
}

# City names in mapping order, ready for random.choice().
cities = list(region_codes)

# Every EMBG handed out so far in this run; used to guarantee uniqueness.
used_embg = set()
def random_date(start_year=1950, end_year=2025):
    """Return a random day between 1 Jan *start_year* and 31 Dec *end_year*.

    Both endpoints are inclusive.  The result is a ``datetime`` at midnight,
    chosen by drawing a whole-day offset with ``random.randint``.
    """
    first_day = datetime(start_year, 1, 1)
    last_day = datetime(end_year, 12, 31)
    span_in_days = (last_day - first_day).days
    offset = random.randint(0, span_in_days)
    return first_day + timedelta(days=offset)
def random_phone():
    """Return a random phone number shaped like ``07X-XXX-XXX``.

    The third digit is 0-9; each of the two trailing groups is an integer
    from 100 to 999, so neither group ever starts with a zero.
    """
    operator_digit = random.randint(0, 9)
    middle_group = random.randint(100, 999)
    last_group = random.randint(100, 999)
    return f"07{operator_digit}-{middle_group}-{last_group}"
def random_address():
    """Return a random street address string.

    A street name is picked from ``ulici`` and combined with a house number
    between 1 and 200 (inclusive).
    """
    street = random.choice(ulici)
    house_number = random.randint(1, 200)
    return f"ул. {street} бр. {house_number}"
def calculate_control_digit(first_12_digits):
    """Compute the 13th (check) digit for the first twelve EMBG digits.

    Each digit is multiplied by its weight (7, 6, 5, 4, 3, 2 repeated twice),
    the products are summed, and the check value is ``11 - (sum % 11)``.

    Args:
        first_12_digits: Sequence of exactly twelve digits (a ``str`` of
            digit characters, or any sequence whose items ``int()`` accepts).

    Returns:
        The check digit as an ``int`` in 0-9, or ``None`` when the check
        value comes out as 10, i.e. the twelve digits cannot be completed
        into a number with a single-digit check.

    Raises:
        ValueError: If the input does not contain exactly twelve items, or
            an item cannot be converted with ``int()``.
    """
    # zip() would silently drop surplus digits or weights, producing a
    # plausible but wrong check digit, so reject bad lengths up front.
    if len(first_12_digits) != 12:
        raise ValueError(
            f"expected exactly 12 digits, got {len(first_12_digits)}"
        )

    weights = (7, 6, 5, 4, 3, 2, 7, 6, 5, 4, 3, 2)
    total = sum(int(digit) * weight for digit, weight in zip(first_12_digits, weights))
    control = 11 - total % 11

    if control == 10:
        # No single digit can represent this value; the caller must pick
        # different input digits.
        return None
    if control == 11:
        # Weighted sum is an exact multiple of 11.
        return 0
    return control
def generate_unique_embg(date_of_birth, city, gender):
    """Generate an EMBG that has not been issued before in this run.

    The 13-character result is built as ``DDMMYYY`` (day, month, last three
    digits of the year) + the city's two-digit region code + a three-digit
    serial + the check digit from ``calculate_control_digit``.  Serials
    000-499 are used when ``gender == "M"``; any other value uses 500-999.

    Args:
        date_of_birth: Object providing ``strftime`` (e.g. ``datetime``).
        city: A key of ``region_codes``.
        gender: ``"M"`` selects the lower serial range, anything else the
            upper one.

    Returns:
        The new EMBG string.  It is also added to ``used_embg``.

    Raises:
        KeyError: If *city* is not present in ``region_codes``.
        RuntimeError: If every serial for this date/region/gender is already
            taken or has no valid check digit (the original looped forever
            in that situation).
    """
    date_part = date_of_birth.strftime("%d%m") + date_of_birth.strftime("%Y")[1:]
    prefix = date_part + region_codes[city]
    low, high = (0, 499) if gender == "M" else (500, 999)

    def claim(serial):
        """Return and record the EMBG for *serial*, or None if unusable."""
        first_12 = f"{prefix}{serial:03d}"
        control = calculate_control_digit(first_12)
        if control is None:
            return None
        embg = first_12 + str(control)
        if embg in used_embg:
            return None
        used_embg.add(embg)
        return embg

    # Fast path: while the serial space is sparsely used, a few random draws
    # almost always succeed.
    for _ in range(32):
        embg = claim(random.randint(low, high))
        if embg is not None:
            return embg

    # Slow path: the space is crowded, so walk every serial once in random
    # order.  This guarantees termination.
    serials = list(range(low, high + 1))
    random.shuffle(serials)
    for serial in serials:
        embg = claim(serial)
        if embg is not None:
            return embg

    raise RuntimeError(
        f"no unused EMBG left for prefix {prefix} and serial range "
        f"{low:03d}-{high:03d}"
    )
# Stream the generated rows straight to OUTPUT_FILE, one citizen per row.
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8") as out_file:
    csv_out = csv.writer(out_file)

    # Header row; column order must match the data rows below.
    csv_out.writerow(
        [
            "embg",
            "ime",
            "prezime",
            "adresa",
            "grad",
            "telefonski_broj",
            "datum_ragjanje",
            "pol",
        ]
    )

    for row_number in range(1, N + 1):
        birth_date = random_date()
        city = random.choice(cities)
        gender = random.choice(["M", "F"])

        # The first name comes from the list matching the drawn gender.
        if gender == "M":
            first_name = random.choice(maski_iminja)
        else:
            first_name = random.choice(zenski_iminja)
        embg = generate_unique_embg(birth_date, city, gender)

        # Remaining random fields are drawn in column order.
        surname = random.choice(preziminja)
        address = random_address()
        phone = random_phone()

        csv_out.writerow(
            [
                embg,
                first_name,
                surname,
                address,
                city,
                phone,
                birth_date.strftime("%Y-%m-%d"),
                gender,
            ]
        )

        # Progress report every 100,000 rows.
        if not row_number % 100_000:
            print(f"{row_number} rows generated...")

print("CSV file generated:", OUTPUT_FILE)
print("Total unique EMBG:", len(used_embg))