-rw-r--r-- | .gitignore         |   7
-rw-r--r-- | README.md          |  42
-rw-r--r-- | pyproject.toml     |  13
-rw-r--r-- | scrap/__init__.py  |   0
-rw-r--r-- | scrap/fincaraiz.py | 153
-rw-r--r-- | scrap/scrap.py     |  36
6 files changed, 251 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f099a5a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+dist/
+*.pyc
+*.egg-info
+build
+tags
+test.py
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..14858e6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# Scrap
+
+A tool to fetch housing information using the Finca Raíz
+(fincaraiz.com.co) API
+
+## CLI
+```shell
+Usage:
+    scrap [options] <city>
+
+Options:
+    -h --help      Show this message
+    -v --verbose   Be more verbose
+    -o FILE        Redirect standard output to FILE.
+    -p INT         Number of pages to get [default: 1]
+
+Examples:
+    $ scrap Medellín -p 20
+```
+
+## Library
+```python
+get_json(city: str, pages: int = 1) -> list:
+    """Get JSON data from fincaraiz.com.co
+
+    Parameters
+    ----------
+    city : str
+        City to search for houses
+    pages : int
+        Pages to download, each page has 100 offers.
+
+    Returns
+    -------
+    list[dict]:
+        JSON records of the matching offers
+
+    """
+
+>>> from scrap.fincaraiz import get_json
+>>> data = get_json('Bogota, d.c')
+```
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..f101373
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,13 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "scrap"
+version = "0.0.1"
+dependencies = [
+    "docopt",
+]
+
+[project.scripts]
+scrap = "scrap.scrap:main"
diff --git a/scrap/__init__.py b/scrap/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scrap/__init__.py
diff --git a/scrap/fincaraiz.py b/scrap/fincaraiz.py
new file mode 100644
index 0000000..e8adcf0
--- /dev/null
+++ b/scrap/fincaraiz.py
@@ -0,0 +1,153 @@
+import re
+import requests
+
+HEADERS = {
+    #"host": "api.fincaraiz.com.co",
+    "authority": "api.fincaraiz.com.co",
+    "origin": "https://fincaraiz.com.co",
+    "referer": "https://fincaraiz.com.co",
+    #"content-Type": "application/json",
+    "accept": "*/*"
+    }
+
+
+JSON_DATA = {
+    "filter": {
+        "offer": {
+            "slug": [
+                "sell"
+            ]
+        },
+    },
+    "fields": {
+        "exclude": [],
+        "facets": [],
+        "include": [
+            "area",
+            "baths.id",
+            "baths.name",
+            "baths.slug",
+            "client.client_type",
+            "client.company_name",
+            "client.first_name",
+            "client.fr_client_id",
+            "client.last_name",
+            "client.logo.full_size",
+            "garages.name",
+            "is_new",
+            "locations.cities.fr_place_id",
+            "locations.cities.name",
+            "locations.cities.slug",
+            "locations.countries.fr_place_id",
+            "locations.countries.name",
+            "locations.countries.slug",
+            "locations.groups.name",
+            "locations.groups.slug",
+            "locations.groups.subgroups.name",
+            "locations.groups.subgroups.slug",
+            "locations.neighbourhoods.fr_place_id",
+            "locations.neighbourhoods.name",
+            "locations.neighbourhoods.slug",
+            "locations.states.fr_place_id",
+            "locations.states.name",
+            "locations.states.slug",
+            "locations.location_point",
+            "max_area",
+            "max_price",
+            "media.photos.list.image.full_size",
+            "media.photos.list.is_main",
+            "media.videos.list.is_main",
+            "media.videos.list.video",
+            "media.logo.full_size",
+            "min_area",
+            "min_price",
+            "offer.name",
+            "price",
+            "products.configuration.tag_id",
+            "products.configuration.tag_name",
+            "products.label",
+            "products.name",
+            "products.slug",
+            "property_id",
+            "property_type.name",
+            "fr_property_id",
+            "fr_parent_property_id",
+            "rooms.id",
+            "rooms.name",
+            "rooms.slug",
+            "stratum.name",
+            "title"
+        ],
+        "limit": 100,
+        "offset": 0,
+        "ordering": [],
+        "platform": 40,
+        "with_algorithm": True
+    }
+}
+
+
+def get_json(city: str,
+             pages: int = 1) -> list:
+    """Get JSON data from fincaraiz.com.co
+
+    Parameters
+    ----------
+    city : str
+        City to search for houses
+    pages : int
+        Pages to download, each page has 100 offers.
+
+    Returns
+    -------
+    list[dict]:
+        JSON records of the matching offers
+
+    Examples
+    --------
+    >>> data = get_json('Bogota, d.c')
+    """
+
+    json_data = JSON_DATA.copy()
+
+    def normalize(x: str) -> str:
+        normalized = re.sub("[áä]", "a", x.lower())
+        normalized = re.sub("[éë]", "e", normalized)
+        normalized = re.sub("[íï]", "i", normalized)
+        normalized = re.sub("[óö]", "o", normalized)
+        return re.sub("[úü]", "u", normalized)
+
+    ncity = normalize(city)
+    def town_exist(town: str) -> bool:
+        return normalize(town) == ncity
+
+    def city_code(x: dict) -> str:
+        departamento = int(x["c_digo_dane_del_departamento"])
+        municipio = int(x["c_digo_dane_del_municipio"].split(".")[1])
+        return "city-colombia-%02d-%03d" % (departamento, municipio)
+
+    location = [city_code(x)
+                for x in requests.get("https://www.datos.gov.co/resource/xdk5-pm3f.json").json()
+                if town_exist(x['municipio'])]
+
+    if location:
+        json_data["filter"]["locations"] = {"cities": {"slug": location } }
+    else:
+        json_data["filter"]["location_path"] = city
+
+    json_output = []
+    for i in range(pages):
+        json_data["fields"]["offset"] = i * json_data["fields"]["limit"]
+        response = requests.post("https://api.fincaraiz.com.co/document/api/1.0/listing/search",
+                                 headers = HEADERS,
+                                 json = json_data)
+
+        json_output += response.json()["hits"]["hits"]
+        if not len(response.json()["hits"]["hits"]): break
+
+    out = []
+    for offer in json_output:
+        out_ids = [i["_id"] for i in out]
+        if offer["_id"] not in out_ids: out.append(offer)
+
+    return out
diff --git a/scrap/scrap.py b/scrap/scrap.py
new file mode 100644
index 0000000..50ef3d4
--- /dev/null
+++ b/scrap/scrap.py
@@ -0,0 +1,36 @@
+"""Scrap
+Usage:
+    scrap [options] <city>
+
+Options:
+    -h --help      Show this message
+    -v --verbose   Be more verbose
+    -o FILE        Redirect standard output to FILE.
+    -p INT         Number of pages to get [default: 1]
+
+Examples:
+    $ scrap Medellín -p 20
+"""
+
+import json
+from docopt import docopt
+from sys import stdout
+
+from .fincaraiz import get_json
+
+
+def main():
+    if not __doc__: return 1
+
+    args = docopt(__doc__)
+    city: str = args["<city>"]
+    options = {"pages": int(args["-p"])}
+
+    json_data = get_json(city, **options)
+
+    if args["-o"]:
+        with open(args["-o"], "w") as json_file:
+            json.dump(json_data, json_file)
+    else:
+        json.dump(json_data, stdout)
+    return 0
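
For context, a minimal usage sketch of the library this commit adds, assuming the package is installed from this tree (e.g. `pip install .`) and that `requests` is available (it is imported by `scrap/fincaraiz.py` but not declared in `pyproject.toml`); the city, page count, and output filename are arbitrary examples, not part of the commit.

```python
# Usage sketch: fetch sale offers for one city and save them as JSON.
import json

from scrap.fincaraiz import get_json

# Each page holds up to 100 offers; get_json deduplicates results by "_id".
offers = get_json("Medellín", pages=2)
print(f"downloaded {len(offers)} unique offers")

# Persist the raw listing records for later processing.
with open("medellin.json", "w", encoding="utf-8") as fp:
    json.dump(offers, fp, ensure_ascii=False, indent=2)
```

The CLI entry point declared in `pyproject.toml` covers the same flow: `scrap Medellín -p 2 -o medellin.json`.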