 .gitignore         |   7 +
 README.md          |  42 +
 pyproject.toml     |  13 +
 scrap/__init__.py  |   0
 scrap/fincaraiz.py | 153 +
 scrap/scrap.py     |  36 +
 6 files changed, 251 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..f099a5a
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,7 @@
+__pycache__/
+dist/
+*.pyc
+*.egg-info
+build
+tags
+test.py
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..14858e6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,42 @@
+# Scrap
+
+A tool for fetching housing listings from the Fincaraíz (fincaraiz.com.co)
+API.
+
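+## Installation
+
+The repository ships a standard `pyproject.toml` (setuptools backend), so any
+reasonably recent `pip` should be able to install it and put the `scrap`
+command on your PATH:
+
+```shell
+$ pip install .
+```
+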
+## CLI
+```shell
+Usage:
+ scrap [options] <city>
+
+Options:
+  -h --help     Show this message.
+  -v --verbose  Be more verbose.
+  -o FILE       Redirect standard output to FILE.
+  -p INT        Number of pages to get [default: 1]
+
+Examples:
+ $ scrap Medellín -p 20
+```
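+
+The `-o` and `-p` options from the listing above can be combined; for example,
+to fetch two pages and write them to a file (the file name is just
+illustrative):
+
+```shell
+$ scrap "Bogota, d.c" -p 2 -o bogota.json
+```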
+
+## Library
+```python
+get_json(city: str, pages: int = 1) -> list:
+ """Get json data from fincaraiz.com.co
+
+ Parameters
+ ----------
+ city : str
+ City to search houses
+ pages : int
+ Pages to download, each page has 100 offers.
+
+ Returns
+ -------
+ list[dict]:
+ json file with house results
+
+ """
+
+>>> from scrap.fincaraiz import get_json
+>>> data = get_json('Bogota, d.c')
+```
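+
+`get_json` returns a plain list of the raw hit dictionaries produced by the
+API (each one carries at least an `_id`), so the results can be saved or
+post-processed with the standard `json` module, for example:
+
+```python
+>>> import json
+>>> with open("bogota.json", "w", encoding="utf-8") as fp:
+...     json.dump(data, fp, ensure_ascii=False)
+```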
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..f101373
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,13 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "scrap"
+version = "0.0.1"
+dependencies = [
+ "docopt",
+]
+
+[project.scripts]
+scrap = "scrap.scrap:main"
diff --git a/scrap/__init__.py b/scrap/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/scrap/__init__.py
diff --git a/scrap/fincaraiz.py b/scrap/fincaraiz.py
new file mode 100644
index 0000000..e8adcf0
--- /dev/null
+++ b/scrap/fincaraiz.py
@@ -0,0 +1,153 @@
+import copy
+import re
+
+import requests
+
+# Headers the fincaraiz.com.co search API expects on requests.
+HEADERS = {
+    "authority": "api.fincaraiz.com.co",
+    "origin": "https://fincaraiz.com.co",
+    "referer": "https://fincaraiz.com.co",
+    "accept": "*/*",
+}
+
+
+# Request body template for the listing search endpoint: only "sell" offers,
+# a fixed field list, and 100 results per page.
+JSON_DATA = {
+ "filter": {
+ "offer": {
+ "slug": [
+ "sell"
+ ]
+ },
+ },
+ "fields": {
+ "exclude": [],
+ "facets": [],
+ "include": [
+ "area",
+ "baths.id",
+ "baths.name",
+ "baths.slug",
+ "client.client_type",
+ "client.company_name",
+ "client.first_name",
+ "client.fr_client_id",
+ "client.last_name",
+ "client.logo.full_size",
+ "garages.name",
+ "is_new",
+ "locations.cities.fr_place_id",
+ "locations.cities.name",
+ "locations.cities.slug",
+ "locations.countries.fr_place_id",
+ "locations.countries.name",
+ "locations.countries.slug",
+ "locations.groups.name",
+ "locations.groups.slug",
+ "locations.groups.subgroups.name",
+ "locations.groups.subgroups.slug",
+ "locations.neighbourhoods.fr_place_id",
+ "locations.neighbourhoods.name",
+ "locations.neighbourhoods.slug",
+ "locations.states.fr_place_id",
+ "locations.states.name",
+ "locations.states.slug",
+ "locations.location_point",
+ "max_area",
+ "max_price",
+ "media.photos.list.image.full_size",
+ "media.photos.list.is_main",
+ "media.videos.list.is_main",
+ "media.videos.list.video",
+ "media.logo.full_size",
+ "min_area",
+ "min_price",
+ "offer.name",
+ "price",
+ "products.configuration.tag_id",
+ "products.configuration.tag_name",
+ "products.label",
+ "products.name",
+ "products.slug",
+ "property_id",
+ "property_type.name",
+ "fr_property_id",
+ "fr_parent_property_id",
+ "rooms.id",
+ "rooms.name",
+ "rooms.slug",
+ "stratum.name",
+ "title"
+ ],
+ "limit": 100,
+ "offset": 0,
+ "ordering": [],
+ "platform": 40,
+ "with_algorithm": True
+ }
+}
+
+
+def get_json(city: str, pages: int = 1) -> list:
+ """Get json data from fincaraiz.com.co
+
+ Parameters
+ ----------
+ city : str
+ City to search houses
+ pages : int
+ Pages to download, each page has 100 offers.
+
+ Returns
+ -------
+ list[dict]:
+ json file with house results
+
+ Examples
+ --------
+ >>> data = get_json('Bogota, d.c')
+ """
+
+    # Deep copy: a shallow .copy() would let the nested "filter"/"fields"
+    # dicts of the module-level template be mutated below.
+    json_data = copy.deepcopy(JSON_DATA)
+
+    def normalize(x: str) -> str:
+        # Lower-case and strip accents/diaereses so city names compare reliably.
+        normalized = re.sub("[áä]", "a", x.lower())
+        normalized = re.sub("[éë]", "e", normalized)
+        normalized = re.sub("[íï]", "i", normalized)
+        normalized = re.sub("[óö]", "o", normalized)
+        return re.sub("[úü]", "u", normalized)
+
+    ncity = normalize(city)
+
+    def town_exist(town: str) -> bool:
+        return normalize(town) == ncity
+
+    def city_code(x: dict) -> str:
+        # Build the Fincaraíz city slug from DANE codes, e.g. Medellín
+        # (departamento 05, municipio 001) -> "city-colombia-05-001".
+        departamento = int(x["c_digo_dane_del_departamento"])
+        municipio = int(x["c_digo_dane_del_municipio"].split(".")[1])
+        return "city-colombia-%02d-%03d" % (departamento, municipio)
+
+    # Resolve the city name to Fincaraíz city slugs using the datos.gov.co
+    # municipalities dataset (DANE department/municipality codes).
+    municipios = requests.get(
+        "https://www.datos.gov.co/resource/xdk5-pm3f.json", timeout=30
+    ).json()
+    location = [city_code(x) for x in municipios if town_exist(x["municipio"])]
+
+    if location:
+        json_data["filter"]["locations"] = {"cities": {"slug": location}}
+    else:
+        # City not found in the dataset: fall back to a free-text location filter.
+        json_data["filter"]["location_path"] = city
+
+    json_output = []
+    for i in range(pages):
+        json_data["fields"]["offset"] = i * json_data["fields"]["limit"]
+        response = requests.post(
+            "https://api.fincaraiz.com.co/document/api/1.0/listing/search",
+            headers=HEADERS,
+            json=json_data,
+            timeout=30,
+        )
+        hits = response.json()["hits"]["hits"]
+        if not hits:
+            # An empty page means there are no more results.
+            break
+        json_output += hits
+
+    # De-duplicate offers by their _id (pages can overlap) while keeping order.
+    seen_ids = set()
+    out = []
+    for offer in json_output:
+        if offer["_id"] not in seen_ids:
+            seen_ids.add(offer["_id"])
+            out.append(offer)
+
+    return out
diff --git a/scrap/scrap.py b/scrap/scrap.py
new file mode 100644
index 0000000..50ef3d4
--- /dev/null
+++ b/scrap/scrap.py
@@ -0,0 +1,36 @@
+"""Scrap
+Usage:
+ scrap [options] <city>
+
+Options:
+  -h --help     Show this message.
+  -v --verbose  Be more verbose.
+  -o FILE       Redirect standard output to FILE.
+  -p INT        Number of pages to get [default: 1]
+
+Examples:
+ $ scrap Medellín -p 20
+"""
+
+import json
+from sys import stdout
+
+from docopt import docopt
+
+from .fincaraiz import get_json
+
+
+def main():
+    # __doc__ is None when docstrings are stripped (python -OO); bail out early.
+    if not __doc__:
+        return 1
+
+ args = docopt(__doc__)
+ city: str = args["<city>"]
+ options = {"pages": int(args["-p"])}
+
+ json_data = get_json(city, **options)
+
+ if args["-o"]:
+ with open(args["-o"], "w") as json_file:
+ json.dump(json_data, json_file)
+ else:
+ json.dump(json_data, stdout)
+ return 0
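+
+
+# Not part of the console-script entry point above; just a small convenience
+# so the module can also be run directly with `python -m scrap.scrap`.
+if __name__ == "__main__":
+    raise SystemExit(main())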