Created
January 11, 2019 15:57
-
-
Save rasmi/f1651b34c7cc620d4718044be8029a25 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Copyright 2018 Google LLC. All Rights Reserved. | |
# | |
# Licensed under the Apache License, Version 2.0 (the "License"); | |
# you may not use this file except in compliance with the License. | |
# You may obtain a copy of the License at | |
# | |
# http://www.apache.org/licenses/LICENSE-2.0 | |
# | |
# Unless required by applicable law or agreed to in writing, software | |
# distributed under the License is distributed on an "AS IS" BASIS, | |
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
# See the License for the specific language governing permissions and | |
# limitations under the License. | |
# ============================================================================== | |
"""Create BigQuery schema for GAIA data types.""" | |
def infer_types_from_webpage(): | |
"""Infer GAIA schema (fields, datatypes, and descriptions) from DR2 webpage. | |
Returns: | |
A dict mapping from field name to a tuple of (BQ dtype, description). | |
""" | |
import requests | |
from lxml import html | |
PAGE_URL = ( | |
'https://gea.esac.esa.int/archive/documentation/GDR2/Gaia_archive/' | |
'chap_datamodel/sec_dm_main_tables/ssec_dm_gaia_source.html') | |
page = requests.get(PAGE_URL) | |
root = html.document_fromstring(page.text) | |
content = root.find_class('ltx_page_content')[0] | |
paragraphs = content.cssselect('p.ltx_p') | |
text_elements = [element.text_content() for element in paragraphs] | |
field_descriptions_list = [text for text in text_elements if ' : ' in text] | |
field_descriptions = {field: description | |
for field, description in ( | |
field_description.split(' : ') | |
for field_description in field_descriptions_list)} | |
BQ_TYPE_PATTERNS = { | |
'(float': 'FLOAT64', | |
'(double': 'FLOAT64', | |
'(short': 'INT64', | |
'(int': 'INT64', | |
'(long': 'INT64', | |
'(byte': 'INT64', | |
'(string': 'STRING', | |
'(boolean': 'BOOL' | |
} | |
def find_bq_type(text): | |
"""Check if the text contains a data type that maps to a BQ data type. | |
Args: | |
text: Input text to search for data types. | |
Returns: | |
A string of the BQ data type, if it exists in the input text. | |
""" | |
for pattern, bq_type in BQ_TYPE_PATTERNS.items(): | |
if pattern in text: | |
return bq_type | |
type_and_description = { | |
field: (find_bq_type(description), description) | |
for field, description in field_descriptions.items()} | |
return type_and_description | |
def infer_types_from_csv(filename): | |
"""Infer GAIA schema (fields and datatypes) from a sample CSV file. | |
Args: | |
filename: Filename of a sample CSV file of GAIA data to infer types from. | |
Returns: | |
A dict mapping from field name to a tuple of (BQ dtype, description). | |
In this case, description is the empty string ''. | |
""" | |
import numpy as np | |
import pandas as pd | |
data = pd.read_csv(filename) | |
NP_BQ_TYPES = { | |
np.dtype('float64'): 'FLOAT64', | |
np.dtype('int64'): 'INT64', | |
np.dtype('object'): 'STRING', | |
np.dtype('bool'): 'BOOL' | |
} | |
type_and_description = { | |
field: (NP_BQ_TYPES[dtype], '') for field, dtype in data.dtypes.items()} | |
return type_and_description | |
def to_bq_schema(type_and_description): | |
"""Convert a dict of field names, dtypes, and descriptions to BQ schema json. | |
Args: | |
type_and_description: A dict mapping from field name | |
to a tuple of (BQ dtype, description). | |
Returns: | |
A list of dicts of the form {'name', 'type', 'mode', 'description'}. | |
""" | |
MODE = 'NULLABLE' | |
bq_schema = [ | |
{'name': field, 'type': dtype, 'mode': MODE, 'description': description} | |
for field, (dtype, description) in type_and_description.items()] | |
return bq_schema | |
if __name__ == '__main__': | |
import json | |
types = infer_types_from_webpage() | |
bq_schema = to_bq_schema(types) | |
with open('schema.json', 'w') as output: | |
output.write(json.dumps(bq_schema)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment