I hereby claim:
- I am andrewgross on github.
- I am andrewwgross (https://keybase.io/andrewwgross) on keybase.
- I have a public key ASDsj8ie3y_QBUpm4aBzm-ty7Hr9w_Y5PtWIcLZfQlt9JQo
To claim this, I am signing this object:
// Thinking token levels
const THINK_LEVEL = {
  HIGHEST: 31999,
  MIDDLE: 10000,
  BASIC: 4000,
  NONE: 0,
};

// Language cues for thinking intensity
const THINK_CUES = {

import base64
import json
import requests
from pathlib import Path
from typing import Union, Optional

class RemarkableError(Exception):
    pass

def upload_to_remarkable(

import os
from urllib.parse import urlparse
from pyspark.sql.functions import desc, asc
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    LongType,

import re
import pyspark.sql.types as T
from math import ceil

def repartition_for_writing(df):
    count = df.count()
    sampled_df = get_sampled_df(df, count=count)
    string_column_sizes = get_string_column_sizes(sampled_df)
    num_files = get_num_files(count, df.schema, string_column_sizes)

import datetime
import json

BUCKET_NAME = "<s3_bucket_name>"
INVENTORY_PREFIX = "<prefix_given_to_s3_inventory>"  # Should contain data/, hive/, and some dated folders
ACCOUNT_CUID = "<your_canonical_user_id_for_cross_account>"  # The account that is not the bucket owner but needs access to it; controls ROLE_ARN
ROLE_ARN = "<role_in_cross_account_that_can_assume_to_main_account>"

def role_arn_to_session(role_arn):
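The preview cuts off at the helper above, which presumably assumes the cross-account role and returns a boto3 session. A minimal sketch of that pattern, assuming boto3 is available (the session name below is arbitrary, not taken from the original gist):

import boto3

def role_arn_to_session(role_arn):
    # Ask STS for temporary credentials for the given role, then build a
    # boto3 session from them so later S3 calls run under that role.
    sts = boto3.client("sts")
    creds = sts.assume_role(
        RoleArn=role_arn,
        RoleSessionName="s3-inventory-reader",  # arbitrary session name
    )["Credentials"]
    return boto3.session.Session(
        aws_access_key_id=creds["AccessKeyId"],
        aws_secret_access_key=creds["SecretAccessKey"],
        aws_session_token=creds["SessionToken"],
    )
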
def get_files_per_partition(df, partition_key, file_type="parquet", compression="snappy", byte_array_size=256):
    rows = df.count()
    print("Dataset has {} rows".format(rows))
    schema = df.schema
    num_partitions = 1
    if partition_key is not None:
        num_partitions = df.select([partition_key]).distinct().count()
        print("Dataset has {} distinct partition keys".format(num_partitions))
        _df = df.drop(partition_key)
        schema = _df.schema
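The rest of the function is not shown, but the shape of the calculation is clear from the lines above: estimate a row's serialized size from the schema, then divide each partition's data volume by a target file size. A hedged sketch under those assumptions (the 128 MB target and the per-type byte sizes are guesses, not values from the original gist):

import math
import pyspark.sql.types as T

TARGET_FILE_SIZE = 128 * 1024 * 1024  # assumed target size per output file

def estimate_row_size(schema, byte_array_size=256):
    # Very rough per-row estimate: a fixed budget for strings, 8 bytes otherwise.
    size = 0
    for field in schema.fields:
        if isinstance(field.dataType, T.StringType):
            size += byte_array_size
        else:
            size += 8
    return size

def estimate_files_per_partition(rows, num_partitions, schema):
    rows_per_partition = rows / float(num_partitions)
    bytes_per_partition = rows_per_partition * estimate_row_size(schema)
    return max(int(math.ceil(bytes_per_partition / float(TARGET_FILE_SIZE))), 1)
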
Hey,

This is going to be a bit of an esoteric ticket. I noticed some strange behavior recently when comparing Spectrum and Redshift results on the same dataset.

Redshift Data: fare.txt
Parquet Data: fare.parquet

The Parquet data was generated from fare.txt with PySpark, using convert.py on Spark 2.2.0.
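For context, a minimal sketch of what a conversion script like convert.py typically does on Spark 2.2.0: read the delimited text file and rewrite it as Parquet. The input options and paths below are assumptions, not taken from the original script.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("convert").getOrCreate()

# Assumed input format; the real fare.txt layout may differ.
df = spark.read.csv("fare.txt", header=True, inferSchema=True)
df.write.mode("overwrite").parquet("fare.parquet")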
Redshift Table Schema:
import cmd
from urlparse import urlparse
import sys
import re
import subprocess

def write_to_clipboard(output):
    process = subprocess.Popen(
        'pbcopy', env={'LANG': 'en_US.UTF-8'}, stdin=subprocess.PIPE)
    process.communicate(output.encode('utf-8'))
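Example usage, assuming macOS (pbcopy is a macOS utility):

if __name__ == '__main__':
    write_to_clipboard(u'copied from Python')
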
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import base64
import json
import logging
from urlparse import parse_qs

import requests