|
本帖最后由 〇〇 于 2021-11-12 12:38 编辑
处理真实数据
从https://www12.statcan.gc.ca/cens ... ETYPE=CSV&GEONO=055 下载加拿大人口普查发布结果
然后
解压缩,从meta文件找到列名,创建表,导入csv文件,再复制到parquet
再建立视图
D:\duckdb>duckdb
v0.3.0 46a0fc50a
Enter ".help" for usage hints.
Connected to a transient in-memory database.
Use ".open FILENAME" to reopen on a persistent database.
D create table X2016055
> (
> "CENSUS_YEAR" text,
> "GEO_CODE (POR)"text,
> "GEO_LEVEL"text,
> "GEO_NAME"text,
> "GNR"text,
> "GNR_LF"text,
> "DATA_QUALITY_FLAG"text,
> "CSD_TYPE_NAME"text,
> "ALT_GEO_CODE"text,
> "DIM: Profile of Census Divisions/Census Subdivisions (2247)"text,
> "Member ID: Profile of Census Divisions/Census Subdivisions (2247)"text,
> "Notes: Profile of Census Divisions/Census Subdivisions (2247)"text,
> "Dim: Sex (3): Member ID: [1]: Total - Sex"text,
> "Dim: Sex (3): Member ID: [2]: Male"text,
> "Dim: Sex (3): Member ID: [3]: Female"text
> );
D .timer on
D insert into X2016055 select * from 'd:/cac/98-401-X2016055_English_CSV_data.csv';
Run Time: real 30.980 user 29.156587 sys 1.825212
D select count(*) from X2016055;
| count_star() |
|--------------|
| 12288843 |
Run Time: real 0.020 user 0.000000 sys 0.000000
D
D .mode markdown
D COPY (SELECT * FROM X2016055) TO 'X2016055-zstd.parquet' (FORMAT 'parquet' , CODEC 'ZSTD');
Run Time: real 11.422 user 9.937264 sys 1.326008
D select count(*) from 'X2016055-zstd.parquet';
| count_star() |
|--------------|
| 12288843 |
Run Time: real 0.040 user 0.031200 sys 0.000000
D
D describe X2016055;
| Field | Type | Null | Key | Default | Extra |
|-------------------------------------------------------------------|---------|------|-----|---------|-------|
| CENSUS_YEAR | VARCHAR | YES | | | |
| GEO_CODE (POR) | VARCHAR | YES | | | |
| GEO_LEVEL | VARCHAR | YES | | | |
| GEO_NAME | VARCHAR | YES | | | |
| GNR | VARCHAR | YES | | | |
| GNR_LF | VARCHAR | YES | | | |
| DATA_QUALITY_FLAG | VARCHAR | YES | | | |
| CSD_TYPE_NAME | VARCHAR | YES | | | |
| ALT_GEO_CODE | VARCHAR | YES | | | |
| DIM: Profile of Census Divisions/Census Subdivisions (2247) | VARCHAR | YES | | | |
| Member ID: Profile of Census Divisions/Census Subdivisions (2247) | VARCHAR | YES | | | |
| Notes: Profile of Census Divisions/Census Subdivisions (2247) | VARCHAR | YES | | | |
| Dim: Sex (3): Member ID: [1]: Total - Sex | VARCHAR | YES | | | |
| Dim: Sex (3): Member ID: [2]: Male | VARCHAR | YES | | | |
| Dim: Sex (3): Member ID: [3]: Female | VARCHAR | YES | | | |
Run Time: real 0.140 user 0.000000 sys 0.000000
D select count(distinct GEO_LEVEL),count(distinct "GEO_CODE (POR)") from X2016055;
| count(geo_level) | count(GEO_CODE (POR)) |
|------------------|-----------------------|
| 4 | 5469 |
Run Time: real 1.410 user 1.404009 sys 0.000000
D select count(distinct GEO_NAME),count(distinct "CSD_TYPE_NAME") from X2016055;
| count(geo_name) | count(CSD_TYPE_NAME) |
|-----------------|----------------------|
| 5114 | 54 |
Run Time: real 1.450 user 1.450809 sys 0.000000
D select count(distinct "Member ID: Profile of Census Divisions/Census Subdivisions (2247)") from X2016055;
| count(Member ID: Profile of Census Divisions/Census Subdivisions (2247)) |
|--------------------------------------------------------------------------|
| 2247 |
Run Time: real 0.912 user 0.920406 sys 0.000000
D CREATE view strings as SELECT
> "CENSUS_YEAR" "year" ,
> "GEO_CODE (POR)" geo_code_por ,
> "GEO_LEVEL" geo_level ,
> "GEO_NAME" geo_name ,
> "GNR" gnr ,
> "GNR_LF" gnr_lf ,
> "DATA_QUALITY_FLAG" data_quality_flag ,
> "CSD_TYPE_NAME" csd_type_name ,
> "ALT_GEO_CODE" alt_geo_code ,
> "DIM: Profile of Census Divisions/Census Subdivisions (2247)" profile ,
> "Member ID: Profile of Census Divisions/Census Subdivisions (2247)" profile_id ,
> "Notes: Profile of Census Divisions/Census Subdivisions (2247)" notes ,
> "Dim: Sex (3): Member ID: [1]: Total - Sex" total ,
> "Dim: Sex (3): Member ID: [2]: Male" male ,
> "Dim: Sex (3): Member ID: [3]: Female" female
> from X2016055;
Run Time: real 0.000 user 0.000000 sys 0.000000
D WITH inputs AS (
> SELECT
> geo_name,
> CASE WHEN profile_id = '1930' THEN 'total' ELSE 'cyclist' END AS mode,
> female,
> male
> FROM strings
> WHERE
> profile_id IN ('1930', '1935') AND
> csd_type_name = 'CY' AND
> geo_name IN ('Victoria', 'Dawson Creek', 'Kitchener')
> )
> SELECT
> total.geo_name,
> cyclist.male,
> cyclist.female,
> (100.0 * cyclist.male::int / total.male::int)::numeric(4,2) AS pct_male,
> (100.0 * cyclist.female::int / total.female::int)::numeric(4,2) AS pct_female
> FROM inputs AS total
> JOIN inputs AS cyclist USING (geo_name)
> WHERE total.mode = 'total' AND cyclist.mode = 'cyclist';
| geo_name | male | female | pct_male | pct_female |
|--------------|------|--------|----------|------------|
| Kitchener | 905 | 280 | 1.51 | 0.51 |
| Victoria | 2650 | 2130 | 12.57 | 9.73 |
| Dawson Creek | 25 | 0 | 0.86 | 0.00 |
Run Time: real 0.420 user 0.374402 sys 0.000000
D WITH total AS (
> SELECT
> geo_name,
> female::int female,
> male::int male
> FROM strings
> WHERE
> geo_name in ('Dawson Creek', 'Victoria', 'Kitchener') AND
> csd_type_name = 'CY' AND
> profile_id = '1930'
> ), cyclist as (
> SELECT
> geo_name,
> female::int female,
> male::int male
> FROM strings
> WHERE
> geo_name in ('Dawson Creek', 'Victoria', 'Kitchener') AND
> csd_type_name = 'CY' AND
> profile_id = '1935'
> )
> SELECT
> total.geo_name,
> cyclist.male,
> cyclist.female,
> 100.0 * cyclist.male / total.male,
> 100.0 * cyclist.female / total.female
> FROM total
> JOIN cyclist USING (geo_name);
| geo_name | male | female | 100.0 * cyclist.male / total.male | 100.0 * cyclist.female / total.female |
|--------------|------|--------|-----------------------------------|---------------------------------------|
| Kitchener | 905 | 280 | 1.512998411769623 | 0.5145639988973628 |
| Victoria | 2650 | 2130 | 12.574139976275207 | 9.730470534490635 |
| Dawson Creek | 25 | 0 | 0.8635578583765112 | 0.0 |
Run Time: real 0.735 user 0.639604 sys 0.000000 |
|