3323
These are a few sample lines that can be stored in a single file and used as Hadoop input in standalone or pseudo-distributed mode. They were taken directly from the census data on the Amazon EBS volume.
"07","050","00","01","007","","","","","","","","","A","Bibb County, Alabama",20826,10745,10081,1449,1530,1454,1407,1422,3248,3177,2725,1093,908,1324,755,334,34.7,15540,8031,7509,14702,2924,2413,1000,1413,20719,15966,4624,49,17,8,1,5,0,1,1,1,2,1,1,0,0,61,107,16064,4651,115,25,4,76,20826,210,113,7,0,90,20616,15870,20826,19587,7421,4331,6120,4597,1174,586,541,269,1239,1024,215,7421,5581,2554,4331,1912,939,505,1840,1638,700,2859,1778,2.64,3.08,8345,7421,924,122,1.4,9.3,7421,5951,1470,2.70,2.39 "07","050","00","02","050","","","","","","","","","S","Bethel Census Area, Alaska",16006,8500,7506,1605,1906,1886,1468,1065,2156,2464,1699,599,327,517,250,64,25.3,9629,5104,4525,8909,1042,831,419,412,15389,2006,61,13114,168,7,12,21,2,116,1,9,9,6,0,2,1,31,617,2526,111,13680,217,23,77,16006,140,82,5,1,52,15866,1958,16006,15765,4226,2123,7299,5469,1407,752,710,363,241,194,47,4226,3175,2157,2123,1531,642,385,1051,839,117,2472,667,3.73,4.41,5188,4226,962,508,2.0,6.2,4226,2581,1645,4.16,3.06 "07","050","00","04","007","","","","","","","","","A","Gila County, Arizona",51335,25249,26086,3116,3578,3946,3413,2117,4726,6704,7051,3351,3174,5748,3426,985,42.3,38445,18651,19794,36795,12111,10159,4677,5482,50411,39951,197,6630,220,60,39,46,29,18,12,16,28,6,10,6,6,3385,924,40763,257,7060,304,54,3854,51335,8546,6791,68,12,1675,42789,35391,51335,50404,20140,11103,13712,10782,3214,1728,2235,1001,931,828,103,20140,14090,5306,11103,3594,2174,1266,6050,5203,2473,6174,7000,2.50,2.99,28189,20140,8049,5725,3.2,11.3,20140,15858,4282,2.47,2.61 "07","050","00","05","007","","","","","","","","","A","Benton County, 
Arkansas",153406,75686,77720,11616,11423,11301,10473,9169,21910,23177,18193,7417,6754,12199,7682,2092,35.3,112585,54805,57780,106655,25977,21973,9867,12106,150615,139399,629,2531,1673,414,148,155,59,124,503,270,130,27,29,13,61,6253,2791,142028,817,4196,2012,209,7002,153406,13469,9596,223,71,3579,139937,133094,153406,151275,58212,36675,44561,37555,6145,2411,5682,2258,2131,1152,979,58212,43474,20023,36675,15761,4778,3068,14738,12292,4945,21588,14796,2.60,3.01,64281,58212,6069,1731,2.6,8.0,58212,42005,16207,2.61,2.57 "07","050","00","06","007","","","","","","","","","A","Butte County, California",203171,99546,103625,11637,13409,14704,17101,19648,23087,27249,26809,9527,7944,15207,12630,4219,35.8,154404,74247,80157,141860,36728,32056,13597,18459,195248,171728,2816,3866,6752,511,637,500,611,181,192,4120,296,119,40,60,77,9790,7923,178739,3873,7271,8349,695,12756,203171,21339,17134,391,127,3687,181832,162564,203171,197327,79566,37130,53903,43521,8697,3583,18031,5016,5844,1630,4214,79566,49386,22571,37130,14929,8879,5619,30180,21636,8826,24810,22122,2.48,3.02,85523,79566,5957,1350,2.1,5.2,79566,48336,31230,2.48,2.48 "07","050","00","08","007","","","","","","","","","A","Archuleta County, Colorado",9898,5016,4882,531,662,799,735,402,940,1639,1806,697,509,805,300,73,40.8,7391,3687,3704,7109,1442,1178,612,566,9641,8743,35,139,31,7,5,3,10,3,1,2,3,2,0,1,0,690,257,8993,55,234,40,6,833,9898,1659,472,4,4,1179,8239,7927,9898,9814,3980,2381,2711,2326,327,147,415,198,84,61,23,3980,2872,1257,2381,919,325,233,1108,878,238,1346,826,2.47,2.89,6212,3980,2232,1456,4.0,11.0,3980,3057,923,2.48,2.41 "07","060","00","09","001","08980","","","","","","","","A","Brookfield town, Fairfield County, 
Connecticut",15664,7617,8047,1023,1335,1271,905,516,1570,2998,2664,1014,684,973,536,175,39.2,11376,5452,5924,11024,2066,1684,710,974,15540,14926,119,11,388,136,114,52,21,28,11,26,0,0,0,0,0,96,124,15039,146,35,443,4,127,15664,372,61,85,16,210,15292,14666,15664,15586,5572,3797,5208,4100,521,157,488,199,78,0,78,5572,4367,2176,3797,1905,433,216,1205,961,361,2284,1177,2.80,3.18,5781,5572,209,122,0.5,1.4,5572,4960,612,2.88,2.16 "08","160","00","10","","","01400","","","","","","","A","Arden village, Delaware",474,232,242,15,27,27,22,10,43,71,112,31,27,41,34,14,46.5,388,183,205,381,102,89,43,46,465,451,4,2,8,5,2,0,0,1,0,0,0,0,0,0,0,0,9,460,4,3,15,1,0,474,11,3,0,1,7,463,440,474,474,229,89,113,83,20,3,23,13,0,0,0,229,123,49,89,37,26,8,106,88,23,52,65,2.07,2.80,243,229,14,0,1.7,3.4,229,173,56,2.23,1.59
import java.io.IOException;
import java.util.*;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.conf.*;
import org.apache.hadoop.io.*;
import org.apache.hadoop.mapred.*;
import org.apache.hadoop.util.*;
public class Gender {
private static String genderCheck = "female";
public static class Map extends MapReduceBase implements Mapper {
private final static IntWritable one = new IntWritable(1);
private Text locText = new Text();
public void map(LongWritable key, Text value, OutputCollector output, Reporter reporter) throws IOException {
String line = value.toString();
String location = line.split(",")[14] + "," + line.split(",")[15];
long male = 0L;
long female = 0L;
if (line.split(",")[17].matches("\d+") && line.split(",")[18].matches("\d+")) {
male = Long.parseLong(line.split(",")[17]);
female = Long.parseLong(line.split(",")[18]);
}
long diff = male - female;
locText.set(location);
if (Gender.genderCheck.toLowerCase().equals("female") && diff < 0) {
output.collect(locText, new LongWritable(diff * -1L));
}
else if (Gender.genderCheck.toLowerCase().equals("male") && diff > 0) {
output.collect(locText, new LongWritable(diff));
}
}
}
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf(Gender.class);
conf.setJobName("gender");
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
conf.setMapperClass(Map.class);
if (args.length != 3) {
System.out.println("Usage:");
System.out.println("[male/female] /path/to/2kh/files /path/to/output");
System.exit(1);
}
if (!args[0].equalsIgnoreCase("male") && !args[0].equalsIgnoreCase("female")) {
System.out.println("first argument must be male or female");
System.exit(1);
}
Gender.genderCheck = args[0];
conf.setInputFormat(TextInputFormat.class);
conf.setOutputFormat(TextOutputFormat.class);
FileInputFormat.setInputPaths(conf, new Path(args[1]));
FileOutputFormat.setOutputPath(conf, new Path(args[2]));
JobClient.runJob(conf);
}
}