配置
core-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://192.168.0.1:9000</value>
<description>The name of the default file system. Either the
literal string "local" or a host:port for NDFS.
</description>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>/home/dbkehadoop/hadoop_tmp_dir</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
<description>
If "true", enable permission checking in HDFS.
If "false", permission checking is turned off,
but all other behavior is unchanged.
Switching from one parameter value to the other does not change the mode,
owner or group of files or directories.
</description>
</property>
</configuration>
mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapred.job.tracker</name>
<value>192.168.0.1:9001</value>
</property>
<property>
<name>mapred.input.dir.recursive</name>
<value>true</value>
</property>
<property>
<name>mapred.hosts.exclude</name>
<value>/home/dbkehadoop/hadoop-1.2.1/conf/excludes</value>
<description>Names a file that contains the list of hosts that
should be excluded by the jobtracker. If the value is empty, no
hosts are excluded.</description>
</property>
<property>
<name>mapred.submit.replication</name>
<value>2</value>
<description>The replication level for submitted job files. This
should be around the square root of the number of nodes.
</description>
</property>
<property>
<name>mapred.child.java.opts</name>
<value>-Xmx2048m</value>
<description>Java opts for the task tracker child processes.
The following symbol, if present, will be interpolated: @taskid@ is replaced
by current TaskID. Any other occurrences of '@' will go unchanged.
For example, to enable verbose gc logging to a file named for the taskid in
/tmp and to set the heap maximum to be a gigabyte, pass a 'value' of:
-Xmx1024m -verbose:gc -Xloggc:/tmp/@taskid@.gc
The configuration variable mapred.child.ulimit can be used to control the
maximum virtual memory of the child processes. </description>
</property>
<property>
<name>mapred.hosts</name>
<value>/home/dbkehadoop/hadoop-1.2.1/conf/includes</value>
<description>Names a file that contains the list of nodes that may
connect to the jobtracker. If the value is empty, all hosts are
permitted.</description>
</property>
</configuration>
hdfs-site.xml
<configuration>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
</configuration>
压缩
压缩map输出
压缩类型为可选项
Configuration conf = new Configuration();
conf.setBoolean("mapred.compress.map.output", true);
conf.setClass("mapred.map.output.compression.codec", GzipCodec.class,
CompressionCodec.class);
Job job = new Job(conf, "EmitLinkRelations");
压缩reduce输出
TextOutputFormat.setCompressOutput(job, true);
TextOutputFormat.setOutputCompressorClass(job, GzipCodec.class);