I recently started learning Hadoop and wrote a wordcount program by following a reference book. This post focuses on a few problems that came up while running it. First, let's look at the project.
Project structure
WordMapper class

package wordcount;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordMapper extends Mapper<Object, Text, Text, IntWritable> {

	private final static IntWritable one = new IntWritable(1);
	private Text word = new Text();

	// Split each input line into tokens and emit (token, 1) for every token.
	public void map(Object key, Text value, Context context)
			throws IOException, InterruptedException {
		StringTokenizer itr = new StringTokenizer(value.toString());
		while (itr.hasMoreTokens()) {
			word.set(itr.nextToken());
			context.write(word, one);
		}
	}
}
 
WordReducer class

package wordcount;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

	private IntWritable result = new IntWritable();

	// Sum the counts collected for each word and emit (word, total).
	public void reduce(Text key, Iterable<IntWritable> values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable val : values) {
			sum += val.get();
		}
		result.set(sum);
		context.write(key, result);
	}
}
 
WordMain class

package wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordMain {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		String[] otherArgs = new GenericOptionsParser(conf, args)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.out.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordMain.class);
		job.setMapperClass(WordMapper.class);
		job.setCombinerClass(WordReducer.class);
		job.setReducerClass(WordReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
 
Input files containing the words to count

file1.txt
Hello , i love coding are you ok ?
Hello , i love hadoop are you ok ?
file2.txt
Hello i love coding are you ok ?
Hello i love hadoop are you ok ?
Package the wordcount project into a jar (wordcount.jar)
 
 
Copy the relevant files to the virtual machine: create a directory named file under /opt on Linux, copy file1.txt and file2.txt into it, and copy wordcount.jar into /opt as well.
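A sketch of these steps from the command line (the source location of the files is an assumption; use wherever file1.txt, file2.txt, and wordcount.jar actually sit):

mkdir /opt/file
cp ~/file1.txt ~/file2.txt /opt/file/    # source paths are assumed
cp ~/wordcount.jar /opt/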
 
Run the program: go into Hadoop's bin directory and enter the following command.
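A typical invocation, assuming the jar was copied to /opt and the main class is wordcount.WordMain from the project above (the input and output arguments are HDFS paths and are an assumption here):

./hadoop jar /opt/wordcount.jar wordcount.WordMain input output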
 
Running it produces an "Input path does not exist" error.
 
This is caused by the path not being set.
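Another thing worth checking when this error shows up is whether the input directory actually exists in HDFS. If it does not, it can be created and filled roughly like this (the directory name input matches the arguments hard-coded in the improved code below; the local paths follow the earlier copy step):

./hadoop fs -mkdir input
./hadoop fs -put /opt/file/file1.txt /opt/file/file2.txt input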
Going back to the WordMain code.
 
The improved WordMain code:
package wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WordMain {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		// Point the job at the JobTracker address.
		conf.set("mapred.job.tracker", "127.0.0.1:9001");
		// Hard-code the input and output paths instead of relying on command-line arguments.
		String[] ars = new String[] { "input", "output" };
		String[] otherArgs = new GenericOptionsParser(conf, ars)
				.getRemainingArgs();
		if (otherArgs.length != 2) {
			System.err.println("Usage: wordcount <in> <out>");
			System.exit(2);
		}
		Job job = new Job(conf, "word count");
		job.setJarByClass(WordMain.class);
		job.setMapperClass(WordMapper.class);
		job.setCombinerClass(WordReducer.class);
		job.setReducerClass(WordReducer.class);
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(IntWritable.class);
		FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
		FileOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}
 
Run it again and there are no errors.
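To check the result, the output directory can be read back from HDFS; the part-r-00000 file name assumes the default naming for a single reducer's output:

./hadoop fs -ls output
./hadoop fs -cat output/part-r-00000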
 
Commonly used Hadoop configuration files

core-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
	<property>
		<name>hadoop.tmp.dir</name>
		<value>/hadoop</value>
	</property>
	<property>
		<name>fs.default.name</name>
		<value>hdfs://master:9000</value>
	</property>
	<property>
		<name>dfs.name.dir</name>
		<value>/hadoop/name</value>
	</property>
</configuration>
 
mapred-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
	<property>
		<name>mapred.job.tracker</name>
		<value>master:9001</value>
	</property>
	<property>
		<name>mapred.system.dir</name>
		<value>/hadoop/mapred_system</value>
	</property>
	<property>
		<name>mapred.local.dir</name>
		<value>/hadoop/mapred_local</value>
	</property>
</configuration>
 
hdfs-site.xml
<?xml version="1.0"?>
<?xml-stylesheet type="text/xsl" href="configuration.xsl"?>
<configuration>
	<property>
		<name>dfs.replication</name>
		<value>3</value>
	</property>
	<property>
		<name>dfs.data.dir</name>
		<value>/hadoop/data</value>
	</property>
</configuration>
 
hadoop-env.sh: this file mainly sets the Java path; mine is /usr/java/jdk1.7.0_75.
# Set Hadoop-specific environment variables here.

# The only required environment variable is JAVA_HOME.  All others are
# optional.  When running a distributed configuration it is best to
# set JAVA_HOME in this file, so that it is correctly defined on
# remote nodes.

# The java implementation to use.  Required.
export JAVA_HOME=/usr/java/jdk1.7.0_75

# Extra Java CLASSPATH elements.  Optional.
# export HADOOP_CLASSPATH=

# The maximum amount of heap to use, in MB. Default is 1000.
# export HADOOP_HEAPSIZE=2000

# Extra Java runtime options.  Empty by default.
# export HADOOP_OPTS=-server

# Command specific options appended to HADOOP_OPTS when specified
export HADOOP_NAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_NAMENODE_OPTS"
export HADOOP_SECONDARYNAMENODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_SECONDARYNAMENODE_OPTS"
export HADOOP_DATANODE_OPTS="-Dcom.sun.management.jmxremote $HADOOP_DATANODE_OPTS"
export HADOOP_BALANCER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_BALANCER_OPTS"
export HADOOP_JOBTRACKER_OPTS="-Dcom.sun.management.jmxremote $HADOOP_JOBTRACKER_OPTS"
# export HADOOP_TASKTRACKER_OPTS=

# The following applies to multiple commands (fs, dfs, fsck, distcp etc)
# export HADOOP_CLIENT_OPTS

# Extra ssh options.  Empty by default.
# export HADOOP_SSH_OPTS="-o ConnectTimeout=1 -o SendEnv=HADOOP_CONF_DIR"

# Where log files are stored.  $HADOOP_HOME/logs by default.
# export HADOOP_LOG_DIR=${HADOOP_HOME}/logs

# File naming remote slave hosts.  $HADOOP_HOME/conf/slaves by default.
# export HADOOP_SLAVES=${HADOOP_HOME}/conf/slaves

# host:path where hadoop code should be rsync'd from.  Unset by default.
# export HADOOP_MASTER=master:/home/$USER/src/hadoop

# Seconds to sleep between slave commands.  Unset by default.  This
# can be useful in large clusters, where, e.g., slave rsyncs can
# otherwise arrive faster than the master can service them.
# export HADOOP_SLAVE_SLEEP=0.1

# The directory where pid files are stored. /tmp by default.
# export HADOOP_PID_DIR=/var/hadoop/pids

# A string representing this instance of hadoop. $USER by default.
# export HADOOP_IDENT_STRING=$USER

# The scheduling priority for daemon processes.  See 'man nice'.
# export HADOOP_NICENESS=10
 
/etc/hosts configuration: run ifconfig, check the virtual machine's current IP, and note the inet addr.
 
Configure hosts so that master maps to the virtual machine's inet addr; a sketch of the entry follows.
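A minimal /etc/hosts entry of this shape (the IP address is only an example; substitute the inet addr reported by ifconfig):

192.168.1.128    master    # example IP, replace with the VM's inet addr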
Eclipse connects to HDFS successfully.
 
Common errors when running Hadoop from Eclipse and how to resolve them
Setting up a Hadoop development environment under Eclipse
Could not obtain block 
Too many fetch-failures 
unknown host: hadoop 
NameNode is in safe mode 
org.apache.hadoop.security.AccessControlException: Permission denied: user=d, access=WRITE, inode="data":zxg:supergroup:rwxr-xr-x