Scala 正则表达式解析器无法解析结果
Scala regex parser could not parse the result out
我是 scala 的新手,想使用 class RegexParsers 来提取一些重要的特征
abstract class LogLine extends java.io.Serializable {
def app: String
}
case class AppSummary(timestamp: String, app: String, name: String, user: String, state:String, url:String, host: String, startTime: String, endTime: String, finalStatus: String) extends LogLine
case class OperSum(title: String, user: String, operation:String,target:String,result:String, app: String, container: String) extends LogLine
case object UnknownLine extends LogLine {
val app = "unknown"
}
object LogP extends RegexParsers with java.io.Serializable {
def logline: Parser[LogLine] = (
timestamp~"INFO org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary: appId="~ident
~",name="~identW
~",user="~ident
~",queue=default,state="~ident
~",trackingUrl="~url
~",appMasterHost="~ident
~".icdatacluster2,startTime="~ident
~",finishTime="~ident
~",finalStatus="~ident ^^ {
case t~_~app~_~name~_~user~_~state~_~url~_~host~_~stime~_~etime~_~finalStatus =>
AppSummary(t, app, name, user, state, url, host, stime, etime, finalStatus)
}
| timestamp~"INFO org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER="~identY
~"OPERATION="~identY
~"TARGET="~identY
~"RESULT="~identY
~"APPID="~identY
~"CONTAINERID="~ident ^^ {
case t~_~user~_~operation~_~target~_~result~_~app~_~container =>
OperSum(t, user, operation, target, result, app, container)
}
)
val ident: Parser[String] = "[A-Za-z0-9_]+".r
val identY: Parser[String] ="[A-Za-z0-9_]+\s".r
val identW: Parser[String] = "[A-Za-z0-9_ ]+".r
val timestamp: Parser[String] = "2015-[0-9][0-9]-[0-9][0-9] [0-9:,]+".r
val url: Parser[String] = "http://[a-zA-Z0-1.]+:[0-9]+/[a-zA-Z0-9_/]+".r
}
它可以在第一种情况下工作,但不能在第二种情况下工作,例如:以下单词:
2015-03-09 01:36:39,016 信息 org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER=pwalch OPERATION=AM 已发布容器 TARGET=SchedulerApp RESULT=SUCCESS APPID=application_1425682538854_0741 CONTAINERID=container_1425682538854_0741_01_000004
无法提取。希望某人。可以帮助
一些可能的原因:
- 您似乎没有处理
timestamp
和 INFO
开头的字符串之间的空格?
- 字符串 AM Released Container 包含空格,您的
identY
正则表达式 "[A-Za-z0-9_]+\s".r
最多只能匹配 AM - 因此后续匹配器将全部失败
我是 scala 的新手,想使用 class RegexParsers 来提取一些重要的特征
abstract class LogLine extends java.io.Serializable {
def app: String
}
case class AppSummary(timestamp: String, app: String, name: String, user: String, state:String, url:String, host: String, startTime: String, endTime: String, finalStatus: String) extends LogLine
case class OperSum(title: String, user: String, operation:String,target:String,result:String, app: String, container: String) extends LogLine
case object UnknownLine extends LogLine {
val app = "unknown"
}
object LogP extends RegexParsers with java.io.Serializable {
def logline: Parser[LogLine] = (
timestamp~"INFO org.apache.hadoop.yarn.server.resourcemanager.RMAppManager$ApplicationSummary: appId="~ident
~",name="~identW
~",user="~ident
~",queue=default,state="~ident
~",trackingUrl="~url
~",appMasterHost="~ident
~".icdatacluster2,startTime="~ident
~",finishTime="~ident
~",finalStatus="~ident ^^ {
case t~_~app~_~name~_~user~_~state~_~url~_~host~_~stime~_~etime~_~finalStatus =>
AppSummary(t, app, name, user, state, url, host, stime, etime, finalStatus)
}
| timestamp~"INFO org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER="~identY
~"OPERATION="~identY
~"TARGET="~identY
~"RESULT="~identY
~"APPID="~identY
~"CONTAINERID="~ident ^^ {
case t~_~user~_~operation~_~target~_~result~_~app~_~container =>
OperSum(t, user, operation, target, result, app, container)
}
)
val ident: Parser[String] = "[A-Za-z0-9_]+".r
val identY: Parser[String] ="[A-Za-z0-9_]+\s".r
val identW: Parser[String] = "[A-Za-z0-9_ ]+".r
val timestamp: Parser[String] = "2015-[0-9][0-9]-[0-9][0-9] [0-9:,]+".r
val url: Parser[String] = "http://[a-zA-Z0-1.]+:[0-9]+/[a-zA-Z0-9_/]+".r
}
它可以在第一种情况下工作,但不能在第二种情况下工作,例如:以下单词: 2015-03-09 01:36:39,016 信息 org.apache.hadoop.yarn.server.resourcemanager.RMAuditLogger: USER=pwalch OPERATION=AM 已发布容器 TARGET=SchedulerApp RESULT=SUCCESS APPID=application_1425682538854_0741 CONTAINERID=container_1425682538854_0741_01_000004 无法提取。希望某人。可以帮助
一些可能的原因:
- 您似乎没有处理
timestamp
和INFO
开头的字符串之间的空格? - 字符串 AM Released Container 包含空格,您的
identY
正则表达式"[A-Za-z0-9_]+\s".r
最多只能匹配 AM - 因此后续匹配器将全部失败