各位用户为了找寻关于python自定义解析简单xml格式文件的方法的资料费尽了周折。这里教程网为您整理了关于python自定义解析简单xml格式文件的方法的相关资料,仅供查阅。以下为您介绍关于python自定义解析简单xml格式文件的方法的详细内容。
本文实例讲述了python自定义解析简单xml格式文件的方法。分享给大家供大家参考。具体分析如下:
因为公司内部的接口返回的字串支持2种形式:php数组和xml。php数组python不能直接用,而xml字符串的格式不是标准的,所以也不能用标准模块解析【不标准的地方是某些节点的名称是以数字开头的】。所以写个简单的脚本来解析一下文件,用来做接口测试。
#!/usr/bin/env python
#encoding: utf-8
import
re
class xmlparse(object):
    """Regex-based extractor for simple, attribute-free XML-like strings.

    Intended for responses that a standard XML parser rejects (for example
    node names that start with a digit).  Nodes are located with a
    non-greedy ``<name>(.*?)</name>`` pattern, so a node nested inside
    another node of the same name is not matched correctly.

    Attributes:
        xmlstr: the original string handed to the constructor.
        xmldom: ``xmlstr`` without its ``<?xml ...?>`` declaration; byte
            strings declared as gbk/gb2312 are transcoded to UTF-8.
        xmlnodelist: all matches found for the last path step of the most
            recent successful ``parse`` call.
        xpath: the path given to the most recent ``parse`` call.
    """

    def __init__(self, xmlstr):
        self.xmlstr = xmlstr
        self.xmldom = self.__convet2utf8()
        self.xmlnodelist = []
        self.xpath = ''

    def __convet2utf8(self):
        """Return the document body with the declaration removed.

        Byte strings whose declaration mentions gbk or gb2312 are
        re-encoded as UTF-8.
        """
        headstr = self.__get_head()
        # The declaration is matched at the very start, so slicing removes
        # exactly that occurrence and nothing else.
        xmldomstr = self.xmlstr[len(headstr):]
        # Only byte strings need transcoding; already-decoded text has no
        # ``decode`` step to perform.
        if isinstance(xmldomstr, bytes):
            if 'gbk' in headstr:
                xmldomstr = xmldomstr.decode('gbk').encode('utf-8')
            elif 'gb2312' in headstr:
                # Transcode the stripped body (not self.xmlstr) so the
                # declaration does not leak back into the result.
                xmldomstr = xmldomstr.decode('gb2312').encode('utf-8')
        return xmldomstr

    def __get_head(self):
        """Return the leading ``<?xml ...>`` declaration, or '' if absent."""
        # The '?' must be escaped: unescaped it makes the '<' optional and
        # the pattern can never match a real '<?xml' declaration.
        headregobj = re.match(r'<\?xml.*?>', self.xmlstr)
        if headregobj:
            return headregobj.group()
        return ''

    def parse(self, xpath):
        """Return the text of the node addressed by a simplified path.

        Args:
            xpath: '/'-separated node names, each optionally followed by a
                zero-based ``[n]`` selecting the n-th match at that step
                (default 0), e.g. ``'/a/product_id'`` or
                ``'/product_id[1]'``.  Each step is searched anywhere
                inside the text selected by the previous step.  An empty
                path selects the whole document body.

        Returns:
            The inner text of the selected node.  A value wrapped in
            ``<![CDATA[ ... ]]>`` is returned without the wrapper.

        Raises:
            IndexError: a step has no match at the requested index.
            ValueError: an ``[n]`` index is not an integer.
        """
        self.xpath = xpath

        steps = []
        for xnode in xpath.split('/'):
            if not xnode:
                continue
            spcindex = xnode.find('[')
            if spcindex > -1:
                index = int(xnode[spcindex + 1:-1])
                xnode = xnode[:spcindex]
            else:
                index = 0
            # Escape the name so regex metacharacters in it are literal;
            # DOTALL lets node content span several lines.
            tag = re.escape(xnode)
            pattern = re.compile('<%s>(.*?)</%s>' % (tag, tag), re.DOTALL)
            steps.append((pattern, xnode, index))

        xmlnodestr = self.xmldom
        xmlnodelist = []  # stays empty when the path has no steps
        for pattern, name, index in steps:
            xmlnodelist = pattern.findall(xmlnodestr)
            try:
                xmlnodestr = xmlnodelist[index]
            except IndexError:
                raise IndexError(
                    'node <%s>[%d] not found for xpath %r'
                    % (name, index, xpath))

        cdata_open, cdata_close = '<![CDATA[', ']]>'
        # Strip the wrapper only when both ends are present, so ordinary
        # data is never truncated.
        if (xmlnodestr.startswith(cdata_open)
                and xmlnodestr.endswith(cdata_close)):
            xmlnodestr = xmlnodestr[len(cdata_open):-len(cdata_close)]

        self.xmlnodelist = xmlnodelist
        return xmlnodestr
if __name__ == '__main__':
    # Demo / smoke test: parse a sample response with three different paths.
    # The sample is deliberately kept as-is (including its misplaced
    # </product_name>), because the parser targets non-standard XML.
    xmlstr = '<?xml version="1.0" encoding="utf-8" standalone="yes" ?><resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>'
    xpaths = ('/product_id', '/product_id[1]', '/a/product_id')

    xp = xmlparse(xmlstr)
    # Single-argument print(...) calls produce the same output under both
    # the print statement (Python 2) and the print function (Python 3).
    print('xmlstr: %s' % xp.xmlstr)
    print('xmldom: %s' % xp.xmldom)
    for path in xpaths:
        print('------------------------------')
        getstr = xp.parse(path)
        print('xpath: %s' % xp.xpath)
        print('get list: %s' % (xp.xmlnodelist,))
        print('get string: %s' % getstr)
运行结果:
xmlstr: <?xml version="1.0" encoding="utf-8" standalone="yes" ?><resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>
xmldom: <resultObject><a><product_id>aaaaa</product_id><product_name><![CDATA[bbbbb]]></a><b><product_id>bbbbb</product_id><product_name><![CDATA[bbbbb]]></b></product_name></resultObject>
------------------------------
xpath: /product_id
get list: ['aaaaa', 'bbbbb']
get string: aaaaa
------------------------------
xpath: /product_id[1]
get list: ['aaaaa', 'bbbbb']
get string: bbbbb
------------------------------
xpath: /a/product_id
get list: ['aaaaa']
get string: aaaaa
因为返回的xml格式比较简单,没有带属性的节点,所以处理起来就比较简单了。但测试还是发现有一个bug:当相同名称的节点嵌套时,正则匹配会出问题。该问题可以通过避免在xpath中出现有嵌套节点的名称来解决,否则只有重写更复杂的机制了。
希望本文所述对大家的Python程序设计有所帮助。