1. Über Python
Linguistische Datenverarbeitung mit Python
2. Beispiel: Parsing eines XML-Dokuments und Ausgabe aller <p>-Inhalte
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import xml.etree.ElementTree as etree
# ElementTree spells qualified names as "{namespace-uri}local-name".
TEI_NS = '{http://www.tei-c.org/ns/1.0}'
XML_ID = '{http://www.w3.org/XML/1998/namespace}id'

# Punctuation marks that get detached from the word in front of them.
# Compiled once at module level instead of once per paragraph.
_PUNCTUATION = re.compile(r'([.,:;!?\-\'"])')


def _tokenize(text):
    """Return the whitespace-separated tokens of *text*.

    A space is inserted in front of every punctuation mark matched by
    ``_PUNCTUATION`` before splitting, so a mark is separated from the word
    preceding it. No space is inserted after the mark.
    """
    return _PUNCTUATION.sub(r' \1', text).split()


def _last_text(root, tag):
    """Return the cleaned leading text of the last TEI *tag* element in *root*.

    Newlines are removed and surrounding whitespace is stripped. An empty
    string is returned when no such element exists or when the element has
    no leading text (``element.text`` is ``None``).
    """
    value = ''
    for element in root.iter(TEI_NS + tag):
        value = (element.text or '').replace('\n', '').strip()
    return value


def main(xml_path='greif.xml', csv_path='greif.csv'):
    """Tokenize the identified paragraphs of a TEI file into a TSV file.

    Every ``<p>`` element that carries an ``xml:id`` attribute is tokenized;
    one line ``author<TAB>title<TAB>token`` is written per token, encoded as
    UTF-8.

    Args:
        xml_path: Path of the TEI XML document to read.
        csv_path: Path of the tab-separated output file (overwritten).
    """
    # Parse first so that a parse error does not truncate an existing
    # output file.
    root = etree.parse(xml_path).getroot()

    # Debug output: every tag with its leading text. The single-argument
    # call form behaves the same on Python 2 and Python 3.
    for element in root.iter():
        print(element.tag + ' = ' + (element.text if element.text else ''))

    # Author and title are the same for every row, so build the prefix once.
    row_prefix = _last_text(root, 'author') + '\t' + _last_text(root, 'head') + '\t'

    # Binary mode: the rows are encoded explicitly, which works on both
    # Python 2 and Python 3. The context manager closes the file even when
    # writing fails.
    with open(csv_path, 'wb') as out:
        for paragraph in root.iter(TEI_NS + 'p'):
            if XML_ID not in paragraph.attrib:
                continue
            # itertext() also collects text nested in child elements.
            text = ''.join(paragraph.itertext())
            out.writelines(
                (row_prefix + token + '\n').encode('utf-8')
                for token in _tokenize(text)
            )


if __name__ == '__main__':
    main()
