# Preparation before running this script:
#   1. Extract the text:  pdf2txt.py -M 10 -L 5 apollo11.pdf > apollo11.txt
#   2. Remove the top part of apollo11.txt (everything up to
#      "APOLLO 11 - AIR-TO-GROUND VOICE TRANSCRIPTION").

def is_boilerplate(line):
    """Return True if *line* should be dropped from the transcript.

    A line is dropped when any of the following holds:

    * it consists only of digits once trailing whitespace is stripped
      (presumably page numbers -- confirm against the PDF extraction);
    * it contains no lowercase characters at all, which also covers blank
      lines and all-caps headings;
    * it contains a tape marker ("END OF TAPE", "Tape", "t a p e");
    * it contains the "asterisks denote clipping" footnote.
    """
    return (
        line.rstrip().isdigit()
        or 'END OF TAPE' in line
        or line.upper() == line
        or "Tape" in line
        or "t a p e" in line
        or 'asterisks denote clipping' in line
    )


def merge_transcript_lines(lines):
    """Yield one text line per timestamped transcript entry.

    *lines* is any iterable of newline-terminated strings.  Boilerplate
    lines (see ``is_boilerplate``) are skipped.  A line whose first 11
    characters are digits and spaces only is treated as the start of a new
    entry and its 11-character timestamp prefix is removed.  Any other line
    is a continuation: it is appended to the current entry, with the
    entry's trailing newline replaced by a single space.

    The merge is done in memory instead of seeking backwards in the output
    file: arithmetic on text-mode ``tell()`` values is not a supported seek
    offset, and seeking to ``pos - 1`` fails outright when nothing has been
    written yet.  A continuation that appears before any timestamped line
    is therefore emitted as its own entry rather than raising.
    """
    pending = None  # entry currently being assembled, not yet emitted
    for line in lines:
        if is_boilerplate(line):
            continue

        if line[0:11].replace(' ', '').isdigit():
            # New timestamped entry: emit the previous one and start over
            # with the timestamp prefix stripped.
            if pending is not None:
                yield pending
            pending = line[11:]
        elif pending is None:
            # Continuation with nothing before it to attach to.
            pending = line
        else:
            if pending.endswith("\n"):
                pending = pending[:-1]
            pending = pending + " " + line

    if pending is not None:
        yield pending


# Guarded so the helpers above can be imported without touching the
# filesystem; running the file as a script behaves as before.
if __name__ == "__main__":
    with open('apollo11.txt') as in_file, open('out.txt', 'w') as out_file:
        out_file.writelines(merge_transcript_lines(in_file))

# Ordered (old, new) substitutions applied to every line.  Order matters:
# the ellipsis rule must run before the ". " rule, otherwise every
# ". . ." has already been turned into ", , ." and can never match.
SPEECH_REPLACEMENTS = (
    ("&", "\\&"),
    (". . .", "\\ldots\\ "),
    (". ", ", "),
    # Speaker tags become bracketed commands -- presumably voice-selection
    # codes for the speech synthesizer; confirm against its documentation.
    ("CDR", "[:n1]"),
    ("CC", "[:n0]"),
    ("CMP", "[:n4]"),
    # Presumably a phonetic respelling for the synthesizer.
    ("Houston", "Hewston"),
    ("***", "\\ldots\\ "),
    ('$', '\\$'),
    ('-', ''),
    ('  ', ' '),
    # Tidy up around the ellipsis markers produced above.
    ('\\ldots\\ \\ldots\\', '\\ldots\\'),
    ('\\ldots\\ .', '\\ldots.'),
)


def clean_for_speech(line):
    """Return the text to write to the .spk file for one transcript line.

    Applies ``SPEECH_REPLACEMENTS`` in order and appends a single space
    after the line (i.e. after its newline, if it has one).  That trailing
    space is what the previous word-by-word re-join of the line emitted, so
    it is kept to leave the output format unchanged.
    """
    for old, new in SPEECH_REPLACEMENTS:
        line = line.replace(old, new)
    return line + " "


# Guarded so clean_for_speech can be imported without touching the
# filesystem; running the file as a script behaves as before.
if __name__ == "__main__":
    with open('out.txt') as in_file, open('appollo.spk', 'w') as out_file:
        for line in in_file:
            out_file.write(clean_for_speech(line))