File size: 4,979 Bytes
75c309e
 
fa09dfd
 
89123c5
fa09dfd
7e16e49
fa09dfd
75c309e
fa09dfd
a51da34
 
 
 
 
0f00a6d
a51da34
e59171d
a51da34
 
 
1c29917
a51da34
 
 
 
 
 
 
 
 
 
5a290c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e59171d
5a290c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a51da34
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
---
license: llama2
widget:
  - example_title: "ALMA-Cymraeg-13B"
    text: "Cyfieithwch y testun Saesneg canlynol i'r Gymraeg.\n### Saesneg:\nFor the first time, GPs no longer have to physically print, sign and hand a green paper prescription form to the patient or wait for it to be taken to the pharmacy. Instead, the prescription is sent electronically from the surgery via the IT system to the patient’s chosen pharmacy - even without the patient needing to visit the surgery to pick up a repeat prescription form.\n\n### Cymraeg:\n"
    output:
      text: "Am y tro cyntaf, nid oes rhaid i feddygon teulu bellach argraffu, llofnodi a throsglwyddo ffurflen bresgripsiwn werdd i'r claf neu aros iddi gael ei chludo i'r fferyllfa. Yn lle hynny, caiff y presgripsiwn ei anfon yn electronig gan y practis drwy'r system TG at fferyllfa ddewisedig y claf - heb fod angen i'r claf ymweld â'r practis er mwyn casglu ffurflen bresgripsiwn ailadrodd."
pipeline_tag: text-generation
---

# ALMA-Cymraeg-13B
Fersiwn Gymraeg o fodel cyfieithu [ALMA](https://github.com/fe1ixxu/ALMA) a ddisgrifir yn [https://arxiv.org/abs/2309.11674](https://arxiv.org/abs/2309.11674). \
_This is a Welsh version of the [ALMA](https://github.com/fe1ixxu/ALMA) LLM-based translation model._

Mae'r model LLM yn seiliedig ar Lama-2-13B, gyda hyfforddiant parhaus ar ddata Gymreig [OSCAR-2301](https://huggingface.co/datasets/oscar-corpus/OSCAR-2301) am 3 Epoch
ac yna hyfforddiant cywrain pellach ar ddata Cofnod y Cynulliad a ddarparir gan [TechIaith](https://huggingface.co/techiaith).

Mae'r fersiwn yma wedi ei gywasgu i 4.0bpw er mwyn llwytho mewn cof GPU o 10GB gyda testun hyd at 4096 tocyn gan ddefnyddio [ExLlamaV2](https://github.com/turboderp/exllamav2).

### Fformat Sgwrs

Mae'r hyfforddiant cywrain wedi defnyddio'r fformat canlynol ar gyfer trosi o'r Saesneg i'r Gymraeg (a'r naill ffordd i'r llall).
```
Cyfieithwch y testun Saesneg canlynol i'r Gymraeg.
### Saesneg:
{prompt}

### Cymraeg:

```



#### Esiampl

```python
import time
import sys, os
import dataclasses
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exllamav2 import(
    ExLlamaV2,
    ExLlamaV2Config,
    ExLlamaV2Cache,
    ExLlamaV2Tokenizer,
    ExLlamaV2Lora,    
)

from exllamav2.generator import (
    ExLlamaV2StreamingGenerator,
    ExLlamaV2Sampler
)


@dataclasses.dataclass
class ModelClass:
    """Simple handle bundling the streaming generator, the tokenizer and
    the underlying ExLlamaV2 model into one object."""
    generator: object
    tokenizer: object
    model: object

DEBUG = os.environ.get("DEBUG") and True or False

# Initialise the model and its cache.
def load_model(model_directory, max_seq_len=4096):
    """Load an ExLlamaV2 model from *model_directory* and return it wrapped
    in a ModelClass together with its tokenizer and streaming generator."""
    cfg = ExLlamaV2Config()
    cfg.model_dir = model_directory
    cfg.max_seq_len = max_seq_len
    cfg.prepare()

    llm = ExLlamaV2(cfg)
    print("Llwytho model: " + model_directory)

    # Lazy cache so the weights can be auto-split across available GPU memory.
    kv_cache = ExLlamaV2Cache(llm, lazy = True, max_seq_len=max_seq_len)
    llm.load_autosplit(kv_cache)

    tok = ExLlamaV2Tokenizer(cfg)
    gen = ExLlamaV2StreamingGenerator(llm, kv_cache, tok)
    wrapped = ModelClass(generator=gen, tokenizer=tok, model=llm)
    gen.warmup()
    return wrapped

def generate_text(prompt, settings, max_new_tokens):
    """Stream tokens from the global `base_model` for *prompt*, echoing each
    chunk to stdout as it arrives, then report the generation speed.
    Always returns the empty string."""
    sys.stdout.flush()
    input_ids = base_model.tokenizer.encode(prompt)
    token_count = 0
    base_model.generator.set_stop_conditions(["\n"])
    base_model.generator.begin_stream(input_ids, settings)
    started = time.time()

    # Pull one chunk per iteration until the model signals end-of-stream
    # or the token budget is exhausted.
    done = False
    while not done:
        chunk, done, _ = base_model.generator.stream()
        token_count += 1
        sys.stdout.write(chunk)
        sys.stdout.flush()
        done = done or token_count == max_new_tokens

    elapsed = time.time() - started
    print(f"\nYmateb cyflawn mewn {elapsed:.2f} eiliad, {token_count} tocyn, {token_count / elapsed:.2f} tocyn/eiliad")
    return ""

# Script entry point: load the quantised model once, then loop forever
# reading English text from stdin and printing the Welsh translation.
base_model = load_model("./ALMA-Cymraeg-13B-0.1-4.0bpw-exl2")

# Sampling settings; the low temperature keeps translations close to literal.
settings = ExLlamaV2Sampler.Settings()
settings.temperature = 0.15 # change as needed, e.g. 0.75
settings.top_k = 90 # change as needed, e.g. 50
settings.top_p = 1.0 # etc.
settings.token_repetition_penalty = 1.15 # etc.
max_new_tokens = 2000 # etc.

# Instruction prefix matching the fine-tuning prompt format.
system_prompt = "Cyfieithwch y testun Saesneg canlynol i'r Gymraeg."

while True:
    user_input = input("Saesneg: ")

    # Assemble the full prompt in the "### Saesneg: / ### Cymraeg:" format.
    prompt = f"{system_prompt}\n\n### Saesneg:\n{user_input}\n\n### Cymraeg:\n"
    if DEBUG: print(f"{prompt}\n\n")
    print("Cymraeg:")
    response = generate_text(prompt, settings, max_new_tokens)
    print("="*132)
```

## Hawlfraint

Mae'r model yn seiliedig ar Llama2 ac felly dan drwydded gan [Meta](https://ai.meta.com/llama/license/). \
Mae'r data Cofnod y Cynulliad dan drwydded [Llywodraeth Agored](https://www.nationalarchives.gov.uk/doc/open-government-licence-cymraeg/version/3/).